In [4]:
%load_ext autoreload
%autoreload 2

import re
import os
import sys
import nltk
import spacy
import skweak
import numpy as np
import pandas as pd

from utils.skweak_ner_eval_utils import evaluate

from textblob import TextBlob
from textblob.taggers import PatternTagger

from part_3_eng_pos_tags.scripts.utils import penntreebank2universal

# from skweak import heuristics, gazetteers, aggregation, utils

sys.path.append('../')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [123]:
!python --version
#!pip install textblob
!pip freeze > requirements_test.txt
# TODO: add textblob==0.17.1 to 


nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
nltk.download('stopwords')
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md
! python -m textblob.download_corpora

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/andst/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/andst/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/andst/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [None]:
# Weakly Supervised Named Entity Tagging with Learnable Logical Rules
# https://universaldependencies.org/format.html
# https://aclanthology.org/2021.acl-long.352.pdf

# Get data from https://github.com/explosion/projects/tree/v3/benchmarks/ud_benchmark
# by using assets command, or downloading https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3105/ud-treebanks-v2.5.tgz




## Using Libraries as Labelling functions


In this part, we use popular NLP libraries to create labeling functions. They include Spacy, NLTK, Textblob.
We use the Majority Voter and HMM as aggregation functions
Optionally, you can train your own model on the data.

Learning goals:
- Understand how to use external libraries as labeling functions
- Understand the Spacy object and how to use it for annotation
- Understand the impact of Majority Voter and HMM as aggregation functions, and get a feeling for their performance

First, read and understand the two functions below.

In [6]:

# Sometimes data formats (here POS tags) differ. We load the data and convert it to the format we need. 
# Surely, there is some loss of information
def nltk_tagger(doc):
    for token in doc:
        if not token.is_punct:
            # Tag token with nltk
            nltk_pos = nltk.pos_tag([token.text])[0][1]
            # Map nltk pos tags to ours
            if nltk_pos == "DT":
                yield token.i, token.i + 1, "DET"
            elif nltk_pos == "CD":
                yield token.i, token.i + 1, "NUM"
            elif nltk_pos == "NNP" or nltk_pos == "NNPS":
                yield token.i, token.i + 1, "PROPN"
            elif nltk_pos == "JJ" or nltk_pos == "JJR" or nltk_pos == "JJS":
                yield token.i, token.i + 1, "ADJ"
            elif nltk_pos == "NN" or nltk_pos == "NNS":
                yield token.i, token.i + 1, "NOUN"
            elif nltk_pos == "VB" or nltk_pos == "VBD" or nltk_pos == "VBG" or nltk_pos == "VBN" or nltk_pos == "VBP" or nltk_pos == "VBZ":
                yield token.i, token.i + 1, "VERB"


# We cn also use the Textblob library to get POS tags
# Under the hood, it uses the Pattern library. Once again, a transformation of the tag-labels is needed
def textblob_tagger(doc):
    for token in doc:
        if not token.is_punct:
            textblob_pos = TextBlob(token.text, pos_tagger=PatternTagger()).tags
            if len(textblob_pos) > 0:
                yield token.i, token.i + 1, penntreebank2universal(textblob_pos[0][1])


## Write the Spacy Labeling Functions

Use the two english Spacy models "en_core_web_sm", "en_core_web_md" to create labeling functions.
The challenge is that they use different tokens, i.e. the atomic units of a sentence. Our simple tokenization just splits the words by whitespace.
Your task it to design an algorithm that maps the tokens of the simple tokenization to the tokens of the Spacy tokenization, and use the token available there to create labeling functions.

Hints:
1) Access token i by `token=doc[i]` or obtain its poition by `i=token.i`
2) Access the Spacy POS token by `pos=token.pos_`

In [None]:
eng_nlp_sm = spacy.load("en_core_web_sm")
eng_nlp_md = spacy.load("en_core_web_md")


def eng_spacy_tagger_sm(doc):
    other_doc = eng_nlp_sm(doc.text)
    i = 0
    for token in doc:
        labelled = False
        for other_token in other_doc:
            if other_doc[other_token.i:].text not in doc[token.i:].text:
                continue
            if token.text in other_token.text and not labelled:
                labelled = True
                yield token.i, token.i + 1, other_token.pos_.split("-")[-1]


def eng_spacy_tagger_md(doc):
    other_doc = eng_nlp_md(doc.text)

    for token in doc:
        labelled = False
        for other_token in other_doc:
            if other_doc[other_token.i:].text not in doc[token.i:].text:
                continue
            if token.text in other_token.text and not labelled:
                labelled = True
                yield token.i, token.i + 1, other_token.pos_.split("-")[-1]

In [None]:

nltk_lf = skweak.heuristics.FunctionAnnotator("nltk1", nltk_tagger)
textblob_lf = skweak.heuristics.FunctionAnnotator("textblob", textblob_tagger)
eng_spacy_sm_lf = skweak.heuristics.FunctionAnnotator("eng_spacy_sm", eng_spacy_tagger_sm)
eng_spacy_md_lf = skweak.heuristics.FunctionAnnotator("eng_spacy_md", eng_spacy_tagger_md)

In [63]:

def compute_recall_num_conflicts(docs):
    recalls, num_conflicts = [], []

    for doc in docs:

        recalls, num_conflicts = [], []
        doc_conflicts = {}
        for name, val in doc.spans.items():
            for v in val:
                for i in range(v.start, v.end):
                    if i in doc_conflicts:
                        doc_conflicts[i].append(v.label)
                    else:
                        doc_conflicts[i] = [v.label]

        doc_recall = len(doc_conflicts) / len(doc)
        doc_num_conflicts = np.where([len(set(v)) > 1 for v in doc_conflicts.values()])[0]
        doc_num_conflicts = len(doc_num_conflicts) / len(doc_conflicts) if len(doc_conflicts) > 0 else 0

        recalls.append(doc_recall)
        num_conflicts.append(doc_num_conflicts)

    recall = np.mean(recalls)
    num_conflicts = np.mean(num_conflicts)
    return recall, num_conflicts

In [64]:
recall, num_conflicts = compute_recall_num_conflicts(train_docs)
print(recall, num_conflicts)
recall, num_conflicts = compute_recall_num_conflicts(test_docs)
print(recall, num_conflicts)

1.0 0.2692307692307692
1.0 0.45


In [46]:
print(recall, num_conflicts)

1.0 0.4482758620689655


In [40]:
print(conflicts)

{0: ['', '', '', ''], 2: ['', '', '', ''], 4: ['ADJ', 'ADJ', 'ADJ', 'ADJ'], 5: ['NOUN', 'NOUN', 'NOUN', 'NOUN'], 6: ['', '', '', ''], 7: ['', '', '', ''], 8: ['', '', '', ''], 9: ['', '', '', ''], 11: ['', '', '', ''], 13: ['DET', 'DET', 'DET', 'DET'], 14: ['NOUN', 'NOUN', 'NOUN', 'NOUN'], 16: ['DET', 'DET', 'DET', 'DET'], 17: ['NOUN', 'NOUN', 'NOUN', 'NOUN'], 19: ['DET', 'DET', 'DET', 'DET'], 20: ['NOUN', 'NOUN', 'NOUN', 'NOUN'], 22: ['', '', '', ''], 25: ['DET', 'DET', 'DET', 'DET'], 26: ['ADJ', 'ADJ', 'ADJ', 'ADJ'], 27: ['NOUN', 'NOUN', 'NOUN', 'NOUN'], 1: ['', ''], 3: ['', ''], 10: ['', ''], 12: ['', ''], 15: ['', '', ''], 18: ['', '', ''], 21: ['', '', ''], 23: ['', ''], 24: ['', '', ''], 28: ['', '']}


### Load Data and apply Labeling functions



In [68]:
from part_3_eng_pos_tags.scripts.utils import load_data_split, tag_all

lfs = [nltk_lf_2, eng_spacy_sm_lf, textblob_lf, eng_spacy_md_lf]

# load training and test data
all_labels = ["DET", "NUM", "PROPN", "NOUN", "ADJ"]
train_docs = load_data_split("train", all_labels, 3000)
test_docs = load_data_split("test", all_labels, 1000)

# tag the training documents
train_docs = tag_all(train_docs, lfs)

# train the HMM
mv = skweak.aggregation.MajorityVoter("mv", all_labels)
hmm = skweak.aggregation.HMM("hmm", all_labels)
hmm.fit(train_docs)

# tag the test documents
# it's important to set Majority vote before HMM, otherwise Majority Vote takes the HMM predictions into account
test_docs = tag_all(test_docs, lfs + [mv, hmm])


Starting iteration 1
Number of processed documents: 1000
Number of processed documents: 2000
Finished E-step with 3000 documents
Starting iteration 2


         1     -105487.6510             +nan


Number of processed documents: 1000
Number of processed documents: 2000
Finished E-step with 3000 documents
Starting iteration 3


         2     -100521.0518       +4966.5992


Number of processed documents: 1000
Number of processed documents: 2000
Finished E-step with 3000 documents
Starting iteration 4


         3      -99870.4155        +650.6362


Number of processed documents: 1000
Number of processed documents: 2000
Finished E-step with 3000 documents
Starting iteration 5


         4      -99584.3244        +286.0912


Number of processed documents: 1000
Number of processed documents: 2000
Finished E-step with 3000 documents


         5      -99350.4815        +233.8429


In [None]:
# conflicts per sentence?

In [70]:
df = evaluate(test_docs, all_labels, ["nltk2", "eng_spacy_sm", "eng_spacy_md", "textblob", "mv", "hmm"])

## Evaluation

Look at the Precision, Recall and F1-Score of the different aggregation functions. What do you observe?

In [71]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tok_precision,tok_recall,tok_f1,tok_cee,tok_acc,coverage,ent_precision,ent_recall,ent_f1
label,proportion,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ADJ,12.8 %,eng_spacy_md,0.925,0.852,0.888,,,,0.925,0.852,0.888
ADJ,12.8 %,eng_spacy_sm,0.921,0.852,0.886,,,,0.921,0.852,0.886
ADJ,12.8 %,hmm,0.912,0.866,0.888,,,,0.912,0.866,0.888
ADJ,12.8 %,mv,0.893,0.852,0.872,,,,0.893,0.852,0.872
ADJ,12.8 %,nltk2,0.793,0.595,0.68,,,,0.793,0.595,0.68
ADJ,12.8 %,textblob,0.835,0.825,0.83,,,,0.835,0.825,0.83
DET,17.3 %,eng_spacy_md,0.997,0.98,0.988,,,,0.997,0.98,0.988
DET,17.3 %,eng_spacy_sm,0.996,0.979,0.988,,,,0.996,0.979,0.988
DET,17.3 %,hmm,0.95,0.996,0.972,,,,0.95,0.996,0.972
DET,17.3 %,mv,0.9,0.999,0.946,,,,0.9,0.999,0.946


In [74]:
df[np.in1d(df.index.get_level_values(2), ['mv', "hmm"])]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tok_precision,tok_recall,tok_f1,tok_cee,tok_acc,coverage,ent_precision,ent_recall,ent_f1
label,proportion,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ADJ,12.8 %,hmm,0.912,0.866,0.888,,,,0.912,0.866,0.888
ADJ,12.8 %,mv,0.893,0.852,0.872,,,,0.893,0.852,0.872
DET,17.3 %,hmm,0.95,0.996,0.972,,,,0.95,0.996,0.972
DET,17.3 %,mv,0.9,0.999,0.946,,,,0.9,0.999,0.946
NOUN,37.4 %,hmm,0.936,0.859,0.896,,,,0.936,0.859,0.896
NOUN,37.4 %,mv,0.671,0.879,0.762,,,,0.671,0.879,0.762
NUM,7.3 %,hmm,0.964,0.927,0.946,,,,0.964,0.927,0.946
NUM,7.3 %,mv,0.969,0.848,0.904,,,,0.969,0.848,0.904
PROPN,25.2 %,hmm,0.806,0.95,0.872,,,,0.806,0.95,0.872
PROPN,25.2 %,mv,0.808,0.947,0.872,,,,0.808,0.947,0.872
