In [None]:
%load_ext autoreload
%autoreload 2

import nltk
import spacy
import skweak

from textblob import TextBlob
from textblob.taggers import PatternTagger

from scripts.skweak_ner_eval import evaluate
from scripts.utils import penntreebank2universal, load_data_split, tag_all, compute_recall, compute_num_conflicts


In [None]:
# One-time setup: fetch the NLTK resources named below.
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')
nltk.download('stopwords')
# Shell commands: download the two English spaCy pipelines and the TextBlob corpora.
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md
!python -m textblob.download_corpora

## Using Libraries as Labelling functions


In this part, we use popular NLP libraries to create labeling functions. They include Spacy, NLTK, and Textblob.
We use the Majority Voter and HMM as aggregation functions.
Optionally, you can train your own model on the data.

Learning goals:
- Understand how to use external libraries as labeling functions
- Understand the Spacy object and how to use it for annotation
- Understand the impact of Majority Voter and HMM as aggregation functions, and get a feeling for their performance

First, read and understand the two functions below.

In [None]:

# Sometimes data formats (here POS tags) differ. We load the data and convert it to the format we need. 
# Surely, there is some loss of information
def nltk_tagger(doc):
    """Labeling function based on NLTK part-of-speech tags.

    Each non-punctuation token of ``doc`` is tagged on its own with
    ``nltk.pos_tag`` and the resulting Penn Treebank tag is translated to the
    label set used in this notebook. Tokens whose tag has no translation
    produce no span.

    Args:
        doc: Iterable of tokens exposing ``text``, ``i`` and ``is_punct``.

    Yields:
        ``(start, end, label)`` tuples covering exactly one token each.
    """
    # Penn Treebank tag -> label used in this notebook.
    penn_to_label = {
        "DT": "DET",
        "CD": "NUM",
        "NNP": "PROPN",
        "NNPS": "PROPN",
        "JJ": "ADJ",
        "JJR": "ADJ",
        "JJS": "ADJ",
        "NN": "NOUN",
        "NNS": "NOUN",
        "VB": "VERB",
        "VBD": "VERB",
        "VBG": "VERB",
        "VBN": "VERB",
        "VBP": "VERB",
        "VBZ": "VERB",
    }

    for token in doc:
        if token.is_punct:
            continue
        # pos_tag returns a list of (word, tag) pairs; only one word is passed in.
        penn_tag = nltk.pos_tag([token.text])[0][1]
        label = penn_to_label.get(penn_tag)
        if label is not None:
            yield token.i, token.i + 1, label


# We can also use the Textblob library to get POS tags
# Under the hood, it uses the Pattern library. Once again, a transformation of the tag-labels is needed
def textblob_tagger(doc):
    """Labeling function based on TextBlob part-of-speech tags.

    Each non-punctuation token of ``doc`` is tagged on its own by a
    ``TextBlob`` configured with a ``PatternTagger``. The first tag TextBlob
    returns for the token text is converted with ``penntreebank2universal``.
    Tokens for which TextBlob returns no tags produce no span.

    Args:
        doc: Iterable of tokens exposing ``text``, ``i`` and ``is_punct``.

    Yields:
        ``(start, end, label)`` tuples covering exactly one token each.
    """
    # The tagger does not depend on the token, so build it once instead of
    # once per token.
    pattern_tagger = PatternTagger()

    for token in doc:
        if token.is_punct:
            continue
        tags = TextBlob(token.text, pos_tagger=pattern_tagger).tags
        # TextBlob may split the token text further; only the first
        # (word, tag) pair is used.
        if tags:
            yield token.i, token.i + 1, penntreebank2universal(tags[0][1])


## Write the Spacy Labeling Functions

Use the two english Spacy models "en_core_web_sm", "en_core_web_md" to create labeling functions.
The challenge is that they use different tokens, i.e. the atomic units of a sentence. Our simple tokenization just splits the words by whitespace.
Your task is to design an algorithm that maps the tokens of the simple tokenization to the tokens of the Spacy tokenization, and use the tokens available there to create labeling functions.

Hints:
1) Access token i by `token=doc[i]` or obtain its position by `i=token.i`
2) Access the Spacy POS token (its ground truth) by `pos=token.pos_`

In [None]:
# Load the small and medium English spaCy pipelines once, at module level.
eng_nlp_sm = spacy.load("en_core_web_sm")
eng_nlp_md = spacy.load("en_core_web_md")

#########################################################

def _spacy_pos_spans(doc, nlp):
    """Label the tokens of ``doc`` with POS tags produced by ``nlp``.

    ``doc.text`` is re-processed with ``nlp``, which may tokenize it
    differently from ``doc``. For every token of ``doc`` the re-tokenized
    tokens are scanned in order, and the first one that satisfies both
    conditions below provides the label:

    * the text from the re-tokenized token to the end of its document occurs
      inside the text from the current token to the end of ``doc``;
    * the current token's text occurs inside the re-tokenized token's text.

    Tokens for which no re-tokenized token qualifies produce no span.

    Args:
        doc: Document whose tokens expose ``text`` and ``i`` and which
            supports slicing (``doc[i:].text``).
        nlp: Callable turning a string into a tagged document.

    Yields:
        ``(start, end, label)`` tuples covering exactly one token each.
    """
    other_doc = nlp(doc.text)
    # The remaining text after each re-tokenized token does not depend on the
    # token of ``doc`` being labelled, so compute it once up front.
    other_suffixes = [other_doc[other_token.i:].text for other_token in other_doc]

    for token in doc:
        doc_suffix = doc[token.i:].text
        for other_token, other_suffix in zip(other_doc, other_suffixes):
            if other_suffix not in doc_suffix:
                continue
            if token.text in other_token.text:
                # Keep only the part of the tag after the last "-" (a tag
                # without "-" is kept whole).
                yield token.i, token.i + 1, other_token.pos_.split("-")[-1]
                # At most one label per token: stop scanning after the first match.
                break


def eng_spacy_tagger_sm(doc):
    """Labeling function using the POS tags of the ``eng_nlp_sm`` pipeline.

    Yields ``(start, end, label)`` tuples, one per token of ``doc`` that could
    be matched to a token of the re-processed text.
    """
    yield from _spacy_pos_spans(doc, eng_nlp_sm)


def eng_spacy_tagger_md(doc):
    """Labeling function using the POS tags of the ``eng_nlp_md`` pipeline.

    Yields ``(start, end, label)`` tuples, one per token of ``doc`` that could
    be matched to a token of the re-processed text.
    """
    yield from _spacy_pos_spans(doc, eng_nlp_md)

#########################################################

In [None]:

# Wrap each tagger function in a skweak FunctionAnnotator; the string is the annotator's name.
nltk_lf = skweak.heuristics.FunctionAnnotator("nltk", nltk_tagger)
textblob_lf = skweak.heuristics.FunctionAnnotator("textblob", textblob_tagger)
eng_spacy_sm_lf = skweak.heuristics.FunctionAnnotator("eng_spacy_sm", eng_spacy_tagger_sm)
eng_spacy_md_lf = skweak.heuristics.FunctionAnnotator("eng_spacy_md", eng_spacy_tagger_md)

### Load Data and apply Labeling functions

We compute the recall and the number of conflicts after applying the labeling functions, and the number of conflicts again after applying the aggregation functions. For the sake of time, this time we use only a subset of the data.

In [None]:

# Collect the labeling functions and the label set, then load the training split
# (the test split is loaded in a later cell).
lfs = [nltk_lf, eng_spacy_sm_lf, textblob_lf, eng_spacy_md_lf]
# NOTE(review): nltk_tagger can also yield "VERB", which is not listed here — confirm this is intended.
all_labels = ["DET", "NUM", "PROPN", "NOUN", "ADJ"]

# small amount of data for the sake of time
# (presumably the last argument caps the number of loaded items — TODO confirm in scripts.utils)
train_docs = load_data_split("train", all_labels, 3000)

# tag the training documents with all labeling functions
train_docs = tag_all(train_docs, lfs)



In [None]:
# Compute recall and the number of conflicts on the tagged training documents and print both.
recall = compute_recall(train_docs)
num_conflicts = compute_num_conflicts(train_docs)
print("Train recall", recall)
print("Train conflicts", num_conflicts)

We observe that the recall is not very high. This is because the libraries are working quite well. 
Further, we observe that in 40.5% of the tokens there is a conflict.

In [None]:
# Create the HMM aggregator named "hmm" over the label set and fit it on the tagged training documents.
hmm = skweak.aggregation.HMM("hmm", all_labels)
hmm=hmm.fit(train_docs)

# NOTE: nothing is tagged in this cell. When the aggregators are applied to documents,
# it's important to set Majority vote before HMM, otherwise Majority Vote takes the HMM predictions into account

Now we compare how majority vote and HMM change the number of conflicts.

In [None]:
# Create the majority-vote aggregator named "mv" and apply it, followed by `hmm`, to the training documents.
mv = skweak.aggregation.MajorityVoter("mv", all_labels)
train_docs = tag_all(train_docs, [mv, hmm])

# Recompute the number of conflicts after both aggregators have been applied
# (the printed label mentions only MV).
num_conflicts = compute_num_conflicts(train_docs)
print("Conflicts with MV on train set: ", num_conflicts)

We observe that the number of token conflicts does not change. The reason is that neither method can choose a class other than those proposed by the labeling functions.

## Evaluation

Look at the Precision, Recall and F1-Score of the different aggregation functions. What do you observe?

In [None]:
# tag the test documents: labeling functions first, then the aggregators
# it's important to set Majority vote before HMM, otherwise Majority Vote takes the HMM predictions into account
test_docs = load_data_split("test", all_labels, 1000)
test_docs = tag_all(test_docs, lfs + [mv, hmm])

# Count and print the conflicts on the tagged test documents.
num_conflicts = compute_num_conflicts(test_docs)
print("Conflicts on test set", num_conflicts)

In [None]:
# Evaluate the annotations named "mv" and "hmm" on the test documents.
df = evaluate(test_docs, all_labels, [ "mv", "hmm"])

In [None]:
# Display the object returned by evaluate().
df

Contrary to the first part, we observe that the HMM performs better than majority vote.