In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os
import joblib

import pandas as pd
import numpy as np

sys.path.append('../')

In [133]:
!python --version

Python 3.8.17


In [None]:
!pip uninstall -y scikit-learn && pip install scikit-learn==1.0.2
skweak==0.2.13
hmmlearn==0.2.6

In [None]:
# Weakly Supervised Named Entity Tagging with Learnable Logical Rules
# https://universaldependencies.org/format.html
# https://aclanthology.org/2021.acl-long.352.pdf

# Get data from https://github.com/explosion/projects/tree/v3/benchmarks/ud_benchmark
# by using assets command, or downloading https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3105/ud-treebanks-v2.5.tgz

# for each file run with vars.ud_treebank being the treebank you want to use, e.g. ("UD_English-EWT")
# python scripts/copy_files.py train conllu assets/ud-treebanks-v2.5/${vars.ud_treebank}/ corpus/${vars.ud_treebank}/train/
# python -m spacy convert corpus/${vars.ud_treebank}/train/ corpus/${vars.ud_treebank}/ --converter conllu -n 1 -T -C



In [136]:
# conll u -> skweak -> wrench

from spacy.tokens import DocBin
from spacy.training import Corpus
import os

import spacy


part3_path = "/Users/andst/projects/WS_tutorial/part_3_pos_tags"

data_path = os.path.join(part3_path, "corpus", "UD_English-EWT")

nlp = spacy.blank("xx")
reader = Corpus(os.path.join(data_path, "train.spacy"))
dev_data = list(reader(nlp))

In [137]:
all_labels = ["NOUN", "VERB", "ADV", "ADJ"]



In [141]:
from skweak_ner_eval_utils import evaluate
import skweak

from spacy.tokens import Span

docs = [doc.reference.copy() for doc in dev_data[0:3]]

for doc in docs:
    ents = []
    for tok in doc:
        if tok.pos_ in all_labels:
            print(tok.pos_)
            ents.append(Span(doc, tok.i, tok.i + 1, tok.pos_))
    doc.set_ents(ents)

noun_lf = heuristics.TokenConstraintAnnotator("noun", lambda tok: tok.pos_ == "NOUN", "NOUN")
all_noun_lf = heuristics.TokenConstraintAnnotator("all_noun", lambda tok: True, "NOUN")
verb_lf = heuristics.TokenConstraintAnnotator("verb", lambda tok: tok.pos_ == "VERB", "VERB")
all_verbs_lf = heuristics.TokenConstraintAnnotator("all_verbs", lambda tok: True, "VERB")

lfs = [noun_lf, verb_lf, all_noun_lf, all_verbs_lf]

nlp = spacy.blank("xx")

hmm = aggregation.HMM("hmm", ["NOUN", "VERB"])

for doc in docs:
    for lf in lfs:
        doc = lf(doc)
    print(doc.spans)
    
evaluate(docs, all_labels, ["all_noun", "noun", "hmm"])

ADJ
NOUN
VERB
NOUN
NOUN
NOUN
ADJ
NOUN
NOUN
ADJ
NOUN
VERB
NOUN
NOUN
VERB
ADJ
NOUN
VERB
VERB
ADJ
NOUN
VERB
{'noun': [forces, preacher, mosque, town, border], 'verb': [killed], 'all_noun': [Al-Zaman : American forces killed Shaikh Abdullah al-Ani, the preacher at the mosque in the town of Qaim, near the Syrian border.], 'all_verbs': [Al-Zaman : American forces killed Shaikh Abdullah al-Ani, the preacher at the mosque in the town of Qaim, near the Syrian border.]}
{'noun': [killing, cleric, trouble, years], 'verb': [causing, come], 'all_noun': [[This killing of a respected cleric will be causing us trouble for years to come.]], 'all_verbs': [[This killing of a respected cleric will be causing us trouble for years to come.]]}
{'noun': [authorities, cells], 'verb': [announced, busted, operating], 'all_noun': [DPA: Iraqi authorities announced that they had busted up 3 terrorist cells operating in Baghdad.], 'all_verbs': [DPA: Iraqi authorities announced that they had busted up 3 terrorist cel

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tok_precision,tok_recall,tok_f1,tok_cee,tok_acc,coverage,ent_precision,ent_recall,ent_f1
label,proportion,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ADJ,22.7 %,all_noun,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,22.7 %,hmm,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,22.7 %,noun,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADV,0.0 %,all_noun,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADV,0.0 %,hmm,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADV,0.0 %,noun,0.0,0.0,0.0,,,,0.0,0.0,0.0
NOUN,50.0 %,all_noun,0.172,1.0,0.294,,,,0.0,0.0,0.0
NOUN,50.0 %,hmm,0.0,0.0,0.0,,,,0.0,0.0,0.0
NOUN,50.0 %,noun,1.0,1.0,1.0,,,,1.0,1.0,1.0
VERB,27.3 %,all_noun,0.0,0.0,0.0,,,,0.0,0.0,0.0


In [140]:
utils.display_entities(docs[0], "noun")

In [35]:
import spacy, re
from skweak import heuristics, gazetteers, aggregation, utils

# LF 1: heuristic to detect occurrences of MONEY entities
def money_detector(doc):
    for tok in doc[1:]:
        if tok.text[0].isdigit() and tok.nbor(-1).is_currency:
            yield tok.i-1, tok.i+1, "MONEY"
lf1 = heuristics.FunctionAnnotator("money", money_detector)

# LF 2: detection of years with a regex
lf2= heuristics.TokenConstraintAnnotator("years", lambda tok: re.match("(19|20)\d{2}$",
                                                                       tok.text), "DATE")

# LF 3: a gazetteer with a few names
NAMES = [("Barack", "Obama"), ("Donald", "Trump"), ("Joe", "Biden")]
trie = gazetteers.Trie(NAMES)
lf3 = gazetteers.GazetteerAnnotator("presidents", {"PERSON":trie})

# We create a corpus (here with a single text)
nlp = spacy.blank("xx")
doc = nlp("Donald Trump paid $750 in federal income taxes in 2016")

# apply the labelling functions
doc = lf3(lf2(lf1(doc)))

# and aggregate them
hmm = aggregation.HMM("hmm", ["PERSON", "DATE", "MONEY"])
hmm.fit_and_aggregate([doc])

# we can then visualise the final result (in Jupyter)
utils.display_entities(doc, "hmm")

Starting iteration 1
Finished E-step with 1 documents
Starting iteration 2
Finished E-step with 1 documents


         1         -18.9513             +nan
         2         -19.0673          -0.1160


In [60]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

Donald     Xxxxx True False
Trump     Xxxxx True False
paid     xxxx True False
$     $ False False
750     ddd False False
in     xx True False
federal     xxxx True False
income     xxxx True False
taxes     xxxx True False
in     xx True False
2016     dddd False False


In [38]:
utils.display_entities(doc, "hmm")

In [57]:
doc.spans["years"][0].label_

'DATE'

In [3]:
!pip uninstall -y skweak && pip install skweak==0.2.13
#--user git+https://github.com/NorskRegnesentral/skweak

Found existing installation: skweak 0.3.2
Uninstalling skweak-0.3.2:
  Successfully uninstalled skweak-0.3.2
Looking in indexes: https://pypi.org/simple, http://185.128.246.103/pypicloud/simple
[0mCollecting skweak==0.2.13
  Using cached skweak-0.2.13-py3-none-any.whl (34 kB)
Installing collected packages: skweak
Successfully installed skweak-0.2.13
