In [1]:
%load_ext autoreload
%autoreload 2

import re
import os
import sys
import spacy
import joblib
import skweak
import numpy as np
import pandas as pd

from spacy.tokens import Span
from spacy.tokens import DocBin
from collections import Counter
from spacy.training import Corpus
from skweak_ner_eval_utils import evaluate
# from skweak import heuristics, gazetteers, aggregation, utils

sys.path.append('../')

In [2]:
!python --version

Python 3.8.17


In [None]:
# Weakly Supervised Named Entity Tagging with Learnable Logical Rules
# https://universaldependencies.org/format.html
# https://aclanthology.org/2021.acl-long.352.pdf

# Get data from https://github.com/explosion/projects/tree/v3/benchmarks/ud_benchmark
# by using assets command, or downloading https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3105/ud-treebanks-v2.5.tgz

# for each file run with vars.ud_treebank being the treebank you want to use, e.g. ("UD_English-EWT")
# python scripts/copy_files.py train conllu assets/ud-treebanks-v2.5/${vars.ud_treebank}/ corpus/${vars.ud_treebank}/train/
# python -m spacy convert corpus/${vars.ud_treebank}/train/ corpus/${vars.ud_treebank}/ --converter conllu -n 1 -T -C



## Load data

In [3]:
# conll u -> skweak -> wrench

# Dataset folder
part3_path = "part_3_pos_tags"

# Path to the dataset file
data_path = os.path.join(part3_path, "corpus", "UD_English-EWT")

# Create a blank spacy pipeline
nlp = spacy.blank("xx")
reader = Corpus(os.path.join(data_path, "train.spacy"))
dev_data = list(reader(nlp))

In [4]:
# Toy example: use a subset
docs = [doc.reference.copy() for doc in dev_data[0:500]]

## Part-of-speech (POS) tagging

We want to assign a POS tag to each token. We will create labeling functions to assign POS tags based on linguistic rules.

Here, we chose to use the following POS tags:
*      DET: determiner, which is a word placed in front of a noun to make it clear what the noun refers to.
*      NUM: numeral.
*      PROPN: proper noun is a specific name for a particular person, place, or thing.
*      ADJ: adjective, which names an attribute of a noun.
*      VERB: verb, which showes an action, occurrence, or state of being.
*      NOUN: noun, a word (other than a pronoun) used to identify any of a class of people, places, or things (common noun).

In [5]:
all_labels = ["DET", "NUM", "PROPN", "ADJ", "VERB", "NOUN"]

In [6]:
# Set the gold labels
for doc in docs:
    print([s.text for s in doc.sents])
    ents = []
    tok_pos = []
    for tok in doc:
        if tok.pos_ in all_labels:
            # print(tok.pos_)
            tok_pos.append(tok.pos_)
            ents.append(Span(doc, tok.i, tok.i + 1, tok.pos_))
        else:
            tok_pos.append("O")
    doc.set_ents(ents)
    print(tok_pos)

['Al-Zaman : American forces killed Shaikh Abdullah al-Ani, the preacher at the mosque in the town of Qaim, near the Syrian border.']
['PROPN', 'O', 'PROPN', 'O', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'O', 'PROPN', 'O', 'DET', 'NOUN', 'O', 'DET', 'NOUN', 'O', 'DET', 'NOUN', 'O', 'PROPN', 'O', 'O', 'DET', 'ADJ', 'NOUN', 'O']
['[This killing of a respected cleric will be causing us trouble for years to come.]']
['O', 'DET', 'NOUN', 'O', 'DET', 'ADJ', 'NOUN', 'O', 'O', 'VERB', 'O', 'NOUN', 'O', 'NOUN', 'O', 'VERB', 'O', 'O']
['DPA: Iraqi authorities announced that they had busted up 3 terrorist cells operating in Baghdad.']
['PROPN', 'O', 'ADJ', 'NOUN', 'VERB', 'O', 'O', 'O', 'VERB', 'O', 'NUM', 'ADJ', 'NOUN', 'VERB', 'O', 'PROPN', 'O']
['Two of them were being run by 2 officials of the Ministry of the Interior!']
['NUM', 'O', 'O', 'O', 'O', 'VERB', 'O', 'NUM', 'NOUN', 'O', 'DET', 'PROPN', 'O', 'DET', 'PROPN', 'O']
['The MoI in Iraq is equivalent to the US FBI, so this would b

## Labeling functions

In the first step we find the 200 most frequent words in our corpus and use a lexicon to label these words. 
Then, for each POS tag we will create the following labeling functions: 

*   DET --> Lexicon with determiners.
*   NUM --> If the token is a number.
*   PROPN --> A word that is capitalized.
*   ADJ --> Suffixes: “able”, “al”, “ful”, “ic”, “ive”, “less”, “ous”, ”y”, “ish”, “ible”, "est".
*   NOUN --> 1. Suffixes: "ment", "tion", "sion", "xion", "ant", "ent", "ee", "er", "or", "ism", "ist", "ness", "ship", "ity", "ance", "ence", "ar", "or", "y", "acy", "age" , 2. Linguistic rule: if the previous word is a DET, a NUM or an ADJ, then the current one is a NOUN.
*   VERB --> 1. Suffixes: "ing", "ate", "en", "ed", "ify", "ise", "ize", 2. Linguistic rule: if the previous word is a NOUN, then the current one is a VERB, 3. Previous word is a form of "be".

In [7]:
words = [token.text for doc in docs for token in doc if not token.is_punct]

In [8]:
# Find the most frequent words
word_freq = Counter(words)
common_words = [w[0] for w in word_freq.most_common(200)]

In [9]:
common_words[:5]

['the', 'of', 'and', 'to', 'in']

In [10]:
# Load the lexicon
with open("eng_pos_tags_file.txt") as f:
    lines = f.readlines()

lines = lines[100:]

In [11]:
lexicon = {}
for l in lines:
    values = l.replace("\n", "").split("\t")
    if len(values) >=3 :
        # Map tags
        # DET --> DT
        if values[2] == "DT":
            lexicon[values[0]] = "DET"
        # NUM --> CD, ORD
        if values[2] == "CD" or values[2] == "ORD":
            lexicon[values[0]] = "NUM"
        # PROPN --> NNP, NNPS
        if values[2] == "NNP" or values[2] == "NNPS":
            lexicon[values[0]] = "PROPN"
        # ADJ --> JJ, JJR, JJS
        if values[2] == "JJ" or values[2] == "JJR" or values[2] == "JJS":
            lexicon[values[0]] = "ADJ"
        # NOUN --> NN, NNS, NN:U, NN:UN
        if values[2] == "NN" or values[2] == "NNS" or values[2] == "NN:U" or values[2] == "NN:UN":
            lexicon[values[0]] = "NOUN"
        # VERB --> VB, VBD, VBG, VBN, VBP, VBZ
        if values[2] == "VB" or values[2] == "VBD" or values[2] == "VBG" or values[2] == "VBN" or values[2] == "VBP" or values[2] == "VBZ":
            lexicon[values[0]] = "VERB"

In [12]:
len(lexicon)

28545

In [30]:
"Town" in lexicon

False

In [31]:
for t in docs[0]:
    print(t.text) # in list(lexicon.keys()))

Al
-
Zaman
:
American
forces
killed
Shaikh
Abdullah
al
-
Ani
,
the
preacher
at
the
mosque
in
the
town
of
Qaim
,
near
the
Syrian
border
.


In [23]:
list(lexicon.keys())[:100]

['2019-nCoV',
 '3D',
 '3M',
 '7-Eleven',
 'A&A',
 'A&D',
 'A&E',
 'A&M',
 'A&P',
 'Aadhaar',
 'AAMC',
 'Aarambh',
 'Aarav',
 'AARP',
 'Aarush',
 'Aaryan',
 'Aashni',
 'ab',
 'abatements',
 'abbé',
 'abbés',
 'Abbeydale',
 'Abdallah',
 'Abderrahim',
 'Abdirahman',
 'Abdirashid',
 'abdominoscopy',
 'Abdoulaye',
 'Abdoulie',
 'Abdulaziz',
 'Abdulhamid',
 'Abdullah',
 'Abdullahi',
 'Abdulrahman',
 'Abdurrahman',
 'abelian',
 'Abhijeet',
 'Abhijit',
 'Abhimanyu',
 'Abhinav',
 'Abhishek',
 'Abi',
 'ABI',
 'Abilify',
 'ableist',
 'ablepsy',
 'aboriginal',
 'abort',
 'aborts',
 'Aboubacar',
 'Abouzeid',
 'Abrahamic',
 'abrin',
 'abs',
 'absolutisation',
 'absolutise',
 'absolutised',
 'absolutises',
 'absolutising',
 'absolutization',
 'absolutize',
 'absolutized',
 'absolutizes',
 'absolutizing',
 'abuela',
 'abuelas',
 'academise',
 'academised',
 'academises',
 'academising',
 'academize',
 'academized',
 'academizes',
 'academizing',
 'açai',
 'acapnia',
 'acc',
 'Accenture',
 'accessorise

In [13]:
# Lexicon LF
def common_word_detector(doc):
    for token in doc:
        if token.text in list(lexicon.keys()):
            yield token.i, token.i+1, lexicon[token.text]

word_lf = skweak.heuristics.FunctionAnnotator("common_words", common_word_detector)


for doc in docs:
    doc = word_lf(doc)
    skweak.utils.display_entities(doc, "common_words")


In [14]:
# DET LF
tries = skweak.gazetteers.extract_json_data("det.json")
det_lf = skweak.gazetteers.GazetteerAnnotator("determiners", tries, case_sensitive=False)

for doc in docs:
    doc = det_lf(doc)
    skweak.utils.display_entities(doc, "determiners")

Extracting data from det.json
Populating trie for class DET (number: 47)


In [16]:
# # Or DET LF without json
# det_list = ["the", "a", "an", "this", "that", "these", "those", "my", "your", "his", "her", "its", "our", "their", "a few", "few",
#             "fewer", "fewest", "a little", "little", "much", "many", "more", "a lot of", "most", "some", "any", "enough", "all",
#             "both", "half", "either",  "neither", "no", "each", "every", "other", "another", "several", "such", "what", "rather",
#             "quite", "least", "less", "which", "whose"]

# tries = skweak.gazetteers.Trie(det_list)
# det_lf = skweak.gazetteers.GazetteerAnnotator("determiners", {"DET":tries}, case_sensitive=False)

# for doc in docs:
#     det_lf(doc)
#     skweak.utils.display_entities(doc, "determiners")

In [15]:
# NUM LF

def num_detector(doc):
    for token in doc:
        if re.search("\d+", token.text):
            yield token.i, token.i+1, "NUM"

num_lf = skweak.heuristics.FunctionAnnotator("numerals", num_detector)


for doc in docs:
    doc = num_lf(doc)
    skweak.utils.display_entities(doc, "numerals")

In [16]:
# PROPN LF

def propn_detector(doc):
    for token in doc:
        if token.i == 0:
            if token.text.isupper():
                yield token.i, token.i+1, "PROPN"
        else:
            if token.text.isupper() or token.text[0].isupper():
                yield token.i, token.i+1, "PROPN"

propn_lf = skweak.heuristics.FunctionAnnotator("proper_nouns", propn_detector)

for doc in docs:
    doc = propn_lf(doc)
    skweak.utils.display_entities(doc, "proper_nouns")

In [17]:
# ADJ LF

def adj_detector(doc):
    for token in doc:
        if len(token.text)>3 and token.text.lower().strip(".").endswith(("able", "al", "ful", "ic", "ive", "less", "ous", "y", "ish", "ible", "ent", "est")):
            yield token.i, token.i+1, "ADJ"
                    

adj_lf = skweak.heuristics.FunctionAnnotator("adjectives", adj_detector)


for doc in docs:
    doc = adj_lf(doc)
    skweak.utils.display_entities(doc, "adjectives")


In [18]:
# NOUN LF

def noun_detector_suffixes(doc):
    for token in doc:
        if len(token.text)>3 and token.text.lower().strip(".").endswith(("ment", "tion", "sion", "xion", "ant", "ent", "ee", "er", "or", 
                                                                         "ism", "ist", "ness", "ship", "ity", "ance", "ence", 
                                                                         "ar", "or", "y", "acy", "age")):
            yield token.i, token.i+1, "NOUN"

def noun_detector_ling(doc):
    weak_labels = ["O"]*len(doc)

    for span in doc.spans["determiners"]:
        weak_labels[span.start] = span.label_

    for span in doc.spans["numerals"]:
        weak_labels[span.start] = span.label_

    for span in doc.spans["adjectives"]:
        weak_labels[span.start] = span.label_
    
    for token in doc[1:]:
        if not token.is_punct:
            if weak_labels[token.i-1] != "O":
                yield token.i, token.i+1, "NOUN"
        
noun_lf1 = skweak.heuristics.FunctionAnnotator("nouns1", noun_detector_suffixes)
noun_lf2 = skweak.heuristics.FunctionAnnotator("nouns2", noun_detector_ling)

for doc in docs:
    doc = noun_lf2(noun_lf1(doc))
    skweak.utils.display_entities(doc, ["nouns1", "nouns2"])


In [19]:
# VERB LF

def verb_detector_suffixes(doc):
    for token in doc:
        if len(token.text)>3 and token.text.lower().strip(".").endswith(("ing", "ate", "en", "ed", "ify", "ise", "ize")):
            yield token.i, token.i+1, "VERB"

def verb_detector_ling(doc):
    weak_labels = ["O"]*len(doc)

    for span in doc.spans["nouns1"]:
        weak_labels[span.start] = span.label_

    for span in doc.spans["nouns2"]:
        weak_labels[span.start] = span.label_
    
    for token in doc[1:]:
        if not token.is_punct:
            if weak_labels[token.i-1] != "O":
                yield token.i, token.i+1, "VERB"

verb_lf1 = skweak.heuristics.FunctionAnnotator("verbs1", verb_detector_suffixes)
verb_lf2 = skweak.heuristics.FunctionAnnotator("verbs2", verb_detector_ling)

for doc in docs:
    doc = verb_lf2(verb_lf1(doc))
    skweak.utils.display_entities(doc, ["verbs1", "verbs2"])


In [20]:
import pandas
pandas.set_option('display.max_rows', 200)

In [21]:


# noun_lf = heuristics.TokenConstraintAnnotator("noun", lambda tok: tok.pos_ == "NOUN", "NOUN")
# all_noun_lf = heuristics.TokenConstraintAnnotator("all_noun", lambda tok: True, "NOUN")
# verb_lf = heuristics.TokenConstraintAnnotator("verb", lambda tok: tok.pos_ == "VERB", "VERB")
# all_verbs_lf = heuristics.TokenConstraintAnnotator("all_verbs", lambda tok: True, "VERB")

lfs = [det_lf, propn_lf, num_lf, adj_lf, noun_lf1, noun_lf2, verb_lf1, word_lf]

nlp = spacy.blank("xx")

# hmm = aggregation.HMM("hmm", ["DET"])

for doc in docs:
    for lf in lfs:
        doc = lf(doc)
    # print(doc.spans)


hmm = skweak.aggregation.HMM("hmm", ["NOUN", "VERB", "DET", "NUM", "PROPN", "ADJ"])
hmm.fit(docs)

for doc in docs:
    doc = hmm(doc)
    
evaluate(docs, all_labels, ["determiners", "proper_nouns", "adjectives", "numerals", "nouns1", "nouns2", "verbs1", "verbs2", "word_lf", "hmm"])
#evaluate(docs, all_labels, ["proper_nouns"])

Starting iteration 1
Finished E-step with 500 documents
Starting iteration 2


         1      -29012.4527             +nan


Finished E-step with 500 documents
Starting iteration 3


         2      -26164.1369       +2848.3158


Finished E-step with 500 documents
Starting iteration 4


         3      -24957.8346       +1206.3023


Finished E-step with 500 documents
Starting iteration 5


         4      -24671.2902        +286.5444


Finished E-step with 500 documents


         5      -24580.6052         +90.6850


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tok_precision,tok_recall,tok_f1,tok_cee,tok_acc,coverage,ent_precision,ent_recall,ent_f1
label,proportion,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ADJ,12.6 %,adjectives,0.329,0.371,0.348,,,,0.329,0.371,0.348
ADJ,12.6 %,determiners,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,12.6 %,hmm,0.244,0.17,0.2,,,,0.244,0.17,0.2
ADJ,12.6 %,nouns1,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,12.6 %,nouns2,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,12.6 %,numerals,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,12.6 %,proper_nouns,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,12.6 %,verbs1,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,12.6 %,verbs2,0.0,0.0,0.0,,,,0.0,0.0,0.0
ADJ,12.6 %,word_lf,0.0,0.0,0.0,,,,0.0,0.0,0.0


In [9]:
utils.display_entities(docs[0], "noun")

In [35]:
import spacy, re
from skweak import heuristics, gazetteers, aggregation, utils

# LF 1: heuristic to detect occurrences of MONEY entities
def money_detector(doc):
    for tok in doc[1:]:
        if tok.text[0].isdigit() and tok.nbor(-1).is_currency:
            yield tok.i-1, tok.i+1, "MONEY"
lf1 = heuristics.FunctionAnnotator("money", money_detector)

# LF 2: detection of years with a regex
lf2= heuristics.TokenConstraintAnnotator("years", lambda tok: re.match("(19|20)\d{2}$",
                                                                       tok.text), "DATE")

# LF 3: a gazetteer with a few names
NAMES = [("Barack", "Obama"), ("Donald", "Trump"), ("Joe", "Biden")]
trie = gazetteers.Trie(NAMES)
lf3 = gazetteers.GazetteerAnnotator("presidents", {"PERSON":trie})

# We create a corpus (here with a single text)
nlp = spacy.blank("xx")
doc = nlp("Donald Trump paid $750 in federal income taxes in 2016")

# apply the labelling functions
doc = lf3(lf2(lf1(doc)))

# and aggregate them
hmm = aggregation.HMM("hmm", ["PERSON", "DATE", "MONEY"])
hmm.fit_and_aggregate([doc])

# we can then visualise the final result (in Jupyter)
utils.display_entities(doc, "hmm")

Starting iteration 1
Finished E-step with 1 documents
Starting iteration 2
Finished E-step with 1 documents


         1         -18.9513             +nan
         2         -19.0673          -0.1160


In [60]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

Donald     Xxxxx True False
Trump     Xxxxx True False
paid     xxxx True False
$     $ False False
750     ddd False False
in     xx True False
federal     xxxx True False
income     xxxx True False
taxes     xxxx True False
in     xx True False
2016     dddd False False


In [38]:
utils.display_entities(doc, "hmm")

In [57]:
doc.spans["years"][0].label_

'DATE'

In [3]:
!pip uninstall -y skweak && pip install skweak==0.2.13
#--user git+https://github.com/NorskRegnesentral/skweak

Found existing installation: skweak 0.3.2
Uninstalling skweak-0.3.2:
  Successfully uninstalled skweak-0.3.2
Looking in indexes: https://pypi.org/simple, http://185.128.246.103/pypicloud/simple
[0mCollecting skweak==0.2.13
  Using cached skweak-0.2.13-py3-none-any.whl (34 kB)
Installing collected packages: skweak
Successfully installed skweak-0.2.13
