In [None]:
!pip install skweak 

In [None]:
# Mount Google Drive at /content/drive (Colab-only API), requesting a forced remount.
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# Clone the skweak repository into the current working directory.
!git clone https://github.com/NorskRegnesentral/skweak.git

In [None]:


# # data = pd.read_csv("wnut17.txt", sep="\t").fillna(method="ffill")
# data = pd.read_csv("wikigold.conll.txt", sep="\t").fillna(method="ffill")
# # data = pd.read_csv("btca.conll", sep="\t").fillna(method="ffill")
# # data['tag'][0] = 'B-Chemical'
# data.head()

Loading and Tokenizing Data

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import itertools 

def get_tokens_and_ner_tags(filename):
    """Read a tab-separated, CoNLL-style file into one DataFrame row per sentence.

    Every non-blank line is expected to hold a token in its first tab-separated
    column and the NER tag in its second; blank (or whitespace-only) lines
    separate sentences.

    Args:
        filename: Path of the annotation file, read as UTF-8.

    Returns:
        A DataFrame with the columns ``tokens`` and ``ner_tags``; each cell holds
        the list of tokens (respectively tags) of one sentence.

    Raises:
        IndexError: If a non-blank line contains no tab character.
    """
    with open(filename, 'r', encoding="utf8") as f:
        # Strip only the line terminator. The previous `[:-1]` slice removed the
        # last character unconditionally, which truncated the tag when the final
        # line had no trailing newline or when a line had more than two columns.
        lines = [line.rstrip('\r\n') for line in f]

    # Consecutive non-blank lines form one sentence; whitespace-only lines are
    # treated as separators as well instead of being parsed as token rows.
    sentences = [
        list(group)
        for is_blank, group in itertools.groupby(lines, lambda line: not line.strip())
        if not is_blank
    ]
    tokens = [[line.split('\t')[0] for line in sentence] for sentence in sentences]
    entities = [[line.split('\t')[1] for line in sentence] for sentence in sentences]
    return pd.DataFrame({'tokens': tokens, 'ner_tags': entities})

# Parse 'wnut17.txt' into a tokens/ner_tags DataFrame; the commented-out line
# below loads 'btca.conll' instead.
train_doc = get_tokens_and_ner_tags('wnut17.txt')
# train_doc = get_tokens_and_ner_tags('btca.conll')

In [None]:
# Rebuild every pre-tokenised sentence as a single string by appending one space
# after each token (so each resulting string ends with a trailing space).
docs = train_doc.tokens
lines = ["".join(token + " " for token in sentence) for sentence in docs]


In [None]:
# Preview the first reconstructed sentence string.
lines[0]

In [None]:
from typing import Iterable, Tuple
import re, json, os
# import snips_nlu_parsers
from skweak.base import CombinedAnnotator, SpanAnnotator
from skweak.spacy import ModelAnnotator, TruecaseAnnotator
from skweak.heuristics import FunctionAnnotator, TokenConstraintAnnotator, SpanConstraintAnnotator, SpanEditorAnnotator
from skweak.gazetteers import GazetteerAnnotator, extract_json_data
from skweak.doclevel import DocumentHistoryAnnotator, DocumentMajorityAnnotator
from skweak.aggregation import MajorityVoter
from skweak import utils
from spacy.tokens import Doc, Span
import spacy
import skweak


In [None]:
# Download the en_core_web_sm and en_core_web_md spaCy pipelines.
# NOTE(review): only en_core_web_sm is loaded in the cells visible here — confirm
# that en_core_web_md is still needed.
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_md

In [None]:


 
# Load en_core_web_sm with its "ner" and "lemmatizer" components disabled, then
# run the pipeline over every reconstructed sentence string.
# NOTE(review): the strings are tokenised again by spaCy here, so token boundaries
# may differ from the original `tokens` lists — confirm before aligning the
# results with `ner_tags`.
nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
docs = list(nlp.pipe(lines))
# docs = skweak.utils.docbin_reader(lines)
# docs = nlp(lines[0])

Detect Company Names

In [None]:

def company_detector_fun(doc):
    """Yield ``(start, end, "ORG")`` for noun chunks that end in a company suffix.

    The last token of each noun chunk is lower-cased and stripped of trailing
    periods before being compared with the suffix set below.
    """
    company_suffixes = {'corp', 'inc', 'ltd', 'llc', 'sa', 'ag'}
    for noun_chunk in doc.noun_chunks:
        final_word = noun_chunk[-1].lower_.rstrip(".")
        if final_word not in company_suffixes:
            continue
        yield noun_chunk.start, noun_chunk.end, "ORG"

# Wrap the function in a FunctionAnnotator named "company_detector", run it over
# docs, and display that annotation layer for the first document.
company_detector = skweak.heuristics.FunctionAnnotator("company_detector", company_detector_fun)
docs = list(company_detector.pipe(docs))
skweak.utils.display_entities(docs[0], "company_detector")

In [None]:
# Inspect the document at index 5.
docs[5]

Plan to spend a fabulous day in Discover Downtown La Grange for their 5th Annual Chocolate Crawl on a walkable ... http://t.co/lbhSVeriLZ 

Other Organisations

In [None]:
# Cue words (matched case-sensitively against token text) used by
# other_org_detector_fun to flag a noun chunk as an organisation.
OTHER_ORG_CUE_WORDS = {
    "University", "Institute", "College", "Committee", "Party", "Agency",
    "Union", "Association", "Organization", "Court", "Office", "National",
}


def other_org_detector_fun(doc):
    """Yield ``(start, end, "ORG")`` for noun chunks containing a cue word.

    A chunk qualifies as soon as the text of at least one of its tokens is a
    member of OTHER_ORG_CUE_WORDS.
    """
    for noun_chunk in doc.noun_chunks:
        chunk_words = {token.text for token in noun_chunk}
        if chunk_words & OTHER_ORG_CUE_WORDS:
            yield noun_chunk.start, noun_chunk.end, "ORG"


# Wrap the function in a FunctionAnnotator named "other_org_detector", run it over
# docs, and display that annotation layer for the first document.
other_org_detector = skweak.heuristics.FunctionAnnotator("other_org_detector", other_org_detector_fun)
docs = list(other_org_detector.pipe(docs))
skweak.utils.display_entities(docs[0], "other_org_detector")

Detection of Proper nouns

In [None]:
def _has_nnp_tag(tok):
    """Return True when the token's tag_ is exactly "NNP"."""
    return tok.tag_ == "NNP"


def compound(tok):
    """Return True when both is_likely_proper and in_compound hold for the token."""
    return skweak.utils.is_likely_proper(tok) and skweak.utils.in_compound(tok)


# Three token-level detectors, all emitting the label "ENT".
proper_detector = skweak.heuristics.TokenConstraintAnnotator(
    "proper_detector", skweak.utils.is_likely_proper, "ENT")
nnp_detector = skweak.heuristics.TokenConstraintAnnotator(
    "nnp_detector", _has_nnp_tag, "ENT")
compound_detector = skweak.heuristics.TokenConstraintAnnotator(
    "compound_detector", compound, "ENT")

combined = skweak.base.CombinedAnnotator()
for annotator in (proper_detector, nnp_detector, compound_detector):
    annotator.add_gap_tokens(["'s", "-"])
    combined.add_annotator(annotator)

    # Register an "infrequent_<name>" companion that applies the is_infrequent
    # constraint to the spans of the detector registered just above.
    combined.add_annotator(
        skweak.heuristics.SpanConstraintAnnotator(
            f"infrequent_{annotator.name}", annotator.name, skweak.utils.is_infrequent))

docs = list(combined.pipe(docs))
skweak.utils.display_entities(docs[0], "proper_detector")
skweak.utils.display_entities(docs[0], "nnp_detector")
skweak.utils.display_entities(docs[0], "compound_detector")

Detecting Misc Entities - Doesn't work here

In [None]:
def misc_generator(doc):
    """Detects occurrences of countries and various less-common entities (NORP, FAC, EVENT, LANG)

    Candidate spans are every single-token span of ``doc`` plus, when that layer
    is present, the spans stored in ``doc.spans["proper_detector"]``. Each
    candidate is compared with the lookup collections of ``data_utils``.

    Args:
        doc: The document to scan.

    Yields:
        ``(start, end, label)`` tuples with label "GPE", "NORP", "LANGUAGE",
        "FAC" or "EVENT". One span may yield several labels.
    """
    # NOTE(review): `data_utils` is not imported in the visible cells of this
    # notebook, so calling this function raises NameError until it is — it looks
    # like the data_utils module of the skweak NER example; confirm and import it.

    # `spans` used to be referenced before assignment (its initialisation was
    # commented out), so the first call raised UnboundLocalError. The commented
    # line read a "proper2_detector" layer that this notebook never creates, so
    # the existing "proper_detector" layer is used when available.
    spans = set(doc.spans["proper_detector"]) if "proper_detector" in doc.spans else set()
    spans |= {doc[i:i+1] for i in range(len(doc))}

    for span in sorted(spans):

        span_text = span.text
        # Normalise all-caps text ("NORWAY" -> "Norway") before the lookups.
        if span_text.isupper():
            span_text = span_text.title()
        last_token = doc[span.end-1].text

        if span_text in data_utils.COUNTRIES:
            yield span.start, span.end, "GPE"

        # Membership is tested on the span's text, as in the COUNTRIES check
        # above; testing the span object itself against the lookup never matched.
        if len(span) <= 3 and (span_text in data_utils.NORPS or last_token in data_utils.NORPS
                               or last_token.rstrip("s") in data_utils.NORPS):
            yield span.start, span.end, "NORP"

        if span_text in data_utils.LANGUAGES and doc[span.start].tag_ == "NNP":
            yield span.start, span.end, "LANGUAGE"

        if last_token in data_utils.FACILITIES and len(span) > 1:
            yield span.start, span.end, "FAC"

        if last_token in data_utils.EVENTS and len(span) > 1:
            yield span.start, span.end, "EVENT"

Detect name using Regex

In [None]:
# Whole-token pattern: one or more lower-case ASCII letters, spaces, commas,
# periods, apostrophes or hyphens.
# NOTE(review): the character class has no upper-case letters, so capitalised
# tokens never match — confirm whether re.IGNORECASE was intended.
_NAME_TOKEN_PATTERN = re.compile(r"^[a-z ,.'-]+$")


def name_detector(doc):
    """Searches for tokens whose text matches the name regular expression.

    Args:
        doc: A sequence of tokens exposing a ``text`` attribute.

    Yields:
        ``(i, i + 1, "NAME")`` for every token at index ``i`` that matches.
    """
    # Two defects are fixed here:
    #  * the pattern used to be the literal string "r'^...$'" (the raw-string
    #    prefix and quotes were inside the pattern), so nothing ever matched;
    #  * the index was advanced twice after a match although the span covers a
    #    single token, which skipped the token following every match.
    for i, tok in enumerate(doc):
        if _NAME_TOKEN_PATTERN.match(tok.text):
            yield i, i + 1, "NAME"
# Wrap name_detector in a FunctionAnnotator named "name_annotator", run it over
# docs, and display that annotation layer for the first document.
name_annotator = skweak.heuristics.FunctionAnnotator("name_annotator", name_detector)
docs = list(name_annotator.pipe(docs))
skweak.utils.display_entities(docs[0], "name_annotator")

## Gazetteers

General Wiki Gazetteer for all entities

In [None]:
# Load the gazetteer data from the JSON file (presumably tries keyed by entity
# label — confirm against skweak.gazetteers), annotate docs under the "wiki"
# layer, and display that layer for the first document.
tries = skweak.gazetteers.extract_json_data("data/wikidata_small_tokenised.json")
annotator = skweak.gazetteers.GazetteerAnnotator("wiki", tries)
docs = list(annotator.pipe(docs))
# annotator(docs[0])
skweak.utils.display_entities(docs[0], "wiki")

Location Entity


In [None]:
# Build two gazetteer annotators from the same geonames data: "geo_cased" uses
# the default settings, "geo_uncased" is created with case_sensitive=False.
tries = skweak.gazetteers.extract_json_data("data/geonames.json",  spacy_model="en_core_web_sm")
annotator = skweak.gazetteers.GazetteerAnnotator("geo_cased", tries)
annotator2 = skweak.gazetteers.GazetteerAnnotator("geo_uncased", tries, case_sensitive=False)

# Apply the cased annotator first, then the uncased one, materialising the
# documents after each pass.
for geo_annotator in (annotator, annotator2):
    docs = list(geo_annotator.pipe(docs))

skweak.utils.display_entities(docs[0], ["geo_cased", "geo_uncased"])

Detect Names

In [None]:


FIRST_NAMES = "data/first_names.json"


class FullNameDetector():
    """Search for occurrences of full person names (first name followed by at least one title token)

    Instances are callables that take a span and return whether it looks like a
    full name, so they can be used as a span constraint.
    """

    def __init__(self, first_names_path=None):
        """Load the set of known first names.

        Args:
            first_names_path: Optional path of the JSON file to load; the
                module-level FIRST_NAMES path is used when omitted. The decoded
                JSON value is turned into a set (presumably it is an array of
                first-name strings — confirm against the data file).
        """
        if first_names_path is None:
            first_names_path = FIRST_NAMES
        # The context manager closes the file even if json.load raises, and the
        # explicit encoding avoids depending on the platform default.
        with open(first_names_path, encoding="utf-8") as fd:
            self.first_names = set(json.load(fd))

    def __call__(self, span: "Span") -> bool:
        """Return True when the span looks like a full person name.

        The span must be 2 to 5 tokens long, its first token's text must be a
        known first name, and its last token must be alphabetic and title-cased.
        """
        # We assume full names are between 2 and 5 tokens
        if len(span) < 2 or len(span) > 5:
            return False

        return (span[0].text in self.first_names and
                span[-1].is_alpha and span[-1].is_title)

# NOTE(review): `proper_detector` is created again here with the same name and
# arguments as in the proper-noun cell earlier in the notebook, but this new
# instance is never piped over docs and has no gap tokens added. The span
# constraint below refers to "proper_detector" by name only (presumably the
# layer already stored on the docs) — confirm whether this redefinition is needed.
proper_detector = skweak.heuristics.TokenConstraintAnnotator("proper_detector", skweak.utils.is_likely_proper, "ENT")

# Label as "PERSON" the "proper_detector" spans accepted by FullNameDetector,
# then display the resulting layer for the first document.
full_name_detector = skweak.heuristics.SpanConstraintAnnotator("full_name_detector", "proper_detector", FullNameDetector(), "PERSON")
docs = list(full_name_detector.pipe(docs))
skweak.utils.display_entities(docs[0], "full_name_detector")


Using a spaCy model to detect entities

In [None]:
# Annotate docs with a ModelAnnotator built on en_core_web_sm, stored under the
# "spacy" layer, and display that layer for the first document.
ner = skweak.spacy.ModelAnnotator("spacy", "en_core_web_sm")
docs = list(ner.pipe(docs))
skweak.utils.display_entities(docs[0], "spacy")

In [None]:
# hmm = skweak.generative.HMM("hmm", ["COMPANY", "PERSON", "DATE", "MONEY", "ORG", "LOCATION", "GPE"])
# hmm.fit([docs[0]]*5)
# c = []

# for doc in docs:
#   doc = hmm(doc)
#   c.append(doc)
# docs = list(hmm.pipe(docs))
# utils.display_entities(docs[0], "hmm")


Aggregating the results and training the HMM model

In [None]:
# Create the HMM aggregator named "hmm" over four output labels.
# NOTE(review): the recorded output of this cell shows a RuntimeError right after
# "Starting iteration 1". The label sets in this notebook are inconsistent:
#  * the labels listed for "ENT" below ("LOC", "ORG", "PER") are not among the
#    HMM labels given here ("COMPANY", "PERSON", "GPE", "LOCATION");
#  * the labelling functions defined above emit "ORG", "ENT" and "NAME".
# Confirm the intended label schema before relying on this cell.
model = skweak.aggregation.HMM("hmm", ["COMPANY", "PERSON", "GPE", "LOCATION"])

# Declare "ENT" as an underspecified label that can stand for any of the listed labels.
model.add_underspecified_label("ENT", ["LOC", "COMPANY", "ORG", "PER"])

# And run the estimation
model.fit_and_aggregate(docs)
skweak.utils.display_entities(docs[0], "hmm", add_tooltip=True)

Starting iteration 1


RuntimeError: ignored

In [None]:

# Build the NERAnnotator from examples.ner.conll2003_ner with all of its
# annotators added, then run it over docs.
# NOTE(review): `examples` is not imported in any cell visible here, so this line
# raises NameError as written — presumably it refers to the examples package of
# the cloned skweak repository; confirm and import it first.
full_annotator = examples.ner.conll2003_ner.NERAnnotator().add_all()
docs = list(full_annotator.pipe(docs))