In [33]:
import pandas as pd
import conll
import spacy
import numpy
from tqdm import tqdm
from collections import defaultdict
from spacy.tokens import Token, Doc
from sklearn.metrics import classification_report

In [11]:
path="test.txt"

### **Q1: Evaluate spaCy NER on CoNLL 2003 data (provided)**

In [12]:
nlp = spacy.load("en_core_web_sm")
temporary_corpus = conll.read_corpus_conll(path)

In [13]:
nerDict= {
    "CARDINAL": "MISC",
    "DATE": "MISC",
    "EVENT": "MISC",
    "FAC": "LOC",
    "GPE": "LOC",
    "LANGUAGE": "MISC",
    "LAW": "MISC",
    "LOC": "LOC",
    "MONEY": "MISC",
    "NORP": "MISC",
    "ORDINAL": "MISC",
    "ORG": "ORG",
    "PERCENT": "MISC",
    "PERSON": "PER",
    "PRODUCT": "ORG",
    "QUANTITY": "MISC",
    "TIME": "MISC",
    "WORK_OF_ART": "MISC",
    "": ""
}

In [14]:
def convertNER(index):
    sentenceArr=[]
    tagLabel=[]
    ref = []
    hyp = []
    for i in range(len(temporary_corpus[index])):
        current_corpus = temporary_corpus[index][i][0].split()
        sentenceArr.append(current_corpus[0])
        tagLabel.append(current_corpus[-1])
    if "-DOCSTART-" not in sentenceArr:
        sentence = (" ".join(sentenceArr))
        doc = nlp(sentence)
        for i in range(len(temporary_corpus[index])):
            iob=doc[i].ent_iob_
            ent_type=doc[i].ent_type_
            cur_tag = " "
            if iob=="O" or nerDict[ent_type]=="" :
                cur_tag = "O"
            else:
                cur_tag = "{}-{}".format(iob, nerDict[ent_type])
            
            ref.append((str(doc[i]), cur_tag))
            hyp.append((str(doc[i]), tagLabel[i]))
        return ref, hyp
    else:
        return "",""

In [15]:
refs = []
hyps = []
for i in range(len(temporary_corpus)):  #for whole dataset
#for i in range(100): #limited dataset for faster results
    cur_ref, cur_hyp = convertNER(i)
    if cur_ref == "":
        continue
    refs.append(cur_ref)
    hyps.append(cur_hyp)

In [19]:
ref_tags = [tag[1] for ref in refs for tag in ref]
ref_labels = [tag[1] for hyp in hyps for tag in hyp]

In [20]:
## TOKEN EVAL
from sklearn.metrics import classification_report
print(classification_report(ref_labels, ref_tags))

              precision    recall  f1-score   support

       B-LOC       0.61      0.56      0.59      1668
      B-MISC       0.08      0.51      0.14       702
       B-ORG       0.46      0.31      0.37      1661
       B-PER       0.58      0.46      0.51      1617
       I-LOC       0.44      0.49      0.46       257
      I-MISC       0.04      0.36      0.08       216
       I-ORG       0.38      0.49      0.43       835
       I-PER       0.51      0.56      0.53      1156
           O       0.93      0.82      0.87     38323

    accuracy                           0.76     46435
   macro avg       0.45      0.51      0.44     46435
weighted avg       0.85      0.76      0.80     46435



In [21]:
## CHUNK EVAL
results = conll.evaluate(refs, hyps)
print(pd.DataFrame().from_dict(results, orient='index').round(decimals=3))

           p      r      f     s
MISC   0.484  0.080  0.137  4270
ORG    0.264  0.402  0.319  1092
PER    0.424  0.540  0.475  1271
LOC    0.550  0.603  0.576  1522
total  0.422  0.292  0.345  8155


### Q2: **Grouping of Entities. Write a function to group recognized named entities using noun_chunks method of spaCy. Analyze the groups in terms of most frequent combinations (i.e. NER types that go together).**

In [None]:
def groupEnt(sentence):
  

def freqCount(groups):
  dict_group = defaultdict(int)
  for tok in groups:
    key = ", ".join([i for i in tok])
    dict_key[key] = dict_key[key] + 1
  return dict_key

groups = []
freq = defaultdict(int)
for sentence in tqdm(test[:]):
  x = " ".join([s[0] for s in sentence])
  my_tokenization = [s[0] for s in sentence]
  g = group(x)
  frequency_group(g, freq)
  groups.append(g)

counts = pd.DataFrame().from_dict(freq, orient='index', columns=["Count"]).sort_values("Count", ascending=False)
counts.round(decimals=3).head(10)