In [None]:
#!pip install --upgrade spacy
#!pip install spacy
#!python -m spacy download en_core_web_sm
#!pip install nltk

Requirement already up-to-date: spacy in /usr/local/lib/python3.7/dist-packages (3.0.6)
2021-04-30 20:49:33.505564: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[Errno 2] No such file or directory: 'drive/MyDrive/NLU/'
/content/drive/MyDrive/NLU


In [None]:
import conll
import spacy
from spacy.training import Alignment
from spacy.tokens import Span
from sklearn.metrics import classification_report
from collections import Counter
# Load the small English spaCy pipeline once; the cells below reuse this `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Import the CoNLL 2003 dataset (test split) through the project-local `conll` module.
# `corpus` is consumed by get_sents_from_corpus below.
# NOTE(review): `chunks` is not referenced by any other cell visible in this notebook.
corpus = conll.read_corpus_conll("./data/test.txt")
chunks = conll.get_chunks("./data/test.txt")

# Evaluate spaCy NER on CoNLL 2003 data



In [None]:
def get_sents_from_corpus(corpus):
  """Split a CoNLL corpus into parallel sentence, token and tag lists.

  Each row of a sentence is expected to be a sequence whose first element is
  a space-separated line in the format ``Tok POS IOB-POS IOB-Tag``; the first
  field is kept as the token and the last field as its tag.

  Rows whose token is ``-DOCSTART-`` are skipped, but the sentence that held
  them is still emitted (as an empty string / empty lists), so the three
  returned lists always have one entry per input sentence.

  Args:
    corpus: iterable of sentences, each an iterable of rows.

  Returns:
    ``[sents, toks, tags]`` where ``sents`` holds the space-joined sentence
    strings, ``toks`` the per-sentence token lists and ``tags`` the
    per-sentence tag lists.
  """
  sents, toks, tags = [], [], []
  for sent in corpus:
    # Split every row once, then drop the document-separator rows
    rows = [tok[0].split(' ') for tok in sent]
    kept = [(fields[0], fields[-1]) for fields in rows if fields[0] != "-DOCSTART-"]
    words = [word for word, _ in kept]
    labels = [label for _, label in kept]
    toks.append(words)
    sents.append(' '.join(words))
    tags.append(labels)
  return [sents, toks, tags]

In [None]:
# Parallel per-sentence lists: joined sentence strings, gold tokens, gold IOB tags
corp_list, corp_toks, corp_tags = get_sents_from_corpus(corpus)

## Token-level performance

In [None]:
def map_tags_from_spacy(doc):
  """Convert spaCy entity annotations to CoNLL-style IOB tags.

  Every token yields either ``"O"`` or ``"<IOB>-<TYPE>"`` where ``<IOB>`` is
  the token's ``ent_iob_`` and ``<TYPE>`` is the CoNLL label its spaCy entity
  type maps to. Entity types missing from the mapping fall back to ``"MISC"``.

  Tokens whose ``ent_iob_`` is the empty string (no entity annotation set)
  are treated as outside any entity, so a malformed ``"-MISC"`` tag is never
  produced.

  Args:
    doc: iterable of tokens exposing ``ent_iob_`` and ``ent_type_``.

  Returns:
    A list of tag strings, one per token, in token order.
  """
  # spaCy entity type -> CoNLL entity type
  # NOTE(review): every unlisted type (CARDINAL, DATE, ...) becomes MISC, and
  # NORP is mapped to ORG - confirm this matches the CoNLL 2003 guidelines,
  # since it directly drives the MISC/ORG scores.
  switch_dict = {
      "FAC": "LOC",
      "GPE": "LOC",
      "LOC": "LOC",
      "NORP": "ORG",
      "ORG": "ORG",
      "PERSON": "PER"
  }
  tags = []
  for tok in doc:
    if tok.ent_iob_ in ("O", ""):
      # Outside an entity, or no entity annotation available for this token
      tags.append("O")
    else:
      tags.append(tok.ent_iob_ + "-" + switch_dict.get(tok.ent_type_, "MISC"))
  return tags

In [None]:
#Align the spaCy tokens on the CoNLL tokens
def align_tokens_and_tags(spacy_toks, spacy_tags, conll_toks):
  """Project spaCy tokens and tags onto the CoNLL tokenization.

  The number of spaCy tokens covering each CoNLL token is read from
  ``Alignment.from_strings(conll_toks, spacy_toks).x2y.lengths``. For every
  CoNLL token the covering spaCy tokens are concatenated back into a single
  string, and the tag of the first covering spaCy token is kept.

  NOTE(review): the running offset assumes each CoNLL token maps to its own
  run of spaCy tokens (spaCy only ever splits further) - confirm that holds
  for the inputs used here.

  Args:
    spacy_toks: token texts produced by spaCy.
    spacy_tags: one tag per entry of ``spacy_toks``.
    conll_toks: reference token texts to align onto.

  Returns:
    ``[new_toks, new_tags]``, each with one entry per alignment length.
  """
  span_lengths = Alignment.from_strings(conll_toks, spacy_toks).x2y.lengths
  merged_toks, merged_tags = [], []
  cursor = 0  # Index of the first spaCy token of the current CoNLL token
  for width in span_lengths:
    merged_tags.append(spacy_tags[cursor])
    if width == 1:
      piece = spacy_toks[cursor]
    else:
      # Glue the pieces spaCy split apart back into one token
      piece = ''.join(spacy_toks[cursor:cursor + width])
    merged_toks.append(piece)
    cursor += width
  return [merged_toks, merged_tags]

In [None]:
# Run spaCy on every sentence and store its tokens/tags re-aligned to the CoNLL tokens
spacy_toks, spacy_tags = [], []
for i, sent in enumerate(corp_list):
  doc = nlp(sent)
  doc_toks = [t.text for t in doc]
  doc_tags = map_tags_from_spacy(doc)
  toks, tags = align_tokens_and_tags(doc_toks, doc_tags, corp_toks[i])
  spacy_toks.append(toks)
  spacy_tags.append(tags)

In [None]:
# Flatten the per-sentence tag lists: the report works on one label per token
rep_conll = [tag for sent_tags in corp_tags for tag in sent_tags]
rep_spacy = [tag for sent_tags in spacy_tags for tag in sent_tags]
# Gold (CoNLL) labels come first, spaCy predictions second
report = classification_report(rep_conll, rep_spacy)
print(report)

              precision    recall  f1-score   support

       B-LOC       0.76      0.68      0.72      1668
      B-MISC       0.02      0.09      0.03       702
       B-ORG       0.38      0.32      0.35      1661
       B-PER       0.80      0.63      0.70      1617
       I-LOC       0.54      0.56      0.55       257
      I-MISC       0.05      0.36      0.08       216
       I-ORG       0.42      0.52      0.46       835
       I-PER       0.84      0.79      0.81      1156
           O       0.94      0.86      0.90     38323

    accuracy                           0.80     46435
   macro avg       0.53      0.53      0.51     46435
weighted avg       0.88      0.80      0.84     46435



## Chunk-level performance

In [None]:
# Build, per sentence, one [token, gold tag, predicted tag] triple per token
# and hand the nested list to conll.conlleval
chunks_eval_list = [
  [[tok, corp_tags[i][j], spacy_tags[i][j]] for j, tok in enumerate(sent)]
  for i, sent in enumerate(spacy_toks)
]
results = conll.conlleval(chunks_eval_list)

In [None]:
# Display the chunk-level scores computed by conll.conlleval in the previous cell.
results
# Keys of each per-type entry (and of 'total'):
#p is Precision
#r is Recall
#f is F1 measure
#s is the count

{'LOC': {'f': 0.7079254815282602,
  'p': 0.7478318879252835,
  'r': 0.6720623501199041,
  's': 1668},
 'MISC': {'f': 0.02575721440496065,
  'p': 0.015468347178458894,
  'r': 0.07692307692307693,
  's': 702},
 'ORG': {'f': 0.3127057274522712,
  'p': 0.3449527959331881,
  'r': 0.2859723058398555,
  's': 1661},
 'PER': {'f': 0.6814404432132963,
  'p': 0.7741935483870968,
  'r': 0.608534322820037,
  's': 1617},
 'total': {'f': 0.39650760198705404,
  'p': 0.3448546739984289,
  'r': 0.4663597733711048,
  's': 5648}}

# Grouping of entities
Write a function to group recognized named entities using spaCy's `noun_chunks` property. 
<br>Analyze the groups in terms of most frequent combinations (i.e. NER types that go together)

In [None]:
def get_ne_groups(list_sent):
  """Group the named entities of each sentence by the noun chunk they fall in.

  Every sentence is parsed with the module-level ``nlp`` pipeline. Only noun
  chunks that contain at least one entity are considered. Each such chunk
  contributes exactly one group: the sorted, de-duplicated entity types of
  all tokens in the chunk. An entity that does not start inside any of those
  chunks contributes its own single group, wherever it sits in the sentence
  (before the first chunk, between chunks, or after the last one).

  Args:
    list_sent: iterable of sentence strings.

  Returns:
    A list with one entry per sentence; each entry is a list of groups and
    each group is a sorted list of entity-type strings.
  """
  sent_groups = []
  for sent in list_sent:
    doc = nlp(sent)
    # (start, end) token boundaries of the noun chunks that contain at least one NE
    chunks = [(c.start, c.end) for c in doc.noun_chunks if c.ents]
    groups = []
    chunk_i = 0        # First chunk that can still contain the current NE
    emitted_chunk = -1  # Index of the last chunk whose group was already added
    for ne in doc.ents:
      # Entities come in document order: drop chunks that end before this NE starts
      while chunk_i < len(chunks) and chunks[chunk_i][1] <= ne.start:
        chunk_i += 1
      in_chunk = chunk_i < len(chunks) and chunks[chunk_i][0] <= ne.start
      if not in_chunk:
        # Stand-alone NE: it forms a group of its own
        groups.append(sorted({tok.ent_type_ for tok in ne}))
      elif emitted_chunk != chunk_i:
        # First NE met in this chunk: add the chunk's group once,
        # keeping only tokens that actually carry an entity type
        start, end = chunks[chunk_i]
        groups.append(sorted({tok.ent_type_ for tok in doc[start:end] if tok.ent_type_ != ""}))
        emitted_chunk = chunk_i
      # Otherwise the NE belongs to a chunk that has already been counted
    sent_groups.append(groups)
  return sent_groups

In [None]:
# Entity-type groups for every sentence of the corpus (each sentence is re-parsed by get_ne_groups)
grouped_ne = get_ne_groups(corp_list)

In [None]:
# Count how often each combination of entity types occurs; a group is keyed
# by its types joined with a single space
group_counts = Counter(
  ' '.join(group)
  for sent in grouped_ne
  for group in sent
)
group_counts.most_common()

[('CARDINAL', 1399),
 ('GPE', 1300),
 ('PERSON', 1093),
 ('DATE', 956),
 ('ORG', 874),
 ('NORP', 305),
 ('MONEY', 140),
 ('ORDINAL', 121),
 ('TIME', 81),
 ('PERCENT', 78),
 ('EVENT', 63),
 ('CARDINAL PERSON', 59),
 ('QUANTITY', 55),
 ('LOC', 51),
 ('NORP PERSON', 44),
 ('GPE PERSON', 39),
 ('PRODUCT', 30),
 ('ORG PERSON', 25),
 ('CARDINAL ORG', 25),
 ('FAC', 21),
 ('GPE ORG', 20),
 ('CARDINAL GPE', 18),
 ('CARDINAL NORP', 16),
 ('DATE ORG', 15),
 ('NORP ORG', 12),
 ('GPE PRODUCT', 10),
 ('WORK_OF_ART', 9),
 ('DATE EVENT', 8),
 ('LANGUAGE', 7),
 ('DATE TIME', 7),
 ('NORP ORDINAL', 6),
 ('LAW', 6),
 ('DATE NORP', 6),
 ('DATE GPE', 5),
 ('GPE ORDINAL', 5),
 ('CARDINAL DATE', 5),
 ('ORDINAL PERSON', 5),
 ('ORDINAL ORG', 3),
 ('DATE NORP PERSON', 3),
 ('EVENT ORDINAL', 3),
 ('FAC GPE', 3),
 ('CARDINAL EVENT', 3),
 ('CARDINAL GPE PERSON', 2),
 ('CARDINAL ORDINAL', 2),
 ('LANGUAGE ORDINAL', 2),
 ('CARDINAL PERCENT', 2),
 ('DATE GPE ORG', 2),
 ('MONEY ORG', 2),
 ('GPE LOC', 2),
 ('GPE NORP', 2

# Write a function that extends the entity span to cover the full noun-compounds
One of the possible post-processing steps is to fix segmentation errors. Make use of the `compound` dependency relation.

In [None]:
# NOTE(review): modify_ne_noun_compounds and get_imm_compound_tokens are defined
# in cells further down; those cells must be executed before this one.
spacy_tags_post = []
for i, sent in enumerate(corp_list):
  # Modify the entities if there are noun compounds
  doc = modify_ne_noun_compounds(nlp(sent))
  # Map the tokens back onto the CoNLL tokenization; only the tags are kept
  doc_toks = [t.text for t in doc]
  doc_tags = map_tags_from_spacy(doc)
  toks, tags = align_tokens_and_tags(doc_toks, doc_tags, corp_toks[i])
  spacy_tags_post.append(tags)


SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .
Nadim Ladki
AL-AIN , United Arab Emirates 1996-12-06
Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday .
But China saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan .
China controlled most of the match and saw several chances missed until the 78th minute when Uzbek striker Igor Shkvyrin took advantage of a misdirected defensive header to lob the ball over the advancing Chinese keeper and into an empty net .
Oleg Shatskiku made sure of the win in injury time , hitting an unstoppable left foot shot from just outside the area .
The former Soviet republic was playing in an Asian Cup finals tie for the first time .
Despite winning the Asian Games title two years ago , Uzbekistan are in the finals as outsiders .
Two goals from defensive errors in the last six minutes allowed Japan to come f

ValueError: ignored

In [None]:
def modify_ne_noun_compounds(doc):
  """Widen every named entity of ``doc`` to cover its noun-compound tokens.

  For each entity, the compound tokens reachable from its tokens are
  collected with ``get_imm_compound_tokens``. If there are any, the entity
  is replaced by a span with the same label running from the left-most to
  the right-most of {entity tokens, compound tokens}. Entities without
  compounds are kept unchanged.

  The widened span never reaches into a neighbouring entity: its start is
  clamped to the end of the previously accepted span and its end to the
  start of the next entity, so the resulting spans never overlap when they
  are assigned to ``doc.ents``.

  Args:
    doc: a parsed spaCy ``Doc`` with entities set.

  Returns:
    The same ``doc`` object, with ``doc.ents`` replaced in place.
  """
  ents = list(doc.ents)
  new_spans = []
  prev_end = 0  # End (exclusive) of the last span already accepted
  for pos, ne in enumerate(ents):
    compounds = []
    for tok in ne:
      compounds += get_imm_compound_tokens(tok, [])
    if not compounds:
      new_spans.append(ne)
      prev_end = ne.end
      continue
    idxs = [t.i for t in compounds]
    # The widened span may not run past the start of the following entity
    next_start = ents[pos + 1].start if pos + 1 < len(ents) else len(doc)
    start = max(min(min(idxs), ne.start), prev_end)
    # Span ends are exclusive, hence +1 to include the right-most compound token
    end = min(max(max(idxs) + 1, ne.end), next_start)
    new_spans.append(Span(doc, start, end, label=ne.label_))
    prev_end = end
  doc.ents = new_spans
  return doc

In [None]:
def get_imm_compound_tokens(tok, l_tok):
  """Collect the chain of non-entity ``compound`` dependents below ``tok``.

  A child is collected when its dependency label is ``"compound"`` and its
  ``ent_iob_`` is ``"O"``; the search then continues through that child's own
  children before moving on to its siblings. Children that do not qualify
  are neither collected nor descended into.

  Args:
    tok: token exposing ``children``; each child exposes ``dep_``,
      ``ent_iob_`` and ``children``.
    l_tok: list the collected tokens are appended to (modified in place).

  Returns:
    The same ``l_tok`` list.
  """
  exhausted = object()  # Sentinel marking the end of a children iterator
  pending = [iter(tok.children)]
  while pending:
    child = next(pending[-1], exhausted)
    if child is exhausted:
      pending.pop()
    elif child.dep_ == "compound" and child.ent_iob_ == "O":
      l_tok.append(child)
      # Visit this child's subtree before its remaining siblings
      pending.append(iter(child.children))
  return l_tok