# Stages

1.   Load data
2.   Check data
3.   Prepare data for spacy
4.   Implement the model
5.   Evaluate the results



In [2]:
import pandas as pd
import spacy
import re
import requests, json 
import plac
import random
from spacy.util import minibatch, compounding

Let's download a sample dataset with 100 annotated recipes

In [3]:
sheet_id = "1r6sqFGnPqPYsXL0BYVH60DAsXfHuT2w0"
sheet_name = "Sheet1"
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"

In [4]:
data = pd.read_csv(url)

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,Headline,Startup
0,0,accedo raises us$17 million in funding,accedo
1,1,uniti scores £1 million funding target through...,uniti
2,2,"cannabis inhaler producer, syqe medical, raise...",syqe medical
3,3,"alphonse's talents raises € 600,000",alphonse's talents
4,4,libon raises €1.8 million,libon


In [6]:
new_data = []
for idx, startup in data.iterrows():
# getting the index of company names in the heading
    new_data.append([m.span() for m in re.finditer(startup['Startup'], startup['Headline'])])
data['my_index'] = new_data

spacy_data = []
for idx, row in data.iterrows():
# setting up data according to spacy requirements
   spacy_data.append([row.Headline, {"entities": [(row['my_index'][0][0],row['my_index'][0][1], "ORG")]}])

In [7]:
idx = 0
spacy_data[idx]

['accedo raises us$17 million in funding', {'entities': [(0, 6, 'ORG')]}]

In [8]:
for entity in spacy_data[idx][1]['entities']:
  print(entity[2],spacy_data[idx][0][entity[0]:entity[1]])

ORG accedo


Building a train/test split after a random shuffle

In [9]:
random.shuffle(spacy_data)
train_data_size = round(len(spacy_data) * 0.8)
print(len(spacy_data), train_data_size)

2797 2238


In [10]:
train_data = spacy_data[:train_data_size]
test_data = spacy_data[train_data_size:]

print(len(train_data))
print(len(test_data))

2238
559


Loading the default spacy model and setting up the ner ML pipeline.

The training process for spacy is deailed here: https://spacy.io/usage/training#ner

In [11]:
nlp = spacy.blank("en")
if "ner" not in nlp.pipe_names:
    print('no ner')
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
else:
    print('add ner')
    ner = nlp.get_pipe("ner")

no ner


Counting how many annotating we have for every type.

In [12]:
labels = {}
for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
            if(ent[2] not in labels):
                labels[ent[2]] = 1
            else:
              labels[ent[2]] +=1
print(labels)

{'ORG': 2238}


In [13]:
n_iter=200

In [14]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):  # only train NER
    # reset and initialize the weights randomly – but only if we're
    # training a new model
    nlp.vocab.vectors.name = 'startup_pretrained_entities'
    nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
#            print([(text,annotation) for text,annotation in zip(texts,annotations)])
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                drop=0.5,  # dropout - make it harder to memorise data
                losses=losses,
            )
        print("Iteration {} Losses".format(itn), losses)

Iteration 0 Losses {'ner': 2296.5528093623475}
Iteration 1 Losses {'ner': 1270.6767905932068}
Iteration 2 Losses {'ner': 1028.8381617840341}
Iteration 3 Losses {'ner': 821.0313050316771}
Iteration 4 Losses {'ner': 660.8623107860788}
Iteration 5 Losses {'ner': 559.2208392989152}
Iteration 6 Losses {'ner': 548.4058385584626}
Iteration 7 Losses {'ner': 482.80468972725663}
Iteration 8 Losses {'ner': 453.8788751776386}
Iteration 9 Losses {'ner': 385.1315311223855}
Iteration 10 Losses {'ner': 372.05361884063973}
Iteration 11 Losses {'ner': 370.1791273927845}
Iteration 12 Losses {'ner': 310.7235501786462}
Iteration 13 Losses {'ner': 261.8098228361881}
Iteration 14 Losses {'ner': 280.2500282545643}
Iteration 15 Losses {'ner': 262.72343357811286}
Iteration 16 Losses {'ner': 223.83822574982085}
Iteration 17 Losses {'ner': 204.9107073922918}
Iteration 18 Losses {'ner': 206.47318216952155}
Iteration 19 Losses {'ner': 156.33964898439538}
Iteration 20 Losses {'ner': 184.68527874989684}
Iteration 21 

We can apply the fitted model to new text

In [15]:
for text, _ in test_data:
    doc = nlp(text)
    print(text)
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
    print("Tokens", [(t.text, t.ent_type_, t.ent_iob_) for t in doc])
    print()

implicity raises €4 million in funding
Entities [('implicity', 'ORG')]
Tokens [('implicity', 'ORG', 'B'), ('raises', '', 'O'), ('€', '', 'O'), ('4', '', 'O'), ('million', '', 'O'), ('in', '', 'O'), ('funding', '', 'O')]

beringea leads £2.4 million investment in deepcrawl
Entities [('deepcrawl', 'ORG')]
Tokens [('beringea', '', 'O'), ('leads', '', 'O'), ('£', '', 'O'), ('2.4', '', 'O'), ('million', '', 'O'), ('investment', '', 'O'), ('in', '', 'O'), ('deepcrawl', 'ORG', 'B')]

versameb raises €5.5 million in seed c funding round
Entities [('versameb', 'ORG')]
Tokens [('versameb', 'ORG', 'B'), ('raises', '', 'O'), ('€', '', 'O'), ('5.5', '', 'O'), ('million', '', 'O'), ('in', '', 'O'), ('seed', '', 'O'), ('c', '', 'O'), ('funding', '', 'O'), ('round', '', 'O')]

israeli b2b marketing platform folloze raises $11 million
Entities [('folloze', 'ORG')]
Tokens [('israeli', '', 'O'), ('b2b', '', 'O'), ('marketing', '', 'O'), ('platform', '', 'O'), ('folloze', 'ORG', 'B'), ('raises', '', 'O'),

Saving to disk for later reuse.

In [16]:
nlp.to_disk("startup_model")

Loading from disk

In [17]:
startup_model = nlp.from_disk("startup_model")

Evaluating on test data

In [18]:
from spacy.gold import GoldParse
from spacy.scorer import Scorer

def evaluate(ner_model, examples):
    scorer = Scorer()
    for input_, annot in examples:
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot['entities'])
        pred_value = ner_model(input_)
        scorer.score(pred_value, gold)
    return scorer.scores

In [19]:
#print(test_data)
results = evaluate(startup_model, test_data)
results

{'ents_f': 91.67429094236047,
 'ents_p': 92.09558823529412,
 'ents_per_type': {'ORG': {'f': 91.67429094236047,
   'p': 92.09558823529412,
   'r': 91.2568306010929}},
 'ents_r': 91.2568306010929,
 'las': 0.0,
 'las_per_type': {'': {'f': 0.0, 'p': 0.0, 'r': 0.0}},
 'tags_acc': 0.0,
 'textcat_score': 0.0,
 'textcats_per_cat': {},
 'token_acc': 100.0,
 'uas': 0.0}

In [20]:
print("Recall",results["ents_r"])
print("Precision",results["ents_p"])
print("F-Measure",results["ents_f"])

Recall 91.2568306010929
Precision 92.09558823529412
F-Measure 91.67429094236047


Computing accuracy

In [21]:
labels = {}
for val in test_data:
    ents = [(ent.start_char, ent.end_char, ent.label_,ent.text)  for ent in nlp(val[0]).ents]
    for goldEntity in val[1]['entities']:
        if not goldEntity[2] in labels:
            labels[goldEntity[2]] = {'total':0,'detected':1}
        for detected in ents:
            if(detected[0] == goldEntity[0] and detected[1] == goldEntity[1] and detected[2] == goldEntity[2]):
                labels[goldEntity[2]]["detected"] += 1
        labels[goldEntity[2]]["total"] += 1

print("ACCURACY SCORE")
print("-----------------------------------------")
for key, value in labels.items():
    if value["total"]:
      print(key, value["detected"]/value["total"])
    else:
      print(key,'NO TEST DATA')

ACCURACY SCORE
-----------------------------------------
ORG 0.8980322003577818
