# Named Entity Recognition with Spacy

We have used tutorial from: https://towardsdatascience.com/train-ner-with-custom-training-data-using-spacy-525ce748fab7

### Configuration

In [1]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm

### Training

In [2]:
TRAIN_DATA = [
    ('My main research interests are in machine learning, artificial intelligence, and theoretical computer science.', {
        'entities': [(34,50, 'AREA'), (52, 75, 'AREA'), (81, 109, 'AREA')]
    }),
    ('My primary research areas are computational Biology, Bioinformatics and Machine learning.', {
        'entities': [(53,67, 'AREA'), (72,88, 'AREA')]
    }),
    ('I am interested in the intersection of machine learning and systems.', {
        'entities': [(39,55, 'AREA'), (60,77, 'AREA')]
    }),
    ('I work on developing and using Machine Learning, AI, and Data Science methods.', {
        'entities': [(31,47, 'AREA'), (49,51, 'AREA'), (57,69, 'AREA')]
    }),
    ('I am interested in the intersection of machine learning and systems.', {
        'entities': [(39,55, 'AREA'), (60,67, 'AREA')]
    }),    
]


### Setup Language Model

In [3]:
model = None
output_dir=Path("./")
n_iter=100

#load the model
if model is not None:
    nlp = spacy.load(model)  
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")
    
#set up the pipeline
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe('ner')

Created blank 'en' model


### Training

In [4]:
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            nlp.update(
                [text],  
                [annotations],  
                drop=0.5,  
                sgd=optimizer,
                losses=losses)
        print(losses)

100%|██████████| 5/5 [00:00<00:00, 41.49it/s]
100%|██████████| 5/5 [00:00<00:00, 55.16it/s]
100%|██████████| 5/5 [00:00<00:00, 55.72it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 51.67547598481178}
{'ner': 36.20493755862117}
{'ner': 22.9045398584567}


100%|██████████| 5/5 [00:00<00:00, 52.62it/s]
100%|██████████| 5/5 [00:00<00:00, 51.08it/s]
100%|██████████| 5/5 [00:00<00:00, 51.24it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 19.499939648820146}
{'ner': 19.816921715313583}
{'ner': 19.815520729058335}


100%|██████████| 5/5 [00:00<00:00, 54.46it/s]
100%|██████████| 5/5 [00:00<00:00, 52.10it/s]
100%|██████████| 5/5 [00:00<00:00, 50.86it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 18.98602657049196}
{'ner': 18.171324101276696}
{'ner': 14.885559097630903}


100%|██████████| 5/5 [00:00<00:00, 47.08it/s]
100%|██████████| 5/5 [00:00<00:00, 50.19it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 19.834681788845955}
{'ner': 18.61323579434611}


100%|██████████| 5/5 [00:00<00:00, 48.84it/s]
100%|██████████| 5/5 [00:00<00:00, 51.02it/s]
100%|██████████| 5/5 [00:00<00:00, 51.76it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 19.392367718534842}
{'ner': 16.09915528325564}
{'ner': 21.598483065076493}


100%|██████████| 5/5 [00:00<00:00, 48.97it/s]
100%|██████████| 5/5 [00:00<00:00, 47.42it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 16.4583444109617}
{'ner': 19.202495730943156}


100%|██████████| 5/5 [00:00<00:00, 39.67it/s]
100%|██████████| 5/5 [00:00<00:00, 47.30it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 12.922788605246556}
{'ner': 9.855551897517497}


100%|██████████| 5/5 [00:00<00:00, 48.62it/s]
100%|██████████| 5/5 [00:00<00:00, 47.21it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 6.8057888288300505}
{'ner': 12.665175162069934}


100%|██████████| 5/5 [00:00<00:00, 47.93it/s]
100%|██████████| 5/5 [00:00<00:00, 52.48it/s]
100%|██████████| 5/5 [00:00<00:00, 55.43it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 11.50571331529249}
{'ner': 4.458763746519036}
{'ner': 11.538580056530291}


100%|██████████| 5/5 [00:00<00:00, 54.21it/s]
100%|██████████| 5/5 [00:00<00:00, 53.73it/s]
100%|██████████| 5/5 [00:00<00:00, 56.04it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 10.89994016845592}
{'ner': 7.154000123282688}
{'ner': 6.0828154923528155}


100%|██████████| 5/5 [00:00<00:00, 54.56it/s]
100%|██████████| 5/5 [00:00<00:00, 56.01it/s]
100%|██████████| 5/5 [00:00<00:00, 54.54it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 2.6909585801322975}
{'ner': 2.797420155504378}
{'ner': 1.7534945984746957}


100%|██████████| 5/5 [00:00<00:00, 50.75it/s]
100%|██████████| 5/5 [00:00<00:00, 55.95it/s]
100%|██████████| 5/5 [00:00<00:00, 55.40it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 8.3187354075188}
{'ner': 1.7594856487182844}
{'ner': 1.2296863434337981}


100%|██████████| 5/5 [00:00<00:00, 46.69it/s]
100%|██████████| 5/5 [00:00<00:00, 48.33it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 1.4934664064568248}
{'ner': 2.5414500669967244}


100%|██████████| 5/5 [00:00<00:00, 52.07it/s]
100%|██████████| 5/5 [00:00<00:00, 47.46it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 4.298758522893129}
{'ner': 4.058565453931786}


100%|██████████| 5/5 [00:00<00:00, 51.92it/s]
100%|██████████| 5/5 [00:00<00:00, 47.60it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 3.6576154541775527}
{'ner': 4.216015887726538}


100%|██████████| 5/5 [00:00<00:00, 49.52it/s]
100%|██████████| 5/5 [00:00<00:00, 53.62it/s]
100%|██████████| 5/5 [00:00<00:00, 56.15it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 1.921226616200237}
{'ner': 2.183193249170095}
{'ner': 1.8291207628102741}


100%|██████████| 5/5 [00:00<00:00, 53.84it/s]
100%|██████████| 5/5 [00:00<00:00, 54.00it/s]
100%|██████████| 5/5 [00:00<00:00, 56.14it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 0.024724404635049466}
{'ner': 2.5628051216393697}
{'ner': 1.8177705013551673}


100%|██████████| 5/5 [00:00<00:00, 52.31it/s]
100%|██████████| 5/5 [00:00<00:00, 52.91it/s]
100%|██████████| 5/5 [00:00<00:00, 52.82it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 3.6859391620405604}
{'ner': 1.6385371377751732}
{'ner': 2.9945966940378703}


100%|██████████| 5/5 [00:00<00:00, 55.80it/s]
100%|██████████| 5/5 [00:00<00:00, 57.23it/s]
100%|██████████| 5/5 [00:00<00:00, 56.20it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 0.07049316369877971}
{'ner': 1.5584346415332144}
{'ner': 1.4434384807702219}


100%|██████████| 5/5 [00:00<00:00, 52.52it/s]
100%|██████████| 5/5 [00:00<00:00, 53.47it/s]
100%|██████████| 5/5 [00:00<00:00, 54.91it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 0.017897692173676913}
{'ner': 0.3953848642820563}
{'ner': 0.11564398706877652}


100%|██████████| 5/5 [00:00<00:00, 53.99it/s]
100%|██████████| 5/5 [00:00<00:00, 56.14it/s]
100%|██████████| 5/5 [00:00<00:00, 55.08it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 0.18298211775383774}
{'ner': 0.00013824994612001237}
{'ner': 0.010740499043206883}


100%|██████████| 5/5 [00:00<00:00, 57.02it/s]
100%|██████████| 5/5 [00:00<00:00, 56.42it/s]
100%|██████████| 5/5 [00:00<00:00, 56.11it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 0.0014988661679017704}
{'ner': 0.15426629803633501}
{'ner': 3.087737811107831e-06}


100%|██████████| 5/5 [00:00<00:00, 56.06it/s]
100%|██████████| 5/5 [00:00<00:00, 59.44it/s]
100%|██████████| 5/5 [00:00<00:00, 59.08it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 2.511770560154288}
{'ner': 1.073410065238915}
{'ner': 0.0003204419797838917}


100%|██████████| 5/5 [00:00<00:00, 53.37it/s]
100%|██████████| 5/5 [00:00<00:00, 51.97it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 0.04617603258032811}
{'ner': 0.06565462778072428}


100%|██████████| 5/5 [00:00<00:00, 48.45it/s]
100%|██████████| 5/5 [00:00<00:00, 53.21it/s]
100%|██████████| 5/5 [00:00<00:00, 50.00it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 0.02052200481455327}
{'ner': 0.0013659208748560718}
{'ner': 0.0008732412909232729}


100%|██████████| 5/5 [00:00<00:00, 52.45it/s]
100%|██████████| 5/5 [00:00<00:00, 55.77it/s]
100%|██████████| 5/5 [00:00<00:00, 56.91it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 3.208221034956032e-05}
{'ner': 0.15502860093631352}
{'ner': 0.00032448503192368267}


100%|██████████| 5/5 [00:00<00:00, 57.42it/s]
100%|██████████| 5/5 [00:00<00:00, 58.17it/s]
100%|██████████| 5/5 [00:00<00:00, 58.05it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 6.891939085695585e-06}
{'ner': 0.029022463752619222}
{'ner': 6.548933378516012e-05}


100%|██████████| 5/5 [00:00<00:00, 51.79it/s]
100%|██████████| 5/5 [00:00<00:00, 50.60it/s]
100%|██████████| 5/5 [00:00<00:00, 54.04it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 1.2163214075161855e-05}
{'ner': 3.913877903068629e-05}
{'ner': 0.016371225796938435}


100%|██████████| 5/5 [00:00<00:00, 56.35it/s]
100%|██████████| 5/5 [00:00<00:00, 53.09it/s]
100%|██████████| 5/5 [00:00<00:00, 48.67it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 4.467374379985515e-06}
{'ner': 2.628773205328309}
{'ner': 0.0018083555301403717}


100%|██████████| 5/5 [00:00<00:00, 48.15it/s]
100%|██████████| 5/5 [00:00<00:00, 49.08it/s]
100%|██████████| 5/5 [00:00<00:00, 52.61it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 6.860198520910335e-05}
{'ner': 0.0003332228447755365}
{'ner': 0.013556781940836624}


100%|██████████| 5/5 [00:00<00:00, 52.26it/s]
100%|██████████| 5/5 [00:00<00:00, 52.86it/s]
100%|██████████| 5/5 [00:00<00:00, 52.34it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 3.958656791600676e-06}
{'ner': 1.637995739664506}
{'ner': 0.0022706314665342426}


100%|██████████| 5/5 [00:00<00:00, 55.10it/s]
100%|██████████| 5/5 [00:00<00:00, 56.80it/s]
100%|██████████| 5/5 [00:00<00:00, 55.68it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 1.1037629440130691e-05}
{'ner': 0.00543476035907666}
{'ner': 1.9999990351476729}


100%|██████████| 5/5 [00:00<00:00, 55.98it/s]
100%|██████████| 5/5 [00:00<00:00, 58.11it/s]
100%|██████████| 5/5 [00:00<00:00, 54.71it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 0.0005703155427968076}
{'ner': 1.5258310017969973e-05}
{'ner': 0.033168765837550095}


100%|██████████| 5/5 [00:00<00:00, 46.71it/s]
100%|██████████| 5/5 [00:00<00:00, 50.07it/s]
100%|██████████| 5/5 [00:00<00:00, 50.97it/s]

{'ner': 2.0005374605335464}
{'ner': 9.283552085463762e-05}



100%|██████████| 5/5 [00:00<00:00, 51.07it/s]
100%|██████████| 5/5 [00:00<00:00, 53.56it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 5.934414175503831e-05}
{'ner': 1.5412995696521633e-05}
{'ner': 7.48118570194512e-08}


100%|██████████| 5/5 [00:00<00:00, 51.87it/s]
100%|██████████| 5/5 [00:00<00:00, 51.16it/s]
100%|██████████| 5/5 [00:00<00:00, 50.11it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

{'ner': 0.00014583715099864803}
{'ner': 1.5635071958453398e-07}
{'ner': 1.305718256473321}


100%|██████████| 5/5 [00:00<00:00, 46.36it/s]

{'ner': 0.0008932468730417904}





### Testing

In [5]:
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

Entities [('machine learning', 'AREA'), ('artificial intelligence', 'AREA'), ('theoretical computer science', 'AREA')]
Entities [('machine learning', 'AREA'), ('systems', 'AREA')]
Entities [('Machine Learning', 'AREA'), ('AI', 'AREA'), ('Data Science', 'AREA')]
Entities [('machine learning', 'AREA'), ('systems', 'AREA')]
Entities [('Bioinformatics', 'AREA'), ('Machine learning', 'AREA')]
