# Named Entity Recognition with Spacy

We have used tutorial from: https://towardsdatascience.com/train-ner-with-custom-training-data-using-spacy-525ce748fab7

### Configuration

In [1]:
from __future__ import unicode_literals, print_function
from collections import defaultdict
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm

### Training

In [2]:
TRAIN_DATA = [
    ('My main research interests are in machine learning, artificial intelligence, and theoretical computer science.', {
        'entities': [(34,50, 'AREA'), (52, 75, 'AREA'), (81, 109, 'AREA')]
    }),
    ('My primary research areas are computational Biology, Bioinformatics and Machine learning.', {
        'entities': [(53,67, 'AREA'), (72,88, 'AREA')]
    }),
    ('I am interested in the intersection of machine learning and systems.', {
        'entities': [(39,55, 'AREA'), (60,77, 'AREA')]
    }),
    ('I work on developing and using Machine Learning, AI, and Data Science methods.', {
        'entities': [(31,47, 'AREA'), (49,51, 'AREA'), (57,69, 'AREA')]
    }),
    ('I am interested in the intersection of machine learning and systems.', {
        'entities': [(39,55, 'AREA'), (60,67, 'AREA')]
    }),
    ('Research Area CS Education', {
    'entities': [(14,26, 'AREA')]
    }),
    ('Research Area CS Education', {
    'entities': [(14,26, 'AREA')]
    }),
('''Research Interests Computational complexity, communication complexity, coding theory, algorithms, and combinatorics''', {
		'entities': [(18, 43, 'AREA'), (45, 69, 'AREA'), (71, 84, 'AREA'), (86, 96, 'AREA'), (102, 115, 'AREA'), ]
}),
('''Research Interests: Computer security, formal methods, automated reasoning''', {
		'entities': [(20, 37, 'AREA'), (39, 53, 'AREA'), (55, 74, 'AREA'), ]
}),
('''Research Interests: Computational Fluid Dynamics, Hydrodynamic Instabilities, Turbulence, Ocean Modeling''', {
		'entities': [(20, 48, 'AREA'), (50, 76, 'AREA'), (78, 88, 'AREA'), (90, 104, 'AREA'), ]
}),
('''Research Interests Signal Processing for wireless communications Computational statistics Statistical signal processing Information theory Bioinformatics''', {
		'entities': [(19, 36, 'AREA'), (65, 89, 'AREA'), (90, 119, 'AREA'), (120, 138, 'AREA'), (139, 153, 'AREA'), ]
}),
('''Research Interests Electronic Design Automation (EDA)Layout optimization Logic synthesis Programmable devices and timing optimization Bridging optimization''', {
		'entities': [(19, 47, 'AREA'), (54, 72, 'AREA'), (73, 88, 'AREA'), (89, 109, 'AREA'), (114, 133, 'AREA'), ]
}),
('''Research interests: Distributed computing; Analysis of algorithms; Data structures; Computational geometry; Graph algorithms''', {
		'entities': [(20, 41, 'AREA'), (43, 65, 'AREA'), (67, 82, 'AREA'), (84, 106, 'AREA'), (108, 124, 'AREA'), ]
}),
('''My research interests are in the general areas of Computer Networking, Information Security, and Distributed Systems.''', {
		'entities': [(50, 69, 'AREA'), (71, 91, 'AREA'), (97, 116, 'AREA'), ]
}),
('''Research Interests Computer Graphics HCI Multimedia Systems Scientific Visualization''', {
		'entities': [(19, 36, 'AREA'), (37, 40, 'AREA'), (41, 59, 'AREA'), ]
}),
('''Research Interests Open source computing, scientometrics, digital libraries.''', {
		'entities': [(19, 40, 'AREA'), (42, 56, 'AREA'), (58, 75, 'AREA'), ]
}),
('''Research Interests: Software Engineering; Software Testing; Software Security;''', {
		'entities': [(20, 40, 'AREA'), (42, 58, 'AREA'), (60, 77, 'AREA')]
}),
('''My own interests are in Software Engineering of Large Systems, Open Source Software, new and innovative Computer Programming languages.''', {
		'entities': [(25, 44, 'AREA'), (63, 83, 'AREA'), (93, 134, 'AREA'), ]
}),
('''Areas of Interest Neuromorphic Computing Machine Learning Image and Pattern Recognition Education''', {
		'entities': [(18, 40, 'AREA'), (41, 57, 'AREA'), (68, 87, 'AREA'), ]
}),
('''His research interests and activities are in real-time and safety-critical system design, scheduling theory, resource allocation and sharing in distributed computing environments, and algorithm design and analysis.''', {
		'entities': [(75, 88, 'AREA'), (144, 178, 'AREA'), (184, 200, 'AREA'), (205, 213, 'AREA'), ]
}),
('''Jonathan's research interests center around computer networking and computer security''', {
		'entities': [(44, 62, 'AREA'), (68, 84, 'AREA'), ]
}),
('''Research Interests: Computer security, network security''', {
		'entities': [(20, 37, 'AREA'), (39, 55, 'AREA'), ]
}),
('''His primary research interests are database systems, object-oriented systems and software engineering.''', {
		'entities': [(35, 51, 'AREA'), (53, 76, 'AREA'), (81, 101, 'AREA'), ]
}),
('''Research Interests My major areas of research are in computational fluid dynamics, adaptive methods for the numerical solution of pdes in complex geometries, and large-scale parallel computing.''', {
		'entities': [(53, 81, 'AREA'), (83, 99, 'AREA'), (162, 192, 'AREA'), ]
}),
('''Research interests: Hidden web search and data integration; information systems design; database design; knowledge representation.''', {
		'entities': [(20, 37, 'AREA'), (60, 86, 'AREA'), (88, 103, 'AREA'), (105, 129, 'AREA'), ]
}),
]

### Faculty dataset

In [3]:
# There are 6525 total number of faculty files
faculty_dataset = defaultdict(str)
for i in range(6525):
    path = "./data/compiled_bios/" + str(i) +".txt"
    f = open(path, "r")    
    for sen in f:
        faculty_dataset[i] = sen.strip()

In [4]:
len(faculty_dataset)

6525

### Setup Language Model

In [5]:
model = None
output_dir=Path("./")
n_iter=200

#load the model
if model is not None:
    nlp = spacy.load(model)  
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")
    
#set up the pipeline
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner, last=True)
else:
    ner = nlp.get_pipe('ner')

Created blank 'en' model


### Training

In [6]:
for _, annotations in TRAIN_DATA:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in tqdm(TRAIN_DATA):
            nlp.update(
                [text],  
                [annotations],  
                drop=0.5,  
                sgd=optimizer,
                losses=losses)
        print(losses)

100%|██████████| 25/25 [00:00<00:00, 49.46it/s]
 20%|██        | 5/25 [00:00<00:00, 35.70it/s]

{'ner': 174.74414750328287}


100%|██████████| 25/25 [00:00<00:00, 33.63it/s]
 16%|█▌        | 4/25 [00:00<00:00, 32.60it/s]

{'ner': 117.4512726413538}


100%|██████████| 25/25 [00:00<00:00, 30.31it/s]
 12%|█▏        | 3/25 [00:00<00:00, 29.21it/s]

{'ner': 111.75985016758915}


100%|██████████| 25/25 [00:00<00:00, 25.46it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.45it/s]

{'ner': 145.27605017136784}


100%|██████████| 25/25 [00:00<00:00, 32.61it/s]
 16%|█▌        | 4/25 [00:00<00:00, 36.06it/s]

{'ner': 112.15782336405319}


100%|██████████| 25/25 [00:00<00:00, 45.35it/s]
 24%|██▍       | 6/25 [00:00<00:00, 58.88it/s]

{'ner': 104.72852374201275}


100%|██████████| 25/25 [00:00<00:00, 52.38it/s]
 24%|██▍       | 6/25 [00:00<00:00, 55.56it/s]

{'ner': 79.51144934447498}


100%|██████████| 25/25 [00:00<00:00, 51.10it/s]
 24%|██▍       | 6/25 [00:00<00:00, 52.24it/s]

{'ner': 73.7747269131336}


100%|██████████| 25/25 [00:00<00:00, 51.97it/s]
 24%|██▍       | 6/25 [00:00<00:00, 52.30it/s]

{'ner': 66.18615335440523}


100%|██████████| 25/25 [00:00<00:00, 51.26it/s]
 24%|██▍       | 6/25 [00:00<00:00, 52.23it/s]

{'ner': 63.5707087267987}


100%|██████████| 25/25 [00:00<00:00, 53.67it/s]
 24%|██▍       | 6/25 [00:00<00:00, 55.00it/s]

{'ner': 62.88324086266915}


100%|██████████| 25/25 [00:00<00:00, 52.83it/s]
 24%|██▍       | 6/25 [00:00<00:00, 55.34it/s]

{'ner': 59.801179284918014}


100%|██████████| 25/25 [00:00<00:00, 53.55it/s]
 24%|██▍       | 6/25 [00:00<00:00, 53.32it/s]

{'ner': 58.54956334304663}


100%|██████████| 25/25 [00:00<00:00, 52.71it/s]
 24%|██▍       | 6/25 [00:00<00:00, 51.23it/s]

{'ner': 52.443927719207956}


100%|██████████| 25/25 [00:00<00:00, 52.96it/s]
 28%|██▊       | 7/25 [00:00<00:00, 60.38it/s]

{'ner': 58.02494804126446}


100%|██████████| 25/25 [00:00<00:00, 52.75it/s]
 24%|██▍       | 6/25 [00:00<00:00, 51.79it/s]

{'ner': 60.15957943700107}


100%|██████████| 25/25 [00:00<00:00, 53.84it/s]
 24%|██▍       | 6/25 [00:00<00:00, 53.00it/s]

{'ner': 57.19294644421494}


100%|██████████| 25/25 [00:00<00:00, 54.06it/s]
 20%|██        | 5/25 [00:00<00:00, 48.79it/s]

{'ner': 46.29298858229833}


100%|██████████| 25/25 [00:00<00:00, 53.97it/s]
 28%|██▊       | 7/25 [00:00<00:00, 61.92it/s]

{'ner': 41.11312797858794}


100%|██████████| 25/25 [00:00<00:00, 53.81it/s]
 24%|██▍       | 6/25 [00:00<00:00, 59.15it/s]

{'ner': 37.04578273484039}


100%|██████████| 25/25 [00:00<00:00, 53.15it/s]
 24%|██▍       | 6/25 [00:00<00:00, 54.10it/s]

{'ner': 33.54154717622292}


100%|██████████| 25/25 [00:00<00:00, 54.38it/s]
 28%|██▊       | 7/25 [00:00<00:00, 62.23it/s]

{'ner': 32.42254489135947}


100%|██████████| 25/25 [00:00<00:00, 53.44it/s]
 24%|██▍       | 6/25 [00:00<00:00, 57.38it/s]

{'ner': 29.962366734519623}


100%|██████████| 25/25 [00:00<00:00, 53.69it/s]
 24%|██▍       | 6/25 [00:00<00:00, 52.91it/s]

{'ner': 24.299875209643268}


100%|██████████| 25/25 [00:00<00:00, 56.11it/s]
 28%|██▊       | 7/25 [00:00<00:00, 60.17it/s]

{'ner': 28.212013076006315}


100%|██████████| 25/25 [00:00<00:00, 55.63it/s]
 24%|██▍       | 6/25 [00:00<00:00, 52.75it/s]

{'ner': 24.953714658116784}


100%|██████████| 25/25 [00:00<00:00, 54.37it/s]
 24%|██▍       | 6/25 [00:00<00:00, 58.16it/s]

{'ner': 17.096934121166065}


100%|██████████| 25/25 [00:00<00:00, 54.28it/s]
 20%|██        | 5/25 [00:00<00:00, 47.34it/s]

{'ner': 27.366158798633073}


100%|██████████| 25/25 [00:00<00:00, 52.71it/s]
 24%|██▍       | 6/25 [00:00<00:00, 49.42it/s]

{'ner': 17.385198809369573}


100%|██████████| 25/25 [00:00<00:00, 52.08it/s]
 24%|██▍       | 6/25 [00:00<00:00, 56.35it/s]

{'ner': 19.139336645857746}


100%|██████████| 25/25 [00:00<00:00, 49.24it/s]
 20%|██        | 5/25 [00:00<00:00, 49.79it/s]

{'ner': 13.197355572560383}


100%|██████████| 25/25 [00:00<00:00, 47.95it/s]
 24%|██▍       | 6/25 [00:00<00:00, 51.68it/s]

{'ner': 16.28188947140616}


100%|██████████| 25/25 [00:00<00:00, 45.81it/s]
 20%|██        | 5/25 [00:00<00:00, 44.45it/s]

{'ner': 10.152965028468877}


100%|██████████| 25/25 [00:00<00:00, 43.65it/s]
 20%|██        | 5/25 [00:00<00:00, 42.21it/s]

{'ner': 13.34181170552732}


100%|██████████| 25/25 [00:00<00:00, 42.40it/s]
 20%|██        | 5/25 [00:00<00:00, 43.45it/s]

{'ner': 15.052932354967343}


100%|██████████| 25/25 [00:00<00:00, 37.92it/s]
 16%|█▌        | 4/25 [00:00<00:00, 37.95it/s]

{'ner': 11.184935372227558}


100%|██████████| 25/25 [00:01<00:00, 23.15it/s]
 12%|█▏        | 3/25 [00:00<00:00, 28.97it/s]

{'ner': 25.946568527973053}


100%|██████████| 25/25 [00:00<00:00, 31.14it/s]
 20%|██        | 5/25 [00:00<00:00, 42.56it/s]

{'ner': 9.150488764707497}


100%|██████████| 25/25 [00:00<00:00, 34.23it/s]
 20%|██        | 5/25 [00:00<00:00, 38.18it/s]

{'ner': 11.226298787213512}


100%|██████████| 25/25 [00:00<00:00, 31.39it/s]
 16%|█▌        | 4/25 [00:00<00:00, 37.36it/s]

{'ner': 8.281336415984985}


100%|██████████| 25/25 [00:00<00:00, 36.02it/s]
 16%|█▌        | 4/25 [00:00<00:00, 35.39it/s]

{'ner': 6.973211481699939}


100%|██████████| 25/25 [00:00<00:00, 33.33it/s]
 16%|█▌        | 4/25 [00:00<00:00, 32.86it/s]

{'ner': 10.803490952368115}


100%|██████████| 25/25 [00:00<00:00, 32.51it/s]
 16%|█▌        | 4/25 [00:00<00:00, 31.95it/s]

{'ner': 14.772432848734478}


100%|██████████| 25/25 [00:00<00:00, 29.12it/s]
 16%|█▌        | 4/25 [00:00<00:00, 30.78it/s]

{'ner': 11.868906276697187}


100%|██████████| 25/25 [00:00<00:00, 29.46it/s]
 16%|█▌        | 4/25 [00:00<00:00, 37.12it/s]

{'ner': 4.6158914497721675}


100%|██████████| 25/25 [00:00<00:00, 35.94it/s]
 16%|█▌        | 4/25 [00:00<00:00, 39.19it/s]

{'ner': 1.5475197159061702}


100%|██████████| 25/25 [00:00<00:00, 38.25it/s]
 16%|█▌        | 4/25 [00:00<00:00, 35.38it/s]

{'ner': 13.770953380851491}


100%|██████████| 25/25 [00:00<00:00, 39.71it/s]
 20%|██        | 5/25 [00:00<00:00, 47.89it/s]

{'ner': 3.963823222083132}


100%|██████████| 25/25 [00:00<00:00, 44.21it/s]
 20%|██        | 5/25 [00:00<00:00, 45.71it/s]

{'ner': 5.555752709477046}


100%|██████████| 25/25 [00:00<00:00, 50.46it/s]
 20%|██        | 5/25 [00:00<00:00, 43.57it/s]

{'ner': 2.3056373663992726}


100%|██████████| 25/25 [00:00<00:00, 50.65it/s]
 20%|██        | 5/25 [00:00<00:00, 48.63it/s]

{'ner': 3.5827558877149186}


100%|██████████| 25/25 [00:00<00:00, 50.57it/s]
 28%|██▊       | 7/25 [00:00<00:00, 57.87it/s]

{'ner': 5.66959855761505}


100%|██████████| 25/25 [00:00<00:00, 50.41it/s]
 24%|██▍       | 6/25 [00:00<00:00, 55.40it/s]

{'ner': 5.346512884665283}


100%|██████████| 25/25 [00:00<00:00, 48.84it/s]
 20%|██        | 5/25 [00:00<00:00, 46.04it/s]

{'ner': 6.039962733261694}


100%|██████████| 25/25 [00:00<00:00, 47.64it/s]
 20%|██        | 5/25 [00:00<00:00, 47.67it/s]

{'ner': 9.681547961251756}


100%|██████████| 25/25 [00:00<00:00, 42.55it/s]
 20%|██        | 5/25 [00:00<00:00, 42.76it/s]

{'ner': 1.512733030760006}


100%|██████████| 25/25 [00:00<00:00, 46.24it/s]
 20%|██        | 5/25 [00:00<00:00, 48.73it/s]

{'ner': 11.871099225216312}


100%|██████████| 25/25 [00:00<00:00, 52.20it/s]
 24%|██▍       | 6/25 [00:00<00:00, 54.47it/s]

{'ner': 2.904605573783634}


100%|██████████| 25/25 [00:00<00:00, 54.68it/s]
 44%|████▍     | 11/25 [00:00<00:00, 56.40it/s]

{'ner': 7.973992483986666}


100%|██████████| 25/25 [00:00<00:00, 55.47it/s]
 24%|██▍       | 6/25 [00:00<00:00, 49.68it/s]

{'ner': 3.7065045057519415}


100%|██████████| 25/25 [00:00<00:00, 56.51it/s]
 24%|██▍       | 6/25 [00:00<00:00, 55.85it/s]

{'ner': 3.655349950063042}


100%|██████████| 25/25 [00:00<00:00, 56.50it/s]
 16%|█▌        | 4/25 [00:00<00:00, 37.17it/s]

{'ner': 4.64226968449936}


100%|██████████| 25/25 [00:00<00:00, 46.38it/s]
 24%|██▍       | 6/25 [00:00<00:00, 56.99it/s]

{'ner': 0.9184324599490781}


100%|██████████| 25/25 [00:00<00:00, 47.10it/s]
 20%|██        | 5/25 [00:00<00:00, 46.54it/s]

{'ner': 1.9553613369333231}


100%|██████████| 25/25 [00:00<00:00, 43.87it/s]
 20%|██        | 5/25 [00:00<00:00, 44.42it/s]

{'ner': 2.7115377970886527}


100%|██████████| 25/25 [00:00<00:00, 42.85it/s]
 16%|█▌        | 4/25 [00:00<00:00, 38.51it/s]

{'ner': 3.906743427841789}


100%|██████████| 25/25 [00:00<00:00, 38.16it/s]
 16%|█▌        | 4/25 [00:00<00:00, 37.51it/s]

{'ner': 1.5058384693898097}


100%|██████████| 25/25 [00:00<00:00, 34.66it/s]
 12%|█▏        | 3/25 [00:00<00:00, 29.33it/s]

{'ner': 0.001647459273681694}


100%|██████████| 25/25 [00:00<00:00, 30.99it/s]
 12%|█▏        | 3/25 [00:00<00:00, 29.31it/s]

{'ner': 2.479888601079216}


100%|██████████| 25/25 [00:00<00:00, 28.91it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.24it/s]

{'ner': 0.06572996772928957}


100%|██████████| 25/25 [00:01<00:00, 23.62it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.52it/s]

{'ner': 3.6084745882571347}


100%|██████████| 25/25 [00:00<00:00, 27.79it/s]
 12%|█▏        | 3/25 [00:00<00:00, 27.86it/s]

{'ner': 0.03903032159088822}


100%|██████████| 25/25 [00:00<00:00, 26.66it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.19it/s]

{'ner': 1.5799271537094528}


100%|██████████| 25/25 [00:00<00:00, 26.30it/s]
 12%|█▏        | 3/25 [00:00<00:00, 27.09it/s]

{'ner': 3.9711627357884582}


100%|██████████| 25/25 [00:00<00:00, 26.04it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.75it/s]

{'ner': 6.123822095052815}


100%|██████████| 25/25 [00:00<00:00, 26.39it/s]
 12%|█▏        | 3/25 [00:00<00:00, 27.15it/s]

{'ner': 2.7432433463008365}


100%|██████████| 25/25 [00:00<00:00, 25.91it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.63it/s]

{'ner': 2.694604202283554}


100%|██████████| 25/25 [00:01<00:00, 24.84it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.08it/s]

{'ner': 6.50371977008936}


100%|██████████| 25/25 [00:00<00:00, 25.13it/s]
 12%|█▏        | 3/25 [00:00<00:00, 22.21it/s]

{'ner': 6.466435475199299}


100%|██████████| 25/25 [00:01<00:00, 23.50it/s]
  8%|▊         | 2/25 [00:00<00:01, 17.95it/s]

{'ner': 10.99540180977661}


100%|██████████| 25/25 [00:01<00:00, 20.37it/s]
 12%|█▏        | 3/25 [00:00<00:01, 20.60it/s]

{'ner': 5.429669459723381}


100%|██████████| 25/25 [00:01<00:00, 21.14it/s]
 12%|█▏        | 3/25 [00:00<00:01, 21.51it/s]

{'ner': 0.0016846781749145269}


100%|██████████| 25/25 [00:01<00:00, 23.95it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.24it/s]

{'ner': 3.845843432096642}


100%|██████████| 25/25 [00:00<00:00, 25.20it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.81it/s]

{'ner': 6.758438940466894}


100%|██████████| 25/25 [00:01<00:00, 24.65it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.30it/s]

{'ner': 2.0777288421952176}


100%|██████████| 25/25 [00:01<00:00, 24.68it/s]
 12%|█▏        | 3/25 [00:00<00:00, 29.18it/s]

{'ner': 8.367707918524204}


100%|██████████| 25/25 [00:00<00:00, 25.02it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.62it/s]

{'ner': 5.4469383291316005}


100%|██████████| 25/25 [00:00<00:00, 25.17it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.94it/s]

{'ner': 5.906387811814445}


100%|██████████| 25/25 [00:00<00:00, 25.46it/s]
 12%|█▏        | 3/25 [00:00<00:00, 27.38it/s]

{'ner': 0.000239355322690101}


100%|██████████| 25/25 [00:01<00:00, 24.75it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.64it/s]

{'ner': 3.550917594012013}


100%|██████████| 25/25 [00:00<00:00, 25.34it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.94it/s]

{'ner': 9.460100554123148}


100%|██████████| 25/25 [00:01<00:00, 24.72it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.45it/s]

{'ner': 0.007591607940571501}


100%|██████████| 25/25 [00:00<00:00, 25.33it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.21it/s]

{'ner': 2.900781218317466}


100%|██████████| 25/25 [00:00<00:00, 25.44it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.44it/s]

{'ner': 1.9932634322491272}


100%|██████████| 25/25 [00:00<00:00, 25.06it/s]
 12%|█▏        | 3/25 [00:00<00:00, 22.92it/s]

{'ner': 3.667519611674335}


100%|██████████| 25/25 [00:01<00:00, 24.85it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.45it/s]

{'ner': 14.665163878820694}


100%|██████████| 25/25 [00:01<00:00, 24.53it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.52it/s]

{'ner': 2.149015031715235}


100%|██████████| 25/25 [00:01<00:00, 24.66it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.40it/s]

{'ner': 0.003516916347329106}


100%|██████████| 25/25 [00:01<00:00, 21.52it/s]
 12%|█▏        | 3/25 [00:00<00:01, 20.00it/s]

{'ner': 5.9069659190778845}


100%|██████████| 25/25 [00:01<00:00, 20.02it/s]
 12%|█▏        | 3/25 [00:00<00:00, 23.92it/s]

{'ner': 7.085668108674828}


100%|██████████| 25/25 [00:01<00:00, 24.86it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.74it/s]

{'ner': 0.05982446468284909}


100%|██████████| 25/25 [00:00<00:00, 25.07it/s]
 12%|█▏        | 3/25 [00:00<00:00, 23.02it/s]

{'ner': 5.704330591990372}


100%|██████████| 25/25 [00:01<00:00, 20.24it/s]
 12%|█▏        | 3/25 [00:00<00:01, 21.71it/s]

{'ner': 2.007416491851671}


100%|██████████| 25/25 [00:01<00:00, 23.23it/s]
 12%|█▏        | 3/25 [00:00<00:00, 27.97it/s]

{'ner': 5.345378891202714}


100%|██████████| 25/25 [00:01<00:00, 24.46it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.21it/s]

{'ner': 2.000645544350936}


100%|██████████| 25/25 [00:00<00:00, 25.04it/s]
 12%|█▏        | 3/25 [00:00<00:00, 22.84it/s]

{'ner': 14.293255162303334}


100%|██████████| 25/25 [00:01<00:00, 24.65it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.22it/s]

{'ner': 2.0093735290058814}


100%|██████████| 25/25 [00:01<00:00, 22.56it/s]
 12%|█▏        | 3/25 [00:00<00:00, 22.01it/s]

{'ner': 0.2288713402390806}


100%|██████████| 25/25 [00:01<00:00, 24.98it/s]
 12%|█▏        | 3/25 [00:00<00:00, 23.75it/s]

{'ner': 1.7572141443818827}


100%|██████████| 25/25 [00:00<00:00, 25.22it/s]
 12%|█▏        | 3/25 [00:00<00:00, 23.58it/s]

{'ner': 5.747548835179731}


100%|██████████| 25/25 [00:01<00:00, 24.39it/s]
 12%|█▏        | 3/25 [00:00<00:00, 27.94it/s]

{'ner': 3.9609443389260064}


100%|██████████| 25/25 [00:00<00:00, 25.51it/s]
 12%|█▏        | 3/25 [00:00<00:00, 28.26it/s]

{'ner': 2.1980224103775643}


100%|██████████| 25/25 [00:00<00:00, 25.01it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.96it/s]

{'ner': 9.121662256920784}


100%|██████████| 25/25 [00:00<00:00, 25.18it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.68it/s]

{'ner': 13.252961028589162}


100%|██████████| 25/25 [00:00<00:00, 25.04it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.20it/s]

{'ner': 2.6858075902372853}


100%|██████████| 25/25 [00:00<00:00, 25.21it/s]
 12%|█▏        | 3/25 [00:00<00:00, 27.93it/s]

{'ner': 6.750551172553067}


100%|██████████| 25/25 [00:00<00:00, 25.68it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.44it/s]

{'ner': 3.2512798451849068}


100%|██████████| 25/25 [00:00<00:00, 25.20it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.75it/s]

{'ner': 2.1063378420718624}


100%|██████████| 25/25 [00:01<00:00, 21.83it/s]
  8%|▊         | 2/25 [00:00<00:01, 18.88it/s]

{'ner': 3.717182176360892}


100%|██████████| 25/25 [00:01<00:00, 24.79it/s]
 12%|█▏        | 3/25 [00:00<00:00, 27.15it/s]

{'ner': 9.872292621339545}


100%|██████████| 25/25 [00:01<00:00, 24.00it/s]
 12%|█▏        | 3/25 [00:00<00:01, 15.75it/s]

{'ner': 2.754474090677694}


100%|██████████| 25/25 [00:01<00:00, 22.13it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.56it/s]

{'ner': 8.052126170488478}


100%|██████████| 25/25 [00:01<00:00, 23.23it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.59it/s]

{'ner': 6.413993554098528}


100%|██████████| 25/25 [00:01<00:00, 23.95it/s]
 12%|█▏        | 3/25 [00:00<00:00, 22.18it/s]

{'ner': 0.0005757120181235394}


100%|██████████| 25/25 [00:01<00:00, 22.49it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.50it/s]

{'ner': 6.001289646575882}


100%|██████████| 25/25 [00:00<00:00, 25.24it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.78it/s]

{'ner': 2.1453425960518433}


100%|██████████| 25/25 [00:00<00:00, 25.35it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.61it/s]

{'ner': 6.53944884350784}


100%|██████████| 25/25 [00:01<00:00, 24.03it/s]
 12%|█▏        | 3/25 [00:00<00:00, 22.87it/s]

{'ner': 0.0867680287311439}


100%|██████████| 25/25 [00:01<00:00, 17.96it/s]
 12%|█▏        | 3/25 [00:00<00:01, 21.91it/s]

{'ner': 0.10719593836346948}


100%|██████████| 25/25 [00:01<00:00, 23.37it/s]
 12%|█▏        | 3/25 [00:00<00:00, 22.10it/s]

{'ner': 0.03197043982982428}


100%|██████████| 25/25 [00:01<00:00, 19.47it/s]
  8%|▊         | 2/25 [00:00<00:01, 16.80it/s]

{'ner': 7.982574963502326}


100%|██████████| 25/25 [00:01<00:00, 20.08it/s]
 12%|█▏        | 3/25 [00:00<00:01, 21.87it/s]

{'ner': 3.9337062745881934}


100%|██████████| 25/25 [00:01<00:00, 22.85it/s]
 12%|█▏        | 3/25 [00:00<00:00, 23.39it/s]

{'ner': 0.5931589934629067}


100%|██████████| 25/25 [00:01<00:00, 18.57it/s]
  8%|▊         | 2/25 [00:00<00:01, 14.45it/s]

{'ner': 0.5719092913342019}


100%|██████████| 25/25 [00:01<00:00, 21.15it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.91it/s]

{'ner': 0.029984605562600363}


100%|██████████| 25/25 [00:01<00:00, 24.85it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.33it/s]

{'ner': 8.450051084238472}


100%|██████████| 25/25 [00:01<00:00, 24.96it/s]
 12%|█▏        | 3/25 [00:00<00:00, 27.07it/s]

{'ner': 1.9826148434203963}


100%|██████████| 25/25 [00:00<00:00, 25.06it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.55it/s]

{'ner': 4.178445419464211}


100%|██████████| 25/25 [00:01<00:00, 24.53it/s]
 12%|█▏        | 3/25 [00:00<00:00, 23.26it/s]

{'ner': 3.4079578067028113}


100%|██████████| 25/25 [00:00<00:00, 25.40it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.79it/s]

{'ner': 1.9976010565814828}


100%|██████████| 25/25 [00:00<00:00, 25.23it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.81it/s]

{'ner': 2.092146427003696}


100%|██████████| 25/25 [00:00<00:00, 25.29it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.74it/s]

{'ner': 0.41912066845745005}


100%|██████████| 25/25 [00:01<00:00, 24.65it/s]
 12%|█▏        | 3/25 [00:00<00:00, 23.66it/s]

{'ner': 5.114189689266784}


100%|██████████| 25/25 [00:01<00:00, 23.86it/s]
 12%|█▏        | 3/25 [00:00<00:01, 20.97it/s]

{'ner': 2.086791342562031}


100%|██████████| 25/25 [00:01<00:00, 22.52it/s]
 12%|█▏        | 3/25 [00:00<00:00, 23.27it/s]

{'ner': 1.8053784221916895}


100%|██████████| 25/25 [00:01<00:00, 24.64it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.19it/s]

{'ner': 8.411980545283463}


100%|██████████| 25/25 [00:00<00:00, 25.17it/s]
 12%|█▏        | 3/25 [00:00<00:00, 27.91it/s]

{'ner': 10.131437090173126}


100%|██████████| 25/25 [00:00<00:00, 25.05it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.09it/s]

{'ner': 0.0018411173102645183}


100%|██████████| 25/25 [00:00<00:00, 25.51it/s]
 12%|█▏        | 3/25 [00:00<00:00, 27.78it/s]

{'ner': 0.0003451399693106367}


100%|██████████| 25/25 [00:00<00:00, 25.68it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.19it/s]

{'ner': 2.7118357897067193}


100%|██████████| 25/25 [00:01<00:00, 19.79it/s]
 12%|█▏        | 3/25 [00:00<00:00, 23.77it/s]

{'ner': 0.8910868395745124}


100%|██████████| 25/25 [00:01<00:00, 19.12it/s]
  8%|▊         | 2/25 [00:00<00:01, 17.26it/s]

{'ner': 2.1275666667529003}


100%|██████████| 25/25 [00:01<00:00, 16.57it/s]
 12%|█▏        | 3/25 [00:00<00:01, 21.01it/s]

{'ner': 4.016600432633272}


100%|██████████| 25/25 [00:01<00:00, 21.85it/s]
 12%|█▏        | 3/25 [00:00<00:00, 22.41it/s]

{'ner': 2.0033819892405287}


100%|██████████| 25/25 [00:01<00:00, 20.57it/s]
 12%|█▏        | 3/25 [00:00<00:00, 23.24it/s]

{'ner': 10.796937136496195}


100%|██████████| 25/25 [00:01<00:00, 21.66it/s]
  8%|▊         | 2/25 [00:00<00:01, 19.44it/s]

{'ner': 11.384378445338726}


100%|██████████| 25/25 [00:01<00:00, 17.78it/s]
  8%|▊         | 2/25 [00:00<00:01, 19.13it/s]

{'ner': 2.011323435881563}


100%|██████████| 25/25 [00:01<00:00, 17.55it/s]
 12%|█▏        | 3/25 [00:00<00:01, 21.15it/s]

{'ner': 0.1706485864914739}


100%|██████████| 25/25 [00:01<00:00, 23.20it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.29it/s]

{'ner': 4.004175164411825}


100%|██████████| 25/25 [00:01<00:00, 24.39it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.39it/s]

{'ner': 5.499134148827474}


100%|██████████| 25/25 [00:01<00:00, 19.22it/s]
  8%|▊         | 2/25 [00:00<00:01, 16.11it/s]

{'ner': 5.603238125823008}


100%|██████████| 25/25 [00:01<00:00, 19.92it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.29it/s]

{'ner': 7.909485869963164}


100%|██████████| 25/25 [00:01<00:00, 22.38it/s]
 12%|█▏        | 3/25 [00:00<00:00, 22.14it/s]

{'ner': 13.811117136883787}


100%|██████████| 25/25 [00:01<00:00, 24.25it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.33it/s]

{'ner': 5.7827506412296}


100%|██████████| 25/25 [00:01<00:00, 21.02it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.22it/s]

{'ner': 6.444713901855347}


100%|██████████| 25/25 [00:00<00:00, 25.75it/s]
 12%|█▏        | 3/25 [00:00<00:00, 27.51it/s]

{'ner': 8.374873187630309}


100%|██████████| 25/25 [00:01<00:00, 22.60it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.89it/s]

{'ner': 10.64384543201993}


100%|██████████| 25/25 [00:00<00:00, 25.01it/s]
 12%|█▏        | 3/25 [00:00<00:01, 21.74it/s]

{'ner': 11.151687362690524}


100%|██████████| 25/25 [00:01<00:00, 20.79it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.07it/s]

{'ner': 9.1129704823102}


100%|██████████| 25/25 [00:01<00:00, 22.19it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.99it/s]

{'ner': 5.6159299036565455}


100%|██████████| 25/25 [00:01<00:00, 24.21it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.86it/s]

{'ner': 2.159342113803122}


100%|██████████| 25/25 [00:01<00:00, 23.97it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.97it/s]

{'ner': 2.0007382195517796}


100%|██████████| 25/25 [00:00<00:00, 25.55it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.60it/s]

{'ner': 0.05719131108919627}


100%|██████████| 25/25 [00:01<00:00, 24.29it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.88it/s]

{'ner': 4.081914780934985}


100%|██████████| 25/25 [00:01<00:00, 24.60it/s]
 12%|█▏        | 3/25 [00:00<00:00, 22.64it/s]

{'ner': 1.8903023938442}


100%|██████████| 25/25 [00:01<00:00, 24.40it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.33it/s]

{'ner': 8.185127069933628}


100%|██████████| 25/25 [00:00<00:00, 25.25it/s]
 12%|█▏        | 3/25 [00:00<00:00, 27.63it/s]

{'ner': 7.345249174848479}


100%|██████████| 25/25 [00:00<00:00, 25.24it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.13it/s]

{'ner': 1.9077080443439265}


100%|██████████| 25/25 [00:01<00:00, 24.15it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.45it/s]

{'ner': 1.0942038665288665}


100%|██████████| 25/25 [00:01<00:00, 22.78it/s]
 12%|█▏        | 3/25 [00:00<00:00, 22.68it/s]

{'ner': 5.2660620081042016}


100%|██████████| 25/25 [00:01<00:00, 24.58it/s]
 12%|█▏        | 3/25 [00:00<00:01, 21.55it/s]

{'ner': 5.258552757750001}


100%|██████████| 25/25 [00:01<00:00, 23.23it/s]
 12%|█▏        | 3/25 [00:00<00:01, 21.22it/s]

{'ner': 0.000294268726169007}


100%|██████████| 25/25 [00:01<00:00, 24.07it/s]
 12%|█▏        | 3/25 [00:00<00:00, 22.97it/s]

{'ner': 3.5518050226071094}


100%|██████████| 25/25 [00:01<00:00, 23.27it/s]
 12%|█▏        | 3/25 [00:00<00:00, 23.92it/s]

{'ner': 4.535449872621885}


100%|██████████| 25/25 [00:01<00:00, 23.46it/s]
 12%|█▏        | 3/25 [00:00<00:00, 23.17it/s]

{'ner': 2.6954820648044406}


100%|██████████| 25/25 [00:01<00:00, 21.09it/s]
 12%|█▏        | 3/25 [00:00<00:01, 19.60it/s]

{'ner': 8.22521112648608}


100%|██████████| 25/25 [00:01<00:00, 21.69it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.63it/s]

{'ner': 7.0798169292857684}


100%|██████████| 25/25 [00:01<00:00, 24.57it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.75it/s]

{'ner': 2.7569916816901565}


100%|██████████| 25/25 [00:00<00:00, 25.87it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.81it/s]

{'ner': 0.9665234414230565}


100%|██████████| 25/25 [00:01<00:00, 24.06it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.01it/s]

{'ner': 1.4154169889887484e-05}


100%|██████████| 25/25 [00:01<00:00, 24.98it/s]
 12%|█▏        | 3/25 [00:00<00:00, 24.36it/s]

{'ner': 1.3662663009385988e-06}


100%|██████████| 25/25 [00:01<00:00, 24.89it/s]
 12%|█▏        | 3/25 [00:00<00:00, 25.41it/s]

{'ner': 0.6496743015866207}


100%|██████████| 25/25 [00:01<00:00, 24.90it/s]
 12%|█▏        | 3/25 [00:00<00:01, 21.51it/s]

{'ner': 3.522646859586508}


100%|██████████| 25/25 [00:01<00:00, 22.74it/s]
 12%|█▏        | 3/25 [00:00<00:00, 27.18it/s]

{'ner': 9.556455864043599}


100%|██████████| 25/25 [00:01<00:00, 21.65it/s]
  8%|▊         | 2/25 [00:00<00:01, 17.46it/s]

{'ner': 1.7576182718589721}


100%|██████████| 25/25 [00:01<00:00, 19.86it/s]
  8%|▊         | 2/25 [00:00<00:01, 19.05it/s]

{'ner': 2.2783847601833593}


100%|██████████| 25/25 [00:01<00:00, 24.30it/s]
 12%|█▏        | 3/25 [00:00<00:01, 20.28it/s]

{'ner': 0.00043443274385013}


100%|██████████| 25/25 [00:01<00:00, 21.18it/s]
 12%|█▏        | 3/25 [00:00<00:00, 26.33it/s]

{'ner': 5.559146353145739}


100%|██████████| 25/25 [00:01<00:00, 24.07it/s]
 12%|█▏        | 3/25 [00:00<00:00, 27.28it/s]

{'ner': 2.294263845962511}


100%|██████████| 25/25 [00:01<00:00, 24.62it/s]

{'ner': 3.506706678258857}





### Testing (Test with training data)

In [7]:
for text, _ in TRAIN_DATA:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

Entities [('Distributed computing', 'AREA'), ('Analysis of algorithms', 'AREA'), ('Data structures', 'AREA'), ('Computational geometry', 'AREA'), ('Graph algorithms', 'AREA')]
Entities [('computer networking', 'AREA'), ('computer security', 'AREA')]
Entities [('CS Education', 'AREA')]
Entities [('system design', 'AREA'), ('distributed computing environments', 'AREA'), ('algorithm design', 'AREA'), ('analysis', 'AREA')]
Entities [('Computer security', 'AREA'), ('formal methods', 'AREA'), ('automated reasoning', 'AREA')]
Entities [('Hidden web search', 'AREA'), ('information systems design', 'AREA'), ('database design', 'AREA'), ('knowledge representation', 'AREA')]
Entities [('computational fluid dynamics', 'AREA'), ('adaptive methods', 'AREA'), ('large-scale parallel computing', 'AREA')]
Entities [('Computer security', 'AREA'), ('network security', 'AREA')]
Entities [('Bioinformatics', 'AREA'), ('Machine learning', 'AREA')]
Entities [('Computational Fluid Dynamics', 'AREA'), ('Hydrodyn

### Testing (Test with all data)

1. In every document, we first find a sentence that start includes "research interest, research interests, interested in"
2. We run our model on this sentence

In [8]:
matcher = spacy.matcher.Matcher(nlp.vocab)
# Add match ID "HelloWorld" with no callback and one pattern
matcher.add("research interest", None,
            [{"LOWER": "research"}, {"LOWER": "interest"}],
            [{"LOWER": "research"}, {"LOWER": "interests"}],
            [{"LOWER": "interested"}, {"LOWER": "in"}],
            [{"LOWER": "research"}, {"LOWER": "areas"}],
            [{"LOWER": "research"}, {"LOWER": "area"}],
            [{"LOWER": "research"}, {"LOWER": "addresses"}],
           )

In [9]:
result = defaultdict(str)

In [10]:
for i, text in faculty_dataset.items():

    doc = nlp(text)
    matches = matcher(doc)
    
    if matches:
        for match_id, start, end in matches:
            span = doc[start:start+30]  # The matched span
            break
    else:
        result[i] = ''
        continue
    
    doc2 = nlp(span.text)
    entities = [(ent.text, ent.label_) for ent in doc2.ents]
    if entities:
        result[i] = entities[0][0]
    else:
        result[i] = ''

### Save result

In [11]:
import os,codecs,re

def save_result(dic_data, out_path):

    with codecs.open(out_path,'w',encoding='utf-8',errors='ignore') as f:
        for i, data in dic_data.items():
            if data:
                f.write(data+'\n')
            else:
                f.write('\n')

save_result(result,'./data/areas')