In [1]:
import datetime as dt
import pickle
import random
import string

import pandas as pd
import spacy
from spacy import displacy
from spacy.util import minibatch
from tqdm.notebook import tqdm

In [2]:
# Load the pre-built training set from a pickle file in the working directory.
# NOTE(review): pickle.load can execute arbitrary code while deserialising —
# only open this file if it comes from a trusted source.
# Presumably a list of (text, {"entities": [...]}) pairs, judging by how later
# cells index each item — TODO confirm against the script that wrote the file.
with open("train_data_all.data", "rb") as f:
    TRAIN_DATA = pickle.load(f)

In [3]:
# Sanity check: number of top-level items loaded from the pickle.
len(TRAIN_DATA)

229414

In [4]:
# Tally entity annotations by label over the whole training set: entries whose
# label (index 2) is 'POI' versus every other entry (counted as street).
entity_labels = [ent[2] for data in TRAIN_DATA for ent in data[1]['entities']]
poi_counter = sum(1 for label in entity_labels if label == 'POI')
street_counter = len(entity_labels) - poi_counter
poi_counter, street_counter

(66634, 196645)

### Prepare the spaCy `nlp` model for training & testing

In [6]:
def create_blank_nlp(train_data):
    """Build a blank English pipeline whose NER component knows every label in ``train_data``.

    Args:
        train_data: Iterable of ``(text, annotations)`` pairs. ``annotations`` is a
            dict; its ``"entities"`` value, when present, is a sequence of entries
            whose item at index 2 is the entity label.

    Returns:
        The blank pipeline with a single "ner" component appended last and one
        label registered per distinct ``ent[2]`` value seen in ``train_data``.
    """
    nlp = spacy.blank("en")
    # create_pipe/add_pipe(component) is the spaCy v2-style pipeline API, which
    # matches the nlp.update(texts, annotations, ...) call used for training.
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
    for _, annotations in train_data:
        # An example with no "entities" key (or an explicit None) contributes
        # no labels instead of raising TypeError when iterating None.
        for ent in annotations.get("entities") or ():
            ner.add_label(ent[2])
    return nlp

In [7]:
# Train on GPU 0. require_gpu raises when no GPU can be used, so this cell
# fails fast instead of silently falling back to a much slower CPU run.
spacy.require_gpu(gpu_id=0)
nlp = create_blank_nlp(TRAIN_DATA)
# To continue from a previously saved pipeline instead of starting blank:
# nlp = spacy.load("all_model-30/")
# NOTE(review): with a loaded pipeline, spaCy v2's begin_training() re-initialises
# the weights; use nlp.resume_training() in that case — confirm for the installed version.
optimizer = nlp.begin_training()


### Training loop

In [8]:
# Training hyper-parameters, named so they are easy to find and tune.
N_ITERATIONS = 10
BATCH_SIZE = 32
DROPOUT = 0.4

for i in range(N_ITERATIONS):
    # Reshuffle in place so each pass sees the examples in a different order.
    random.shuffle(TRAIN_DATA)
    # Fresh dict per pass: the printed figure covers this iteration only.
    losses = {}
    for batch in minibatch(TRAIN_DATA, size=BATCH_SIZE):
        # Split the batch of (text, annotations) pairs into two parallel tuples.
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, drop=DROPOUT, losses=losses)
    print(f"Losses at iteration {i} - {dt.datetime.now()}", losses)

Losses at iteration 0 - 2021-03-20 18:09:22.998696 {'ner': 188585.35436189623}
Losses at iteration 1 - 2021-03-20 18:15:56.596208 {'ner': 130561.62866021511}
Losses at iteration 2 - 2021-03-20 18:23:13.083325 {'ner': 114203.99882685066}
Losses at iteration 3 - 2021-03-20 18:30:25.839592 {'ner': 103601.73642456869}
Losses at iteration 4 - 2021-03-20 18:37:41.745370 {'ner': 97394.58751862573}
Losses at iteration 5 - 2021-03-20 18:45:01.953504 {'ner': 92032.74301381983}
Losses at iteration 6 - 2021-03-20 18:52:39.892582 {'ner': 88023.69195466167}
Losses at iteration 7 - 2021-03-20 19:00:54.360072 {'ner': 84209.73984300713}
Losses at iteration 8 - 2021-03-20 19:09:06.045027 {'ner': 80826.67344680104}
Losses at iteration 9 - 2021-03-20 19:17:20.765953 {'ner': 78659.44173078664}


### Save trained model

In [13]:
# Serialise the trained pipeline to a directory relative to the working
# directory; it can be reloaded later with spacy.load(output_dir).
output_dir = 'all_model-2-5'
nlp.to_disk(output_dir)