In [25]:
# Import requirements
from __future__ import unicode_literals, print_function
import spacy
import plac
from spacy.util import minibatch, compounding
import random
import pickle
from pathlib import Path


In [26]:
# Import training data
# Every non-empty line of 'training_data_all.jsonl' is evaluated as one Python
# expression and appended to `training_data`. Trailing newlines, spaces,
# double quotes and commas are stripped from each line first.
# Later cells unpack each item as a (text, annotations) pair.
training_data = []
with open('training_data_all.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.rstrip('\n ",')
        if not line:
            # Nothing left after stripping (blank line / trailing newline):
            # eval('') would raise SyntaxError, so skip it.
            continue
        # SECURITY NOTE(review): eval() executes arbitrary code found in the
        # file. Only use this with a trusted data file; if the lines are plain
        # Python literals, consider ast.literal_eval instead.
        line = eval(line)
        training_data.append(line)

In [27]:
# Sample sentence that the trained model is run on to inspect its entities.
test_text = (
    "Senterpartiet får en oppslutning på 22,1 prosent, "
    "opp 2,3 prosentpoeng fra november. "
)

In [29]:
# Set output dir and determine number of iterations.
output_dir = 'output'
n_iter = 40

# Load pre-existing spacy model.
new_model_name = 'nb_core_news_sm_ner'
nlp = spacy.load('nb_core_news_sm')
model = nlp
# NOTE: `model` is the loaded Language object, so this prints its repr
# rather than the model's name.
print("Loaded model '%s'" % model)

# Get the NER pipeline component, creating and adding it if the loaded
# pipeline does not already have one.
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
else:
    ner = nlp.get_pipe('ner')

# Register every entity label that occurs in the training data with `ner`.
# Each entity is indexed at position 2 for its label.
for _, annotations in training_data:
    # Fall back to an empty tuple so an example without an "entities" key
    # (or with entities=None) is skipped instead of raising TypeError.
    for ent in annotations.get("entities") or ():
        ner.add_label(ent[2])

optimizer = nlp.entity.create_optimizer()
# Names of all pipeline components except 'ner'; these are disabled while training.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

# Train only the 'ner' component: every other pipeline component is disabled
# for the duration of the `with` block.
with nlp.disable_pipes(*other_pipes):  # Disable pipeline components you dont need to change
    for itn in range(n_iter):  # TRAINING THE MODEL
        random.shuffle(training_data)
        losses = {}
        # Batch size compounds from 4 towards 32 by a factor of 1.001 per batch.
        batches = minibatch(training_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            # drop=0.5: dropout, makes it harder to memorise the data.
            nlp.update(texts, annotations, sgd=optimizer, drop=0.5, losses=losses)

# Testing and saving happen AFTER the `with` block on purpose: while the other
# components are disabled they are not part of the pipeline, so testing or
# calling nlp.to_disk() inside the block would use/save a model without them.

# Test the trained model
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(ent.label_, ent.text)

# Save model
if output_dir is not None:
    output_dir = Path(output_dir)
    # Create the directory (and any missing parents); no error if it exists.
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.meta['name'] = new_model_name  # rename model
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # Test the saved model by reloading it from disk.
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    doc2 = nlp2(test_text)
    for ent in doc2.ents:
        print(ent.label_, ent.text)

Loaded model '<spacy.lang.nb.Norwegian object at 0x7f347b117af0>'
Entities in 'Senterpartiet får en oppslutning på 22,1 prosent, opp 2,3 prosentpoeng fra november. '
POLITICAL PARTY Senterpartiet
Saved model to output
Loading from output
POLITICAL PARTY Senterpartiet
