In [19]:
# NER: https://spacy.io/usage/linguistic-features#named-entities
# Compatible with: spaCy v2.0.0+

from __future__ import unicode_literals, print_function
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from spacy import displacy

# Record the installed spaCy version next to the results.
print(spacy.__version__)

# Directory from which later cells load the trained model.
output_dir = "./spacy_modell/"

2.0.16


In [20]:
# Load the German small model and run it on a sample sentence.
nlp = spacy.load("de_core_news_sm")
doc = nlp(u"Die Bundesregierung wechselte ihren Sitz von Bonn nach Berlin.")

# One line per recognised entity: text, start offset, end offset, label.
for ent in doc.ents:
    fields = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*fields)

Bonn 45 49 LOC
Berlin 55 61 LOC


In [21]:
# Highlight the recognised entities inline in the notebook.
displacy.render(doc, jupyter=True, style="ent")


In [22]:
# Draw the dependency parse of the same document.
displacy.render(doc, jupyter=True, style="dep")

In [23]:
# A longer German news passage to inspect the entities found by the loaded model.
news_text = u'Theresa Mays Plan B könnte erneut Plan A sein, mit dem die Premierministerin in der vergangenen Woche so krachend im Unterhaus gescheitert ist: das mit Brüssel ausgehandelte Austrittsabkommen. Sie könnte jedoch eine wichtige Änderung vorschlagen: May will, nach Berichten britischer Medien, den Backstopp, die Auffanglösung für die Grenze auf der irischen Insel, noch einmal mit der EU nachverhandeln.'
doc = nlp(news_text)

displacy.render(doc, jupyter=True, style="ent")

In [24]:
# Training data: (text, {"entities": [(start_char, end_char, label), ...]}).
# Offsets are 0-based character positions and the end offset is exclusive,
# so text[start_char:end_char] must be exactly the entity text.
# NOTE(review): "Bonn" (45, 49) and "Berlin" (55, 61) are left unlabelled in the
# first example although the base model tags them as LOC; presumably the
# trainer then treats them as non-entities - confirm this is intended.
TRAIN_DATA = [
    ("Die Bundesregierung wechselte ihren Sitz von Bonn nach Berlin.", {"entities": [(4, 19, "ORG")]}),
    # "London" spans [0, 6) and "Muenchen" spans [8, 16) in this 16-character text.
    ("London, Muenchen", {"entities": [(0, 6, "LOC"), (8, 16, "LOC")]}),
]

# n_iter=("Number of training iterations", "option", "n", int),

def main(model=None, output_dir="./spacy_modell/", n_iter=100):
    """Train the entity recognizer on TRAIN_DATA, print the results and save the model.

    Args:
        model: Name or path of the spaCy model to start from. When None (the
            default) the 'de_core_news_sm' model is loaded.
        output_dir: Directory the trained pipeline is written to and then
            reloaded from as a check. Pass None to skip saving.
        n_iter: Number of passes over the training examples.
    """
    # Honour the `model` argument instead of always loading the default model.
    nlp = spacy.load('de_core_news_sm' if model is None else model)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        print("Creating NER pipe")
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels; an example without an "entities" key contributes none
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])

    # Shuffle a private copy so the module-level TRAIN_DATA keeps its order.
    train_data = list(TRAIN_DATA)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # NOTE(review): with the default model=None a pretrained model has been
        # loaded above and begin_training() is still called; confirm that this
        # is intended rather than continuing from the pretrained weights.
        if model is None:
            nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            # parents=True so a nested output path does not fail on a missing parent
            output_dir.mkdir(parents=True)
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
            print("**********************************")


# Pass the notebook-level output_dir explicitly so the directory written here
# is the same one the following cell loads the model from.
main(output_dir=output_dir)

Losses {'ner': 1.2823313238209864}
Losses {'ner': 0.8512943442585552}
Losses {'ner': 0.6909180520780112}
Losses {'ner': 0.8969011462457601}
Losses {'ner': 0.94362638747993}
Losses {'ner': 1.0668665626032874}
Losses {'ner': 0.5209200885820451}
Losses {'ner': 0.7192603515466358}
Losses {'ner': 0.5283440397939714}
Losses {'ner': 0.5899191964575144}
Losses {'ner': 0.4949422729496291}
Losses {'ner': 0.3343187272222963}
Losses {'ner': 0.041921059868400334}
Losses {'ner': 0.0043766120139139275}
Losses {'ner': 0.006066401189638526}
Losses {'ner': 0.013409836695599697}
Losses {'ner': 0.0050497797923600984}
Losses {'ner': 0.1250299009395448}
Losses {'ner': 0.01144758764985454}
Losses {'ner': 0.003626824829147779}
Losses {'ner': 0.0003537675656133388}
Losses {'ner': 0.00010325078010708921}
Losses {'ner': 8.901068444089333e-06}
Losses {'ner': 5.271787903382563e-06}
Losses {'ner': 0.00022785717742124234}
Losses {'ner': 5.1459612293910245e-06}
Losses {'ner': 5.253432943160691e-07}
Losses {'ner': 3.6

In [25]:
# Replace the pipeline with the model stored under output_dir and try a new sentence.
nlp = spacy.load(output_dir)
doc = nlp(u"Hannover ist die schönste Stadt in Niedersachsen.")

# One line per recognised entity: text, start offset, end offset, label.
for ent in doc.ents:
    fields = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*fields)

Hannover 0 8 LOC


In [26]:
# Highlight the entities found in the sentence above.
displacy.render(doc, jupyter=True, style="ent")

In [27]:
# Re-run the first training sentence through the reloaded model.
training_sentence = u"Die Bundesregierung wechselte ihren Sitz von Bonn nach Berlin."
doc = nlp(training_sentence)
displacy.render(doc, jupyter=True, style="ent")

In [28]:
# Display settings passed to the dependency visualiser.
options = {
    "compact": True,
    "color": "blue",
}
displacy.render(doc, jupyter=True, style="dep", options=options)

In [29]:
# Run the longer German news passage through the reloaded model.
news_text = u'Theresa Mays Plan B könnte erneut Plan A sein, mit dem die Premierministerin in der vergangenen Woche so krachend im Unterhaus gescheitert ist: das mit Brüssel ausgehandelte Austrittsabkommen. Sie könnte jedoch eine wichtige Änderung vorschlagen: May will, nach Berichten britischer Medien, den Backstopp, die Auffanglösung für die Grenze auf der irischen Insel, noch einmal mit der EU nachverhandeln.'
doc = nlp(news_text)

displacy.render(doc, jupyter=True, style="ent")