# Annotate documents for named entity recognition 

Select the document to tag, the output folder to save the annotations to, and a folder to save the model information to:

In [3]:
import os
from ipyfilechooser import FileChooser

doc_loc = FileChooser()
output_loc = FileChooser()
model_loc = FileChooser()

print("Document to tag (.txt file):")
display(doc_loc)
print("Output folder to save annotations to:")
display(output_loc)
print("Output folder to save model to:")
display(model_loc)

Document to tag (.txt file):


FileChooser(path='/home/abertsch/git/ADEPTLab/annotate', filename='', title='HTML(value='', layout=Layout(disp…

Output folder to save annotations to:


FileChooser(path='/home/abertsch/git/ADEPTLab/annotate', filename='', title='HTML(value='', layout=Layout(disp…

Output folder to save model to:


FileChooser(path='/home/abertsch/git/ADEPTLab/annotate', filename='', title='HTML(value='', layout=Layout(disp…

Now run the cell below to annotate the document! You can type a number (+enter) to label an entity, or just press enter to skip any words that are not entities.

In [14]:
import os
import sys
import pickle
from colorama import Fore, Style
from string import punctuation
from IPython.display import clear_output

fps = {}
valid_inputs = {"0": "PERSON", "1": "NORP", "2": "LOC", "3": "FAC",
                "4": "ORG", "5": "GPE", "6": "EVENT", "7": "QUANTITY"}

stanford_core_tags = {"PERSON": "PERSON", "NORP": "O", "LOC": "LOCATION", "FAC": "O",
                       "ORG": "O", "GPE": "LOCATION", "EVENT": "O", "QUANTITY": "O"}

stanford_ann = ""
spacy_ann = []
fileout = ""
pos = 0

def print_tags():
    print("TAG OPTIONS: (press enter to leave untagged)")
    print("0 people, including fictional\t\t4 companies, institutions, etc.")
    print("1 nationalities, religions\t\t5 countries, cities, states")
    print("2 mountains, rivers, etc.\t\t6 events--named hurricanes, etc")
    print("3 buildings, airports, etc.\t\t7 measurements (e.g. weight, distance)")


def annotate(fp):
    file = fp.read()
    words = file.split()
    for i, word in enumerate(words):
        get_tag(i, word, words)

    fps["stanfordnlp-out"].write(stanford_ann)
    pickle.dump(spacy_ann, fps["spacy-out"])
    fps["rawtext-out"].write(fileout)

    for key in fps:
        fps[key].close()



def get_tag(i, word, words):
    clear_output(wait=True)
    print_tags()

    for j in range(i - 3, i):
        if j >= 0:
            print(words[j] + " ", end="", flush=True)

    print(f"{Fore.GREEN} <<" + word + f">> {Style.RESET_ALL}", end="", flush=True)

    for k in range(i + 1, i + 4):
        if k < len(words):
            print(" " + words[k], end="", flush=True)

    tag = input("\n\tTAG? ")
    if tag == "" or tag in valid_inputs:
        write_annotation(word, tag)

    else:
        print(f"\n{Fore.RED}Sorry, not sure what that meant. Try again.{Style.RESET_ALL}", flush=True)
        get_tag(i, word, words)

    print()
    return i

def add_spacy_ann(word, tag):
    word = word.strip(punctuation)
    if(len(spacy_ann) != 0 and spacy_ann[-1][1] == pos and spacy_ann[-1][2] == valid_inputs[tag]):
        spacy_ann.append((spacy_ann[-1][0], spacy_ann[-1][1] + 1 + len(word), valid_inputs[tag]))
        spacy_ann.pop(-2)
    else:
        spacy_ann.append((pos + 1, pos + 1 + len(word), valid_inputs[tag]))


def write_annotation(word, tag):
    global fileout, stanford_ann, spacy_ann, pos

    if tag == "":
        fileout += " " + word
        stanford_ann += word + "\t" + "O" + "\n"
    else:
        fileout += " " + word
        stanford_ann += word + "\t" + stanford_core_tags[valid_inputs[tag]] + "\n"
        add_spacy_ann(word, tag)

    pos = pos + 1 + len(word)

if __name__ == "__main__":
    filename = doc_loc.selected
    write_dir = output_loc.selected
    print(doc_loc.selected)

    fps["input"] = open(filename)
    fps["stanfordnlp-out"] = open(os.path.join(write_dir, filename.split("/")[-1].split(".")[0]+ "-stanfordnlp.tsv"), "w+")
    fps["spacy-out"] = open(os.path.join(write_dir, filename.split("/")[-1].split(".")[0]+ "-spacy.pkl"), "wb+")
    fps["rawtext-out"] = open(os.path.join(write_dir, filename.split("/")[-1].split(".")[0]+ "-rawtext.txt"), "w+")
    annotate(open(filename))


TAG OPTIONS: (press enter to leave untagged, b to go back)
0 people, including fictional		4 companies, institutions, etc.
1 nationalities, religions		5 countries, cities, states
2 mountains, rivers, etc.		6 events--named hurricanes, etc
3 buildings, airports, etc.		7 measurements (e.g. weight, distance)
Department spokesperson Jan [32m <<Smith.>> [0m
	TAG? 0



The outputs have been saved to the output folder specified, in formats suitable for training models with SpaCy or StanfordCore NLP. Run the cell below to train and output a new SpaCy model using that data.

In [5]:

def get_data():
    train_data = []
    for file in os.listdir(output_loc.selected):
        if(file.endswith('.txt')):
            text = open(os.path.join(output_loc.selected, file)).read()
            entities = pickle.load(open(os.path.join(output_loc.selected, file.rstrip('rawtext.txt') + 'spacy.pkl'), 'rb'))
            train_data.append((text, entities))
    print(train_data)
    return train_data

get_data()

[(' The Alps are the most beautiful mountains in the world, according to Swiss National Tourism Department spokesperson Jan Smith.', [(5, 9, 'LOC'), (70, 103, 'ORG'), (117, 126, 'PERSON')])]


[(' The Alps are the most beautiful mountains in the world, according to Swiss National Tourism Department spokesperson Jan Smith.',
  [(5, 9, 'LOC'), (70, 103, 'ORG'), (117, 126, 'PERSON')])]

In [6]:
#!/usr/bin/env python
# coding: utf8
"""Example of training spaCy's named entity recognizer, starting off with an
existing model or a blank model.

For more details, see the documentation:
* Training: https://spacy.io/usage/training
* NER: https://spacy.io/usage/linguistic-features#named-entities

Compatible with: spaCy v2.0.0+
Last tested with: v2.2.4
"""
from __future__ import unicode_literals, print_function

import pickle
import random
import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

def get_data():
    train_data = []
    for file in os.listdir(output_loc.selected):
        if(file.endswith('.txt')):
            print(file)
            text = open(os.path.join(output_loc.selected, file))
            print(text.read())
            entities = "" #pickle.load(open(os.path.join(output_loc.selected, file.rstrip('rawtext.txt') + 'spacy.pkl'), 'rb'))
            train_data.append((text, entities))
    print(train_data)
    return train_data


def main(entities_loc=None, model=None, output_dir=None, n_iter=100):
    output_dir= model_loc.selected
    TRAIN_DATA = get_data()

    """Load the model, set up the pipeline and train the entity recognizer."""
    nlp = spacy.load("en_core_web_lg")  # load existing spaCy model
    print("Loaded model '%s'" % model)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])


if __name__ == "__main__":
    main()

    # Expected output:
    # Entities [('Shaka Khan', 'PERSON')]
    # Tokens [('Who', '', 2), ('is', '', 2), ('Shaka', 'PERSON', 3),
    # ('Khan', 'PERSON', 1), ('?', '', 2)]
    # Entities [('London', 'LOC'), ('Berlin', 'LOC')]
    # Tokens [('I', '', 2), ('like', '', 2), ('London', 'LOC', 3),
    # ('and', '', 2), ('Berlin', 'LOC', 3), ('.', '', 2)]


example-rawtext.txt
 The Alps are the most beautiful mountains in the world, according to Swiss National Tourism Department spokesperson Jan Smith.
[(<_io.TextIOWrapper name='/home/abertsch/git/ADEPTLab/annotate/output/example-rawtext.txt' mode='r' encoding='UTF-8'>, '')]


OSError: [E050] Can't find model 'en_core_web_lg'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

This concludes the annotating and training notebook! Find the output annotations in the output folder and the model in the model folder that were selected in cell 1.