# Annotate documents for named entity recognition 

Select the document to tag and the output folder to save the annotations to.

In [None]:
from ipyfilechooser import FileChooser

# Two independent chooser widgets; the annotation cell further down reads
# their selections through the names `doc_loc` and `output_loc`.
doc_loc, output_loc = FileChooser(), FileChooser()

# Input document picker.
print("Document to tag (.txt file):")
display(doc_loc)

# Output location picker.
print("Output folder to save annotations to:")
display(output_loc)


Now run the cell below to annotate the document! For each highlighted word, type a tag number (0–7) and press Enter to label it as an entity, or just press Enter to skip a word that is not an entity.

In [None]:
import os
import sys
import pickle
from colorama import Fore, Style
from string import punctuation
from IPython.display import clear_output

# Open file handles for the current run, keyed by role
# ("input", "stanfordnlp-out", "spacy-out", "rawtext-out").
fps = {}

# Menu key typed by the annotator -> entity label stored in `spacy_ann`.
# Keys are the strings "0".."7", in the order shown by print_tags().
valid_inputs = {
    str(number): label
    for number, label in enumerate(
        ("PERSON", "NORP", "LOC", "FAC", "ORG", "GPE", "EVENT", "QUANTITY")
    )
}

# Entity label -> tag written to the Stanford TSV output. Labels mapped to
# "O" receive the same tag that untagged words get.
# NOTE(review): "ORG" maps to "O" here - confirm that is intended rather than
# a dedicated organization class.
stanford_core_tags = {
    "PERSON": "PERSON",
    "NORP": "O",
    "LOC": "LOCATION",
    "FAC": "O",
    "ORG": "O",
    "GPE": "LOCATION",
    "EVENT": "O",
    "QUANTITY": "O",
}

# Accumulators filled in while annotating (see write_annotation):
fileout = ""        # raw text, rebuilt as " " + word for every word
pos = 0             # running length of `fileout`
stanford_ann = ""   # one "word<TAB>tag" line per word
spacy_ann = []      # (start, end, label) tuples

def print_tags():
    """Print the tag menu: a header line followed by the eight numbered
    tag descriptions, laid out in two tab-separated columns."""
    menu = (
        "TAG OPTIONS: (press enter to leave untagged)",
        "0 people, including fictional\t\t4 companies, institutions, etc.",
        "1 nationalities, religions\t\t5 countries, cities, states",
        "2 mountains, rivers, etc.\t\t6 events--named hurricanes, etc",
        "3 buildings, airports, etc.\t\t7 measurements (e.g. weight, distance)",
    )
    for row in menu:
        print(row)


def annotate(fp):
    """Interactively tag every whitespace-separated word read from `fp`,
    then write the three output files.

    Args:
        fp: readable text file object holding the document to annotate.

    Side effects:
        Prompts once per word via get_tag(), which fills the module-level
        accumulators. On success, writes `stanford_ann` to
        fps["stanfordnlp-out"], pickles `spacy_ann` to fps["spacy-out"] and
        writes `fileout` to fps["rawtext-out"]. Every handle in `fps`, and
        `fp` itself, is closed before returning - also when annotation is
        interrupted or fails, in which case nothing is written.
    """
    try:
        words = fp.read().split()
        for i, word in enumerate(words):
            get_tag(i, word, words)

        fps["stanfordnlp-out"].write(stanford_ann)
        pickle.dump(spacy_ann, fps["spacy-out"])
        fps["rawtext-out"].write(fileout)
    finally:
        # Release the handles even if the annotator interrupts the run or an
        # error is raised; closing an already-closed file is a no-op, so it
        # does not matter whether `fp` is also stored in `fps`.
        for handle in fps.values():
            handle.close()
        fp.close()



def get_tag(i, word, words):
    """Prompt for the tag of `words[i]` and record the answer.

    The word is shown highlighted between up to three words of context on
    either side. Pressing enter leaves it untagged; a key of `valid_inputs`
    tags it. Any other input re-prompts with an error message.

    Args:
        i: index of the word within `words`.
        word: the word to tag (`words[i]`).
        words: the full list of words of the document.

    Returns:
        `i`, unchanged.
    """
    before = "".join(w + " " for w in words[max(i - 3, 0):i])
    after = "".join(" " + w for w in words[i + 1:i + 4])
    error = ""

    # Loop instead of recursing on bad input: the screen is cleared at the
    # start of every attempt, so the error has to be printed *after* the
    # clear, otherwise the annotator never gets to see it.
    while True:
        clear_output(wait=True)
        print_tags()
        if error:
            print(error, flush=True)

        print(before, end="", flush=True)
        print(f"{Fore.GREEN} <<" + word + f">> {Style.RESET_ALL}", end="", flush=True)
        print(after, end="", flush=True)

        # Tolerate stray whitespace around the typed number.
        tag = input("\n\tTAG? ").strip()
        if tag == "" or tag in valid_inputs:
            write_annotation(word, tag)
            break

        error = f"\n{Fore.RED}Sorry, not sure what that meant. Try again.{Style.RESET_ALL}"

    print()
    return i

def add_spacy_ann(word, tag):
    """Record a (start, end, label) character span for `word` in `spacy_ann`.

    Offsets index into `fileout` as it will look once " " + word has been
    appended: the word itself starts at `pos + 1`. Surrounding punctuation is
    left out of the span. If the previous span ends exactly at `pos` and has
    the same label, it is extended to cover this word instead of adding a
    new span.

    Must be called before `pos` is advanced past `word`.

    Args:
        word: the word exactly as it is written to `fileout`.
        tag: a key of `valid_inputs`.

    Raises:
        KeyError: if `tag` is not a key of `valid_inputs`.
    """
    label = valid_inputs[tag]

    core = word.strip(punctuation)
    if core:
        # Punctuation removed from the front shifts where the span starts.
        lead = len(word) - len(word.lstrip(punctuation))
    else:
        # The token is nothing but punctuation: keep it whole rather than
        # recording an empty span.
        core, lead = word, 0

    start = pos + 1 + lead
    end = start + len(core)

    if spacy_ann and spacy_ann[-1][1] == pos and spacy_ann[-1][2] == label:
        # Previous word carried the same label and ran right up to the
        # separating space: grow that span.
        spacy_ann[-1] = (spacy_ann[-1][0], end, label)
    else:
        spacy_ann.append((start, end, label))


def write_annotation(word, tag):
    """Append one word and its tag to the module-level accumulators.

    Args:
        word: the word as read from the document.
        tag: "" for an untagged word, otherwise a key of `valid_inputs`.

    Side effects:
        Extends `fileout` with " " + word, adds a "word<TAB>tag" line to
        `stanford_ann` ("O" for untagged words), records a span through
        add_spacy_ann() for tagged words, then advances `pos` by
        1 + len(word).
    """
    global fileout, stanford_ann, pos

    is_tagged = tag != ""

    fileout += " " + word
    stanford_tag = stanford_core_tags[valid_inputs[tag]] if is_tagged else "O"
    stanford_ann += f"{word}\t{stanford_tag}\n"

    # The span must be recorded while `pos` still points in front of the word.
    if is_tagged:
        add_spacy_ann(word, tag)

    pos += 1 + len(word)

if __name__ == "__main__":
    # Driver: open the chosen document and the three output files, run the
    # interactive annotation, then report where the results went.
    filename = doc_loc.selected
    # `selected` is the full path including any file name; `selected_path`
    # is the folder part, which is what the outputs are joined onto.
    write_dir = output_loc.selected_path
    if not filename or not write_dir:
        raise ValueError(
            "Select both a document and an output folder in the cell above "
            "before running this cell."
        )
    print(filename)

    # Base name without directory or extension, independent of the path
    # separator of the platform. Only the final extension is dropped, so
    # "notes.v2.txt" yields "notes.v2".
    stem = os.path.splitext(os.path.basename(filename))[0]

    try:
        fps["input"] = open(filename)
        fps["stanfordnlp-out"] = open(os.path.join(write_dir, stem + "-stanfordnlp.tsv"), "w+")
        fps["spacy-out"] = open(os.path.join(write_dir, stem + "-spacy.pkl"), "wb+")
        fps["rawtext-out"] = open(os.path.join(write_dir, stem + "-rawtext.txt"), "w+")
    except OSError:
        # Do not leave earlier handles open when a later open() fails.
        for handle in fps.values():
            handle.close()
        raise

    # Reuse the handle opened above instead of opening the document a second
    # time; annotate() closes everything stored in `fps`.
    annotate(fps["input"])
    clear_output(wait=True)
    print("The outputs have been saved to the output folder specified, in formats suitable for training models with SpaCy or StanfordCore NLP!")
