In [7]:
#Import all the necessary libraries and files
import spacy
import random
from spacy.tokens import Doc
from spacy.training import Example
from spacy.language import Language
from json_parser import *

In [8]:
def print_doc_entities(_doc: Doc):
    """Print every entity found in an already-processed document.

    Each entity is written on its own line as ``<entity text> <entity label>``.
    When the document carries no entities at all, the single word ``NONE``
    is printed instead. Nothing is returned.
    """
    # Guard clause: handle the "no entities" case first and bail out.
    if not _doc.ents:
        print("NONE")
        return

    for entity in _doc.ents:
        print(f"{entity.text} {entity.label_}")

In [11]:
def customizing_pipeline_component(
    nlp: Language,
    data_dir: str = '../FOOD_entity_custom_NER/data/json_files/',
    output_dir: str = "../FOOD_entity_custom_NER/output/",
    n_epochs: int = 30,
):
    """Update the ``ner`` component of ``nlp`` with custom examples and save the pipeline.

    Steps, in order:
      1. Load the training examples with ``get_data(data_dir)``.
      2. Print the labels known to the ``ner`` component before training.
      3. Run ``n_epochs`` passes over the shuffled examples, updating ``nlp``
         one example at a time while every pipe except ``ner`` is disabled.
      4. Print the losses of the last epoch and the labels after training.
      5. Write the pipeline to ``output_dir`` and print the entities found in
         a fixed sample sentence.

    Args:
        nlp: The spaCy pipeline to update; it is modified in place and must
            contain a component named ``ner``.
        data_dir: Location handed to ``get_data``.
            NOTE(review): ``get_data`` is assumed to return a mutable list of
            ``(text, entity_offsets)`` pairs -- confirm against json_parser.
        output_dir: Directory the updated pipeline is written to.
        n_epochs: Number of passes over the training data.

    Returns:
        None. Results are reported via ``print`` and the saved pipeline.
    """
    train_data = get_data(data_dir)  # calling the train data

    print("entities before training the model")
    print(nlp.get_pipe("ner").labels)

    # Bound up front so the "Loss" report below works even when n_epochs is 0.
    losses = {}

    # Train 'ner' only. Used as a context manager, select_pipes restores the
    # other pipes when the block exits -- including when training raises --
    # so the caller's pipeline is never left half-disabled.
    with nlp.select_pipes(enable="ner"):
        optimizer = nlp.create_optimizer()  # optimizer initialization

        for _ in range(n_epochs):
            random.shuffle(train_data)
            losses = {}  # reset per epoch: only the last epoch's loss is reported
            for raw_text, entity_offsets in train_data:
                doc = nlp.make_doc(raw_text)
                # Training via the Python API:
                # https://spacy.io/usage/v3#migrating-training-python
                example = Example.from_dict(doc, {"entities": entity_offsets})
                nlp.update([example], sgd=optimizer, losses=losses)
    print("Loss", losses)

    # This dump happens after training, so label it accordingly.
    print("entities after training the model")
    print(nlp.get_pipe("ner").labels)

    # Save only once all pipes are enabled again.
    nlp.to_disk(output_dir)  # saving the new model to disk

    # Result after training on test data
    print('Test Data using the trained model')
    doc = nlp('Sebastian Thrun loves to eat chicken and salad at Google in 2017')
    print(doc)
    print_doc_entities(doc)

In [12]:
def main():
    """Load the ``en_core_web_sm`` pipeline and run the custom NER training on it."""
    customizing_pipeline_component(spacy.load('en_core_web_sm'))


if __name__ == "__main__":
    main()

entities before training the model
('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')
Loss {'ner': 0.6412596824195579}
entities before training the model
('CARDINAL', 'DATE', 'EVENT', 'FAC', 'FOOD', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART')
test data using the trained model
Sebastian Thrun loves to eat chicken and salad at Google in 2017
Sebastian Thrun PERSON
chicken FOOD
salad FOOD
2017 DATE
