In [None]:
# NLP Lecture @ Strive School - 21st July 2021
# NER update

'''
Since today we are exploring the world of natural language processing, we’ll deepen in the Named Entity Recognition technique: this is just one of the mechanisms that NLP embodies. The recognition of named entities as the process of automatic identification of the entities present in a text and consequent classification into predefined categories such as "person", "organization", "position" is a quite common activity and expect for English, trained models with spaCy offer few labels that could be improved through training.

Following the case study of this morning, try to emulate it in order to label all the brands present in the provided datasets, choosing the one you prefer OR trying to label all them and to train the model to recognize new different entities. The result should be twofold: the final model should be able to recognize brands that it has already seen, but already new ones.
The brands proposed in the dataset concern fashion, cars and food.
In order to test the accuracy of the model, test it with sentences and brands the model has never seen.

Sample of the dataset
---------------------
- Cate Blanchett in Armani Privé. Rating: 8. Concludes as a rare butterfly, or from Rorschach's Test, or from computerized axial tomography.
- I liked everything, recommend it! Another quality Xiaomi product...
- What is the price of that Fiat 500XL?

Info:
- Feel free to change or arrange a new dataset
- Try experimenting and tuning with the hyperparameters
- Feel free to use or change the code you've seen during the morning session
- TBD = To be done (from you!) :)

'''

In [4]:
# STEP 0 - PRE REQUISITES

# python -m spacy download en_core_web_lg

# TBD: Import libraries
import spacy
import random
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.training import Example

# TBD: Load preferred model
nlp= spacy.load('en_core_web_lg')

with open("food.txt") as file:
    dataset = file.read()

# TBD: Load the dataset and test it as-is

In [5]:
doc = nlp(dataset)
print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])

Entities: [('Italian', 'NORP'), ('first', 'ORDINAL'), ('Arrosticini', 'PERSON'), ('Alfredo', 'PERSON'), ('Naples', 'GPE'), ('Zaza', 'PERSON'), ('ApplePie', 'ORG'), ('Bologna', 'GPE'), ('Fiorentina Steak', 'PERSON'), ('Pineapple', 'ORG'), ('First', 'ORDINAL'), ('Bronte', 'PERSON'), ('Coca-cola', 'ORG'), ('Fanta', 'ORG'), ('Pepsi', 'ORG'), ('One', 'CARDINAL'), ('Sorrento', 'GPE'), ('Coffee\nBread', 'ORG'), ('Love', 'WORK_OF_ART'), ('Fatte', 'PERSON'), ('vedrai che il mondo poi ti', 'PERSON'), ('Pastiera', 'PERSON'), ('the United States', 'GPE'), ('Two', 'CARDINAL'), ('two', 'CARDINAL'), ('24 hours', 'TIME'), ('24', 'CARDINAL'), ('two hours', 'TIME')]


In [6]:
# STEP 1 - TRAIN DATA

# Prepare training data

# TBD: define all the entities by extracting the words and their indexes from the dataset
# expected format is the following:  ("sentence", {"entities": [0,10, "FOOD"]})


words = ['Ketchup','pasta','carrot','pizza','garlic','tomato sauce','basil',
         'carbonara','eggs','cheek fat','pan cakes','parmigiana','eggplant',
        'fettucine','heavy cream','polenta','risotto','espresso','arrosticini','spaghetti',
         'fiorentina steak','pecorino','maccherone',
        'neutella','amero','pistachio','coca-cola','wine','pastiera','watermelon','cappuccino',
        'ice cream','soup','lemon','chocolate',"pineapple"]

train_data = []

with open('food.txt') as file:
    dataset = file.readlines()
    for sentence in dataset:
        print('********')
        print('sentences:', sentence)
        print('********')
        sentence= sentence.lower()
        entities= []
        for word in words:
            word= word.lower()
            if word in sentence:
                start_index= sentence.index(word)
                end_index = len(word)+ start_index
                print('word:', word)
                print('************')
                print('start_index:', start_index)
                print('end_index:', end_index)
                pos = (start_index, end_index, 'FOOD')
                entities.append(pos)
        element = (sentence.rstrip('\n'), {'entities': entities})
        train_data.append(element)
        print('********')
        print('element:', element)
        print('********')

********
sentences: Give me carrot cake.

********
word: carrot
************
start_index: 8
end_index: 14
********
element: ('give me carrot cake.', {'entities': [(8, 14, 'FOOD')]})
********
********
sentences: I love simple pizza margherita with tomato sauce and mozzarella.

********
word: pizza
************
start_index: 14
end_index: 19
word: tomato sauce
************
start_index: 36
end_index: 48
********
element: ('i love simple pizza margherita with tomato sauce and mozzarella.', {'entities': [(14, 19, 'FOOD'), (36, 48, 'FOOD')]})
********
********
sentences: I don't like pizza with garlic!

********
word: pizza
************
start_index: 13
end_index: 18
word: garlic
************
start_index: 24
end_index: 30
********
element: ("i don't like pizza with garlic!", {'entities': [(13, 18, 'FOOD'), (24, 30, 'FOOD')]})
********
********
sentences: I like pasta with homemade tomato sauce and basil.

********
word: pasta
************
start_index: 7
end_index: 12
word: tomato sauce
*******

element: ('the best pistachio comes from bronte', {'entities': [(9, 18, 'FOOD')]})
********
********
sentences: Who doesn't like chocolate has something to hide

********
word: chocolate
************
start_index: 17
end_index: 26
********
element: ("who doesn't like chocolate has something to hide", {'entities': [(17, 26, 'FOOD')]})
********
********
sentences: Coca-cola, Fanta or Pepsi?

********
word: coca-cola
************
start_index: 0
end_index: 9
********
element: ('coca-cola, fanta or pepsi?', {'entities': [(0, 9, 'FOOD')]})
********
********
sentences: If it's not soup, it's wet bread

********
word: soup
************
start_index: 12
end_index: 16
********
element: ("if it's not soup, it's wet bread", {'entities': [(12, 16, 'FOOD')]})
********
********
sentences: Garlic is good for blood pressure

********
word: garlic
************
start_index: 0
end_index: 6
********
element: ('garlic is good for blood pressure', {'entities': [(0, 6, 'FOOD')]})
********
********
sentences: Wa

In [None]:
# STEP 1 - TRAIN DATA

# Prepare training data

# TBD: define all the entities by extracting the words and their indexes from the dataset
# expected format is the following:  ("sentence", {"entities": [0,10, "FOOD"]})


words = [food=['Ketchup','pasta','carrot','pizza','garlic','tomato sauce','basil',
         'carbonara','eggs','cheek fat','pan cakes','parmigiana','eggplant',
        'fettucine','heavy cream','polenta','risotto','espresso','arrosticini','spaghetti',
         'fiorentina steak','pecorino','maccherone',
        'neutella','amero','pistachio','coca-cola','wine','pastiera','watermelon','cappuccino',
        'ice cream','soup','lemon','chocolate',"pineapple"],
         Organisation=['samsung','apple','blackberry','xaiomi','huawei','iphone','nokia','redmi','sony',
                      'lg',]
         

train_data = []

with open('food.txt') as file:
    dataset = file.readlines()
    for sentence in dataset:
        print('********')
        print('sentences:', sentence)
        print('********')
        sentence= sentence.lower()
        entities= []
        for unit in words:
         for word in unit:
                 
            word= word.lower()
            if word in sentence:
                start_index= sentence.index(word)
                end_index = len(word)+ start_index
                print('word:', word)
                print('************')
                print('start_index:', start_index)
                print('end_index:', end_index)
                pos = (start_index, end_index, 'unit')
                entities.append(pos)
        element = (sentence.rstrip('\n'), {'entities': entities})
        train_data.append(element)
        print('********')
        print('element:', element)
        print('********')

In [None]:
         
        
# STEP 2 - UPDATE MODEL

# TBD: load the needed pipeline
# ner = nlp.get_pipe('ner')
# for _, annotations in train_data:
#     for ent in annotations.get('entities'):
#         ner.add_label(ent[2])
        
# ner.add_label('FOOD')

# TBD: define the annotations

# TBD: train the model



pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]



# TBD: define the number of iterations, the batch size and the drop according to your experience or using an empirical value
# Train model
with nlp.disable_pipes(*unaffected_pipes):
    for iteration in range(30):
        print("Iteration #" + str(iteration))

        # Data shuffle for each iteration
        random.shuffle(train_data)
        losses = {}
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        for batch in spacy.util.minibatch(train_data, size=3):
            for text, annotations in batch:
                # Create an Example object
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                # Update the model
                nlp.update([example], losses=losses, drop=0.1)
        print("Losses:", losses)

# Save the model
output_dir= path('/ner/')
nlp.to_disk(output_dir)
print('saved')
# TBD:

Iteration #0


In [None]:
# STEP 3 - TEST THE UPDATED MODEL

# Load updated model
nlp_updated = spacy.load(output_dir)

# TBD: test with a old sentence
doc= nlp_updated('Alfredo did not invent any pasta!')
print("entities:", [(ent.text, ent.label_) for ent in doc.ents])

# TBD: test with a new sentence and an old brand
doc= nlp_updated('I dont like spaghetti with mayo')
print("entities:", [(ent.text, ent.label_) for ent in doc.ents])

# TBD: test with a new sentence and a new brand
doc= nlp_updated('burger with cheese is good')
print("entities:", [(ent.text, ent.label_) for ent in doc.ents])