In [1]:
import pandas as pd
import json 
import os
import spacy
from spacy.training import Example
from spacy import displacy
import numpy as np
import random
from tqdm import tqdm
import pickle
import warnings

In [2]:
#!python -m spacy download es_core_news_sm
#!conda install -c conda-forge spacy

# Loading the datasets

In [3]:
path_reviews = os.path.join("resources", "reviews.json")
path_entities = os.path.join("resources", "entities.json")

with open(path_reviews) as json_file:
    reviews = json.load(json_file)

with open(path_entities) as json_file:
    entities = json.load(json_file)
    
df_entities = pd.DataFrame(entities)
df_reviews = pd.DataFrame(reviews)


# Converting the info in reviews.json and entities.json into a format that is ready for training with spacy

In [4]:
df_reviews.uid.nunique()

5000

In [5]:
uids_to_drop = df_reviews[~df_reviews.uid.isin(df_entities.review_uid.unique())].uid.unique()

In [6]:
df_reviews = df_reviews[df_reviews.uid.isin(df_entities.review_uid.unique())]

In [7]:
def convert_to_training_format(review_id):
    """takes the info of a single review from reviews.json and entities.json 
    and puts it in the right training format for Spacy"""
    review = df_reviews[df_reviews.uid== review_id ]
    review_body = review["body"].loc[review.index[0]]
    entities = df_entities[df_entities.review_uid ==review_id]

    entities_list  = []
    for ind,row in entities.iterrows():
        entity_tuple = row.start,row.end,row.type
        entities_list.append(entity_tuple)
    
    return (review_body,{"entities":entities_list})

In [8]:
convert_to_training_format("000010f29b5d65ad7c073acc31e327dc3ff9af54")

('Tiene gran variedad de tapas a 2,50 de gran calidad.',
 {'entities': [(6, 10, 'modifier'),
   (23, 28, 'concept'),
   (39, 43, 'modifier')]})

In [9]:
formatted_data = [convert_to_training_format(x) for x in df_reviews.uid.unique() if x not in uids_to_drop]

In [10]:
# An example of how the formatted reviews look:
formatted_data[500]

('Comimos Maragato, y eso que no estábamos muy convencidos, porque tanto mi pareja como yo no somos de cocidos, pero la verdad es que acertamos.',
 {'entities': [(101, 108, 'concept')]})

# Splitting randomly the dataset into training and test sets

In [11]:
random.seed(42)
random.shuffle(formatted_data)
training_set = formatted_data[:3000]
validation_set = formatted_data[3000:4000]
test_set = formatted_data[4000:]

# Let us save the test_set (we will use this one in executable script), training_set and validation_set  as files

In [12]:
with open(os.path.join("resources",'test_set.data'), 'wb') as filehandle:
    pickle.dump(test_set, filehandle)
    
with open(os.path.join("resources",'training_set.data'), 'wb') as filehandle:
    pickle.dump(training_set, filehandle)

with open(os.path.join("resources",'validation_set.data'), 'wb') as filehandle:
    pickle.dump(validation_set, filehandle)

# Training with spacy CLI : first a few lines of code to convert the training and validation sets into the .spacy format needeed

In [13]:
from spacy.tokens import DocBin

nlp = spacy.blank("es") # load a new spacy model
db = DocBin() # create a DocBin object

file = open(os.path.join("resources","training_set.data"),'rb')
train_data = pickle.load(file)

for text, annot in tqdm(train_data): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)

db.to_disk(os.path.join("resources","training_set.spacy"))


  9%|██████▉                                                                       | 265/3000 [00:00<00:04, 579.22it/s]

Skipping entity


 15%|███████████▊                                                                  | 454/3000 [00:00<00:04, 590.01it/s]

Skipping entity


 25%|███████████████████▋                                                          | 758/3000 [00:01<00:03, 669.64it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


 44%|█████████████████████████████████▋                                           | 1314/3000 [00:01<00:02, 713.48it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity


 51%|███████████████████████████████████████▏                                     | 1525/3000 [00:02<00:02, 543.14it/s]

Skipping entity
Skipping entity
Skipping entity


 56%|███████████████████████████████████████████▎                                 | 1687/3000 [00:02<00:03, 428.19it/s]

Skipping entity
Skipping entity


 62%|███████████████████████████████████████████████▋                             | 1860/3000 [00:03<00:02, 503.46it/s]

Skipping entity
Skipping entity


 66%|██████████████████████████████████████████████████▉                          | 1985/3000 [00:03<00:01, 551.67it/s]

Skipping entity


 77%|███████████████████████████████████████████████████████████▎                 | 2313/3000 [00:03<00:00, 689.62it/s]

Skipping entity
Skipping entity
Skipping entity


 85%|█████████████████████████████████████████████████████████████████▋           | 2559/3000 [00:04<00:00, 738.32it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity


 97%|██████████████████████████████████████████████████████████████████████████▋  | 2908/3000 [00:04<00:00, 805.87it/s]

Skipping entity


100%|█████████████████████████████████████████████████████████████████████████████| 3000/3000 [00:04<00:00, 644.52it/s]


Skipping entity


In [14]:
file = open(os.path.join("resources","validation_set.data"),'rb')
validation_data = pickle.load(file)

for text, annot in tqdm(validation_data): # data in previous format
    doc = nlp.make_doc(text) # create doc object from text
    ents = []
    for start, end, label in annot["entities"]: # add character indexes
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    doc.ents = ents # label the text with the ents
    db.add(doc)

db.to_disk(os.path.join("resources","validation_set.spacy")) # save the docbin object

 53%|█████████████████████████████████████████▎                                    | 529/1000 [00:00<00:00, 840.35it/s]

Skipping entity
Skipping entity


 86%|██████████████████████████████████████████████████████████████████▉           | 858/1000 [00:01<00:00, 650.70it/s]

Skipping entity


100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 700.85it/s]


In [15]:
model_path_cli = os.path.join("model_cli","model-best")   
if os.path.isdir(model_path_cli):
    print("trained_model already exists")
    pass
else:
    # Train the model via the Command Line Interface of spacy using the config.cfg file, 
    # the trained model is put in the model_cli folder.
    os.system("python -m spacy train config.cfg --output ./model_cli " 
              +"--paths.train resources/training_set.spacy " 
              +"--paths.dev resources/validation_set.spacy")

trained_model already exists


# Training loop in Python: Alternatively one can train the model by looping in Python 

In [16]:
# path where trained model is saved
model_path_python = os.path.join("model_python","ner")
n_epochs     = 10

In [17]:
nlp = spacy.blank('es')  
print("Created blank 'es' model")

#set up the pipeline

if 'ner' not in nlp.pipe_names:
    nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe('ner')
    
ner = nlp.create_pipe('ner')

Created blank 'es' model


In [18]:
for _, annotations in training_set:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

In [19]:
if os.path.isdir(model_path_python):
    print("trained_model already exists")
    pass
else:
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER model
        optimizer = nlp.begin_training()
        warnings.filterwarnings("ignore", category=UserWarning, module='spacy')
        for epoch in range(n_epochs):
            random.shuffle(training_set)
            losses = {}
            for text, annotations in tqdm(training_set):
                example = Example.from_dict(nlp.make_doc(text), annotations)
                nlp.update(
                    [example],  
                    drop=0.5,  
                    sgd=optimizer,
                    losses=losses)
            print(losses["ner"]/len(training_set))
        
    nlp.to_disk(model_path_python)

trained_model already exists


# Let us evaluate the performance of the NER model based on three metrics: precission, recall and f-score

In [20]:
model_path = os.path.join("model_cli","model-best")
ner_model = spacy.load(model_path)

# model evaluation via the CLI
!python -m spacy evaluate model_cli/model-best resources/validation_set.spacy

[i] Using CPU
[1m

TOK     100.00
NER P   94.57 
NER R   95.31 
NER F   94.94 
SPEED   13342 

[1m

               P       R       F
modifier   95.09   97.48   96.27
concept    94.17   93.71   93.94



# Let's see how the NER model trained via spacy's CLI works on the example review given in the test instructions:

In [21]:
review = "La paella de marisco era bastante cara, pero el servicio fue excelente."
doc = ner_model(review)
spacy.displacy.render(doc,style="ent",jupyter=True)


# A few more examples of how the model works on unseen reviews from the test set

In [22]:
for text, _ in test_set[520:540]:
    doc = ner_model(text)
    spacy.displacy.render(doc,style="ent",jupyter=True)
    print("===================================================================================================================")









































# Let's see now the model that was trained directly in python. In this case spacy only provides the NER loss on the training set; and it's somewhat more difficult because we lack built-in metrics of model performance unlike the model trained via command-line interface

In [23]:
ner_model = spacy.load(model_path_python)

# In the loop below I compare the predictions of the model to the labels of the entities in the test dataset to measure the accuracy score of the model in this case.

In [24]:
correct_predictions = 0
total_entities = 0 
for i in range(len(test_set)):
    example = test_set[i]
    review_text = example[0]
    doc = ner_model(review_text)
    entities_list_predicted= [(ent.text, ent.label_) for ent in doc.ents]
    
    spans = example[1]["entities"]
    entities_list = list()
    for t in spans:
        start = t[0]
        end   = t[1]
        label = t[2]
        tup = (review_text[start:end],t[2])
        entities_list.append(tup)
    
    entities_set_predicted = set(entities_list_predicted)
    entities_set = set(entities_list)
    correct_predictions += len(entities_set.intersection(entities_set_predicted))   
    total_entities += len(entities_set)
    

In [25]:
accuracy_score = 100*round(correct_predictions/total_entities,4)

In [26]:
accuracy_score

93.08999999999999

# Let's try with the example given in the test instructions

In [27]:
review_try = "La paella de marisco era bastante cara, pero el servicio fue excelente."
doc = ner_model(review_try)
spacy.displacy.render(doc,style="ent",jupyter=True)

# Let's see how it works on some of the reviews from the test set that was held out during the training phase (The algorithm has not previously seen these reviews)

In [28]:
for text, _ in test_set[520:540]:
    doc = ner_model(text)
    spacy.displacy.render(doc,style="ent",jupyter=True)
    print("===================================================================================================================")









































# 1.2. Does the model have the capacity to find new entities that aren’t present on the training set? If so, are they from the domain? Otherwise, what would you do to improve that ability?

In [29]:
training_entities = df_entities.term.unique()

In [30]:
"casco" in training_entities

False

In [31]:
"bicicleta" in training_entities

False

In [32]:
"avión" in training_entities

False

In [33]:
review_try = "El casco de la bicicleta fue de gran calidad"
doc = ner_model(review_try)
spacy.displacy.render(doc,style="ent",jupyter=True)

In [34]:
review_try = "El avión que tenía mi padre cuando era joven era muy rápido"
doc = ner_model(review_try)
spacy.displacy.render(doc,style="ent",jupyter=True)

I show some examples of sentences that are not related to restaurant reviews that use concepts that are not present in the training set ; such as "el casco de la bicicleta fue de gran calidad", and the model correctly recognizes "casco" and "bicicleta" as "concepts" and "gran" as a modifier, although "casco" and "bicicleta" are not found in the entities.json . Another example was the sentence  "El avión que tenía mi padre cuando era joven era muy rápido", in which the modifiers "joven" and "muy rápido" were recognized but the concept "avión" (again non-existent in the training set) was not recognized by the algorithm. So, the model does have some limited capacity to find new entities that were not present in the training set 

In [35]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.
