<a href="https://colab.research.google.com/github/marcelo-morales/russell-csie/blob/main/NER_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spacy NER Demo
Caveats: The entity labels are not custom. We are repurposing some of spaCy's predefined labels.

Thus for our purposes:

| Predefined Label  | Corresponds to our label 
| --- | --- |
ORG | glasses
DATE | hair_color
NORP | hat_color
GPE | hat
LAW | bald

Repurposing tutorial from: https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/



In [1]:
# Load pre-existing spacy model
import spacy
nlp = spacy.load("en_core_web_sm")

# Fetch the pipeline's NER component; new labels are registered on it below
ner = nlp.get_pipe("ner")

### Example from tutorial

In [None]:
# training data
# Each example is (text, {"entities": [(start_char, end_char, label)]}).
# end_char is exclusive, so text[start_char:end_char] must be exactly the
# annotated word; offsets that cut through a word or run past the end of the
# text cannot be aligned to tokens.
TRAIN_DATA = [
    ("Walmart is a leading e-commerce company", {"entities": [(0, 7, "ORG")]}),
    ("I reached Chennai yesterday.", {"entities": [(10, 17, "GPE")]}),
    ("I recently ordered a book from Amazon", {"entities": [(31, 37, "ORG")]}),
    ("I was driving a BMW", {"entities": [(16, 19, "PRODUCT")]}),
    ("I ordered this from ShopClues", {"entities": [(20, 29, "ORG")]}),
    ("Fridge can be ordered in Amazon ", {"entities": [(0, 6, "PRODUCT")]}),
    ("I bought a new Washer", {"entities": [(15, 21, "PRODUCT")]}),
    ("I bought a old table", {"entities": [(15, 20, "PRODUCT")]}),
    ("I bought a fancy dress", {"entities": [(17, 22, "PRODUCT")]}),
    ("I rented a camera", {"entities": [(11, 17, "PRODUCT")]}),
    ("I rented a tent for our trip", {"entities": [(11, 15, "PRODUCT")]}),
    ("I rented a screwdriver from our neighbour", {"entities": [(11, 22, "PRODUCT")]}),
    ("I repaired my computer", {"entities": [(14, 22, "PRODUCT")]}),
    ("I got my clock fixed", {"entities": [(9, 14, "PRODUCT")]}),
    ("I got my truck fixed", {"entities": [(9, 14, "PRODUCT")]}),
    ("Flipkart started it's journey from zero", {"entities": [(0, 8, "ORG")]}),
    ("I recently ordered from Max", {"entities": [(24, 27, "ORG")]}),
    ("Flipkart is recognized as leader in market", {"entities": [(0, 8, "ORG")]}),
    ("I recently ordered from Swiggy", {"entities": [(24, 30, "ORG")]}),
]

# Adding labels to the `ner`
# Every label that appears in TRAIN_DATA is registered on the NER component.
for _, annotations in TRAIN_DATA:
    for entity in annotations.get("entities"):
        label = entity[2]  # entity is (start_char, end_char, label)
        ner.add_label(label)

# Disable pipeline components you don't need to change:
# every pipe whose name is not in `pipe_exceptions` is collected here.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [name for name in nlp.pipe_names if name not in pipe_exceptions]

### Our Training Data - Defining the entity locations using char offsets

In [2]:
# training data
# Predefined spaCy labels are repurposed for our traits:
## ORG - glasses
## DATE - hair_color
## NORP - hat_color
## GPE - hat
## LAW - bald
#
# Each example is (text, {"entities": [(start_char, end_char, label)]}).
# end_char is exclusive, so text[start_char:end_char] must be exactly the
# annotated phrase (no trailing punctuation, never past the end of the text).
TRAIN_DATA = [
    # glasses
    ("Is your person wearing glasses?", {"entities": [(23, 30, "ORG")]}),
    ("Do they have glasses?", {"entities": [(13, 20, "ORG")]}),
    ("Does your person have glasses on?", {"entities": [(22, 29, "ORG")]}),

    ("Is your person four-eyed?", {"entities": [(15, 24, "ORG")]}),
    ("Is she four-eyed?", {"entities": [(7, 16, "ORG")]}),

    # hair_color
    ("Is she blond?", {"entities": [(7, 12, "DATE")]}),
    ("Is your person blond-haired?", {"entities": [(15, 20, "DATE")]}),
    ("Is he golden-haired?", {"entities": [(6, 12, "DATE")]}),
    ("Are they gold-haired?", {"entities": [(9, 13, "DATE")]}),
    ("Does your person have yellow hair?", {"entities": [(22, 28, "DATE")]}),
    ("Are they auburn-haired?", {"entities": [(9, 15, "DATE")]}),
    # was (17, 24), which also captured the trailing "?"
    ("Is your person a ginger?", {"entities": [(17, 23, "DATE")]}),

    # hat_color
    ("Is your person wearing a green hat?", {"entities": [(25, 30, "NORP")]}),
    ("Are they wearing a green hat?", {"entities": [(19, 24, "NORP")]}),
    ("Does your person have a green hat?", {"entities": [(24, 29, "NORP")]}),
    ("Do they have a green hat?", {"entities": [(15, 20, "NORP")]}),

    # hat
    ("Does your person wear a hat?", {"entities": [(24, 27, "GPE")]}),
    ("Do they wear a hat?", {"entities": [(15, 18, "GPE")]}),
    ("Do they have a hat?", {"entities": [(15, 18, "GPE")]}),

    # bald
    ("Does your person not have head hair?", {"entities": [(17, 35, "LAW")]}),
    ("Is your person bald?", {"entities": [(15, 19, "LAW")]}),
    # was (7, 13), which ran past the end of the 12-character text
    ("Is she bald?", {"entities": [(7, 11, "LAW")]}),
    ("Is he bald?", {"entities": [(6, 10, "LAW")]}),
    ("Does he not have head hair?", {"entities": [(8, 26, "LAW")]}),
    ("Does he not have hair on his head?", {"entities": [(8, 33, "LAW")]}),
    # ("Walmart is a leading e-commerce company", {"entities": [(0, 7, "ORG")]})
]

# Adding labels to the `ner`
# Every label that appears in TRAIN_DATA is registered on the NER component.
for _, annotations in TRAIN_DATA:
    for entity in annotations.get("entities"):
        label = entity[2]  # entity is (start_char, end_char, label)
        ner.add_label(label)

# Disable pipeline components you don't need to change:
# every pipe whose name is not in `pipe_exceptions` is collected here.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [name for name in nlp.pipe_names if name not in pipe_exceptions]

### Training the model - Losses printed

In [3]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path

# TRAINING THE MODEL
# The pipes listed in `unaffected_pipes` are disabled for the duration of the block.
with nlp.disable_pipes(*unaffected_pipes):
    # Training for 30 iterations
    for iteration in range(30):
        # Shuffle the examples before every iteration
        random.shuffle(TRAIN_DATA)
        losses = {}

        # Batch up the examples using spaCy's minibatch; the batch size
        # compounds from 4.0 towards 32.0 by a factor of 1.001 per batch
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for batch in minibatch(TRAIN_DATA, size=batch_sizes):
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                drop=0.5,  # dropout - make it harder to memorise data
                losses=losses,
            )

        # One loss line per iteration
        print("Losses", losses)

Losses {'ner': 60.78092178907338}
Losses {'ner': 57.168306567298714}
Losses {'ner': 68.30356031383872}
Losses {'ner': 66.26109631008876}
Losses {'ner': 57.86614957496079}
Losses {'ner': 53.14728559450036}
Losses {'ner': 66.9493131988711}
Losses {'ner': 46.983277083960076}
Losses {'ner': 48.22391544936295}
Losses {'ner': 43.01519218170206}
Losses {'ner': 50.54340357534022}
Losses {'ner': 38.87983047587146}
Losses {'ner': 32.00890809057637}
Losses {'ner': 32.32324995709171}
Losses {'ner': 22.396071723321256}
Losses {'ner': 30.630119889849766}
Losses {'ner': 31.20321326529482}
Losses {'ner': 26.473462673157854}
Losses {'ner': 36.792397848896954}
Losses {'ner': 26.155820381203057}
Losses {'ner': 26.946532395624963}
Losses {'ner': 31.71347818613943}
Losses {'ner': 19.43176520472117}
Losses {'ner': 17.79637048298499}
Losses {'ner': 21.226841466851738}
Losses {'ner': 24.72095092418546}
Losses {'ner': 14.20075231124407}
Losses {'ner': 21.44121767107182}
Losses {'ner': 19.694773515577243}
Losse

### Tutorial Example

In [None]:
# Testing the model
from spacy import displacy

doc = nlp("I was driving a Alto")
entities = [(ent.text, ent.label_) for ent in doc.ents]
print("Entities", entities)

# Show each predicted entity with its character span and label
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

# Inline visualisation of the entities in the notebook
displacy.render(doc, style="ent", jupyter=True)

Entities [('Alto', 'PRODUCT')]
Alto 16 20 PRODUCT


### Testing on audio transcription input

In [5]:
## SPEECH TO TEXT TRANSCRIPTION CODE HERE
## Placeholder: a hard-coded sentence stands in for the speech-to-text output.

transcription = "Do they have blond hair and purple glasses?"

In [7]:
# Testing the model
from spacy import displacy

doc = nlp(transcription)
entities = [(ent.text, ent.label_) for ent in doc.ents]
print("Entities", entities)

# Uncomment to also print each entity's character span:
# for ent in doc.ents:
#     print(ent.text, ent.start_char, ent.end_char, ent.label_)

# Inline visualisation of the entities in the notebook
displacy.render(doc, style="ent", jupyter=True)

## ent.text ('blond') and ent.label_ ('DATE', which will ultimately be 'hair_color')
## will then be sent to the game backend to check the guess. Sketch:
##
## guess_trait = ent.label
## guess_adj = ent.text
## if guess_trait == guess_adj:
##		return affirmative_response

Entities [('blond', 'DATE')]



| Predefined Label  | Corresponds to our label 
| --- | --- |
ORG | glasses
DATE | hair_color
NORP | hat_color
GPE | hat
LAW | bald

In [None]:
# Save the model to directory
output_dir = Path("/content/")
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

# Load the saved model and predict
print("Loading from", output_dir)
nlp_updated = spacy.load(output_dir)

doc = nlp_updated("Fridge can be ordered in FlipKart")
entities = [(ent.text, ent.label_) for ent in doc.ents]
print("Entities", entities)