<a href="https://colab.research.google.com/github/marcelo-morales/russell-csie/blob/main/Updated_NER_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spacy NER Demo
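Caveats: Entity labels are not custom. We are repurposing some of the predefined ones. (Note: the training data cell further down currently passes our own label names, e.g. `glasses` and `hair_color`, to the model directly.)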

Thus for our purposes:

| Predefined Label  | Corresponds to our label 
| --- | --- |
ORG | glasses
DATE | hair_color
NORP | hat_color
GPE | hat
LAW | bald

Repurposing tutorial from: https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/



In [12]:
# Build the pipeline: a blank English model with a fresh, untrained NER component.
# NOTE: `create_pipe` followed by `add_pipe(component_object)` is the spaCy 2.x
# way of adding a component, so this notebook expects a spaCy 2.x install.
# (Alternative, not used here: start from a pretrained model with
#  `spacy.load('en_core_web_sm')` and fetch its existing "ner" pipe.)
import spacy

nlp = spacy.blank('en')  # create blank Language class
print("Created blank 'en' model")

# Reuse the NER pipe when the pipeline already has one, otherwise create and register it
if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe(ner)
    print("created ner")
else:
    ner = nlp.get_pipe('ner')

Created blank 'en' model
created ner


### Example from tutorial

In [None]:
# Training data from the tutorial.
# Each item is (text, {"entities": [(start, end, label)]}) where start/end are
# character offsets into `text` with `end` exclusive, i.e. text[start:end] is
# exactly the entity string.
TRAIN_DATA = [
    ("Walmart is a leading e-commerce company", {"entities": [(0, 7, "ORG")]}),
    ("I reached Chennai yesterday.", {"entities": [(10, 17, "GPE")]}),
    ("I recently ordered a book from Amazon", {"entities": [(31, 37, "ORG")]}),
    ("I was driving a BMW", {"entities": [(16, 19, "PRODUCT")]}),
    ("I ordered this from ShopClues", {"entities": [(20, 29, "ORG")]}),
    ("Fridge can be ordered in Amazon ", {"entities": [(0, 6, "PRODUCT")]}),
    ("I bought a new Washer", {"entities": [(15, 21, "PRODUCT")]}),
    ("I bought a old table", {"entities": [(15, 20, "PRODUCT")]}),
    ("I bought a fancy dress", {"entities": [(17, 22, "PRODUCT")]}),
    ("I rented a camera", {"entities": [(11, 17, "PRODUCT")]}),
    ("I rented a tent for our trip", {"entities": [(11, 15, "PRODUCT")]}),
    ("I rented a screwdriver from our neighbour", {"entities": [(11, 22, "PRODUCT")]}),
    ("I repaired my computer", {"entities": [(14, 22, "PRODUCT")]}),
    ("I got my clock fixed", {"entities": [(9, 14, "PRODUCT")]}),
    ("I got my truck fixed", {"entities": [(9, 14, "PRODUCT")]}),
    ("Flipkart started it's journey from zero", {"entities": [(0, 8, "ORG")]}),
    ("I recently ordered from Max", {"entities": [(24, 27, "ORG")]}),
    ("Flipkart is recognized as leader in market", {"entities": [(0, 8, "ORG")]}),
    ("I recently ordered from Swiggy", {"entities": [(24, 30, "ORG")]}),
]

# Sanity-check the hand-written offsets: each span must lie inside its text and
# start/end on a word boundary, so an off-by-one is caught here instead of
# silently producing a bad training example.
for text, annotations in TRAIN_DATA:
    for start, end, label in annotations["entities"]:
        assert 0 <= start < end <= len(text), (text, start, end, label)
        span = text[start:end]
        assert span[0].isalnum() and span[-1].isalnum(), (text, span, label)
        assert start == 0 or not text[start - 1].isalnum(), (text, span, label)
        assert end == len(text) or not text[end].isalnum(), (text, span, label)

# Adding labels to the `ner`
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Disable pipeline components you don't need to change:
# every pipe that is not listed in `pipe_exceptions` is left out of training.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

### Our Training Data - Defining the entity locations using char offsets

In [17]:
# Training data for the guessing-game questions.
# Labels used below: glasses, hair_color, hat_color, hat, bald.
# Each item is (text, {"entities": [(start, end, label)]}) where start/end are
# character offsets into `text` with `end` exclusive, i.e. text[start:end] is
# exactly the entity string (no trailing punctuation).
TRAIN_DATA = [
    # glasses
    ("Is your person wearing glasses?", {"entities": [(23, 30, "glasses")]}),
    ("Do they have glasses?", {"entities": [(13, 20, "glasses")]}),
    ("Does your person have glasses on?", {"entities": [(22, 29, "glasses")]}),
    ("Is your person four-eyed?", {"entities": [(15, 24, "glasses")]}),
    ("Is she four-eyed?", {"entities": [(7, 16, "glasses")]}),

    # hair_color
    ("Is she blond?", {"entities": [(7, 12, "hair_color")]}),
    ("Is your person blond-haired?", {"entities": [(15, 20, "hair_color")]}),
    ("Is he golden-haired?", {"entities": [(6, 12, "hair_color")]}),
    ("Are they gold-haired?", {"entities": [(9, 13, "hair_color")]}),
    ("Does your person have yellow hair?", {"entities": [(22, 28, "hair_color")]}),
    ("Are they auburn-haired?", {"entities": [(9, 15, "hair_color")]}),
    ("Is your person a ginger?", {"entities": [(17, 23, "hair_color")]}),

    # hat_color
    ("Is your person wearing a green hat?", {"entities": [(25, 30, "hat_color")]}),
    ("Are they wearing a green hat?", {"entities": [(19, 24, "hat_color")]}),
    ("Does your person have a green hat?", {"entities": [(24, 29, "hat_color")]}),
    ("Do they have a green hat?", {"entities": [(15, 20, "hat_color")]}),

    # hat
    ("Does your person wear a hat?", {"entities": [(24, 27, "hat")]}),
    ("Do they wear a hat?", {"entities": [(15, 18, "hat")]}),
    ("Do they have a hat?", {"entities": [(15, 18, "hat")]}),

    # bald
    ("Does your person not have head hair?", {"entities": [(17, 35, "bald")]}),
    ("Is your person bald?", {"entities": [(15, 19, "bald")]}),
    ("Is she bald?", {"entities": [(7, 11, "bald")]}),
    ("Is he bald?", {"entities": [(6, 10, "bald")]}),
    ("Does he not have head hair?", {"entities": [(8, 26, "bald")]}),
    ("Does he not have hair on his head?", {"entities": [(8, 33, "bald")]}),
]

# Sanity-check the hand-written offsets: each span must lie inside its text and
# start/end on a word boundary, so an off-by-one is caught here instead of
# silently producing a bad training example.
for text, annotations in TRAIN_DATA:
    for start, end, label in annotations["entities"]:
        assert 0 <= start < end <= len(text), (text, start, end, label)
        span = text[start:end]
        assert span[0].isalnum() and span[-1].isalnum(), (text, span, label)
        assert start == 0 or not text[start - 1].isalnum(), (text, span, label)
        assert end == len(text) or not text[end].isalnum(), (text, span, label)

# Adding labels to the `ner`
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# Disable pipeline components you don't need to change:
# every pipe that is not listed in `pipe_exceptions` is left out of training.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

### Training the model - Losses printed

In [18]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path

# Training settings, named so they are easy to find and tune
N_ITERATIONS = 30  # number of passes over TRAIN_DATA
DROPOUT = 0.5      # dropout - make it harder to memorise data
RANDOM_SEED = 0    # fixes the shuffle order so reruns batch the data the same way

random.seed(RANDOM_SEED)
optimizer = nlp.begin_training()

# TRAINING THE MODEL
# Pipes listed in `unaffected_pipes` are disabled for the duration of the block.
with nlp.disable_pipes(*unaffected_pipes):
    for iteration in range(N_ITERATIONS):
        # Shuffle a copy before every iteration so TRAIN_DATA itself keeps its
        # original order and re-running this cell starts from the same state.
        examples = list(TRAIN_DATA)
        random.shuffle(examples)
        losses = {}

        # batch up the examples using spaCy's minibatch
        batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,          # batch of texts
                annotations,    # batch of annotations
                sgd=optimizer,  # the optimizer returned by begin_training()
                drop=DROPOUT,
                losses=losses,  # accumulated per component over this iteration
            )
        print("Losses", losses)

Losses {'ner': 116.55293642322567}
Losses {'ner': 109.071293592453}
Losses {'ner': 84.44517185614677}
Losses {'ner': 84.12408847742176}
Losses {'ner': 93.11449831812318}
Losses {'ner': 94.15630834626356}
Losses {'ner': 89.92942654829511}
Losses {'ner': 98.1499682540962}
Losses {'ner': 103.59412354729614}
Losses {'ner': 96.90781155463628}
Losses {'ner': 85.2396923679725}
Losses {'ner': 83.30346892462383}
Losses {'ner': 83.66889442287788}
Losses {'ner': 96.42794564292058}
Losses {'ner': 99.35342358176422}
Losses {'ner': 85.59801035095006}
Losses {'ner': 86.15292392495667}
Losses {'ner': 89.57440308530158}
Losses {'ner': 92.24757583745647}
Losses {'ner': 94.03014397621155}
Losses {'ner': 97.00736502019572}
Losses {'ner': 81.8951233083732}
Losses {'ner': 82.82368475520798}
Losses {'ner': 81.91015132970522}
Losses {'ner': 89.84439977835905}
Losses {'ner': 83.46697818202665}
Losses {'ner': 98.10295683896561}
Losses {'ner': 87.71244052785372}
Losses {'ner': 89.16375387860899}
Losses {'ner': 8

### Tutorial Example

In [None]:
from spacy import displacy

# Run the pipeline on a tutorial-style sentence
doc = nlp("I was driving a Alto")
found_entities = [(span.text, span.label_) for span in doc.ents]
print("Entities", found_entities)

# One line per entity: text, start offset, end offset, label
for ent in doc.ents:
    print("{} {} {} {}".format(ent.text, ent.start_char, ent.end_char, ent.label_))

# Inline visualisation of the entities in the notebook
displacy.render(doc, style='ent', jupyter=True)

Entities [('Alto', 'PRODUCT')]
Alto 16 20 PRODUCT


### Testing on audio transcription input

In [None]:
#pip install PyAudio-0.2.11-cp38-cp38-win_amd64.whl


/bin/bash: py: command not found


In [None]:
!apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg


Reading package lists... Done
Building dependency tree       
Reading state information... Done
libportaudio2 is already the newest version (19.6.0-1).
libportaudiocpp0 is already the newest version (19.6.0-1).
portaudio19-dev is already the newest version (19.6.0-1).
libasound2-dev is already the newest version (1.1.3-5ubuntu0.6).
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.


In [None]:
!pip install pyaudio
!pip install SpeechRecognition




In [None]:
## SPEECH TO TEXT TRANSCRIPTION CODE HERE
import speech_recognition as sr
import time 
#from gensim.parsing.preprocessing import remove_stopwords
#look into finding a way to remove stop words without anaconda, installation issues
#without using filtering out words library, i still dont catch umms and filler words, so 
#can just keep it like this?

#using speech_recognition library
#tutorial: https://realpython.com/python-speech-recognition/

#input as a string
#microphone low
def recognize_speech(recognizer, microphone):
    """Record one utterance from `microphone` and transcribe it.

    Args:
        recognizer: an `sr.Recognizer` instance.
        microphone: an `sr.Microphone` instance used as the audio source.

    Returns:
        dict with three keys:
          "success"       - False only when the recognition request failed
                            (`sr.RequestError`), True otherwise.
          "error"         - None, or a short message describing what went wrong.
          "transcription" - the text returned by `recognize_google`, or None.

    Raises:
        TypeError: if either argument is not of the expected type.
    """
    # Validate both arguments up front, recognizer first
    expected_types = (
        (recognizer, sr.Recognizer, "`recognizer` must be `Recognizer` instance"),
        (microphone, sr.Microphone, "`microphone` must be `Microphone` instance"),
    )
    for value, required_type, message in expected_types:
        if not isinstance(value, required_type):
            raise TypeError(message)

    # Calibrate against ambient noise, then capture audio from the microphone
    with microphone as source:
        recognizer.adjust_for_ambient_noise(source)
        audio = recognizer.listen(source)

    succeeded = True
    failure_reason = None
    text = None
    try:
        text = recognizer.recognize_google(audio)
    except sr.RequestError:
        # API was unreachable or unresponsive
        succeeded = False
        failure_reason = "API unavailable"
    except sr.UnknownValueError:
        # speech was unintelligible; this is not counted as a failed request
        failure_reason = "Unable to recognize speech"

    return {
        "success": succeeded,
        "error": failure_reason,
        "transcription": text,
    }


if __name__ == "__main__":
    # Interactive driver: prompt the user, listen on the microphone and print
    # what was understood.
    recognizer = sr.Recognizer()
    microphone = sr.Microphone()

    # NOTE: stop-word filtering is currently disabled, so no word list is
    # printed after this message.
    print("these are the stopwords i will use \n")

    instruction = "ask me question based on a specific attribute for my character"
    print(instruction)
    time.sleep(1)

    PROMPT_LIMIT = 1  # number of times a user is allowed to speak to microphone

    # Default result so the prints below still work if no attempt is made
    response_from_user = {
        "success": False,
        "error": "No prompt attempted",
        "transcription": None,
    }
    for attempt in range(PROMPT_LIMIT):
        response_from_user = recognize_speech(recognizer, microphone)

        # Stop once something was transcribed, or when the request itself
        # failed (asking again would not help). Only an unintelligible
        # recording falls through to the re-prompt below.
        if response_from_user["transcription"] or not response_from_user["success"]:
            break
        print("I didn't catch that. What did you say?\n")

    # Surface the failure reason instead of only printing "You said: None"
    if response_from_user["error"]:
        print("ERROR: {}".format(response_from_user["error"]))

    print("You said: {}".format(response_from_user["transcription"]))

    # TODO: stop-word filtering (tokenize the transcription and drop stop
    # words) is not implemented yet; the transcription is used as-is.

    print("these are all the microphone inputs I can find " + str(sr.Microphone.list_microphone_names()))


## Test Model

In [21]:
from spacy import displacy

# Sample question standing in for a microphone transcription
transcription = "Do they have blond hair and red glasses?"

# Run the trained pipeline over the question and list what it found
doc = nlp(transcription)
detected_entities = [(span.text, span.label_) for span in doc.ents]
print("Entities", detected_entities)

# Per-entity character offsets (disabled):
# for ent in doc.ents:
#     print(ent.text, ent.start_char, ent.end_char, ent.label_)

# Inline visualisation of the entities in the notebook
displacy.render(doc, style='ent', jupyter=True)

# Planned hand-off to the game backend (pseudo-code, not implemented here):
# ent.text (e.g. 'blond') and ent.label_ (e.g. 'hair_color') will be sent to
# the game backend to check
#
#   guess_trait = ent.label_
#   guess_adj = ent.text
#   if guess_trait == guess_adj:
#       return affirmative_response

Entities [('blond', 'hair_color')]



| Predefined Label  | Corresponds to our label 
| --- | --- |
ORG | glasses
DATE | hair_color
NORP | hat_color
GPE | hat
LAW | bald

In [None]:
# Write the trained pipeline to disk
output_dir = Path('/content/')
nlp.to_disk(output_dir)
print("Saved model to {}".format(output_dir))

# Read the pipeline back from the same directory and run it on a sample sentence
print("Loading from {}".format(output_dir))
nlp_updated = spacy.load(output_dir)
doc = nlp_updated("Fridge can be ordered in FlipKart")
reloaded_entities = [(span.text, span.label_) for span in doc.ents]
print("Entities {}".format(reloaded_entities))