In [9]:
import sys
!{sys.executable} -m pip install spacy



In [10]:
import spacy
import random
import time
import numpy as np
from spacy.util import minibatch, compounding

In [10]:
from os import path, mkdir
if not path.isdir("data/"):
    mkdir("data/")
if not path.isdir("models/"):
    mkdir("models/")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [5]:
!curl https://groups.csail.mit.edu/sls/downloads/movie/engtest.bio -o data/test.txt
!curl https://groups.csail.mit.edu/sls/downloads/movie/engtrain.bio -o data/train.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  246k  100  246k    0     0   258k      0 --:--:-- --:--:-- --:--:--  258k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  989k  100  989k    0     0  1149k      0 --:--:-- --:--:-- --:--:-- 1149k


In [11]:
def load_data_spacy(file_path):
    ''' Converts data from:
    label \t word \n label \t word \n \n label \t word
    to: sentence, {entities : [(start, end, label), (stard, end, label)]}
    '''
    file = open(file_path, 'r')
    training_data, entities, sentence, unique_labels = [], [], [], []
    current_annotation = None
    end = 0 # initialize counter to keep track of start and end characters
    for line in file:
        line = line.strip("\n").split("\t")
        # lines with len > 1 are words
        if len(line) > 1:
            label = line[0][2:]     # the .txt is formatted: label \t word, label[0:2] = label_type
            label_type = line[0][0] # beginning of annotations - "B", intermediate - "I"
            word = line[1]
            sentence.append(word)
            end += (len(word) + 1)  # length of the word + trailing space
           
            if label_type != 'I' and current_annotation:  # if at the end of an annotation
                entities.append((start, end - 2 - len(word), current_annotation))  # append the annotation
                current_annotation = None                 # reset the annotation
            if label_type == 'B':                         # if beginning new annotation
                start = end - len(word) - 1  # start annotation at beginning of word
                current_annotation = label   # append the word to the current annotation
            if label_type == 'I':            # if the annotation is multi-word
                current_annotation = label   # append the word
           
            if label != 'O' and label not in unique_labels:
                unique_labels.append(label)
 
        # lines with len == 1 are breaks between sentences
        if len(line) == 1:
            if current_annotation:
                entities.append((start, end - 1, current_annotation))
            sentence = " ".join(sentence)
            training_data.append([sentence, {'entities' : entities}])
            # reset the counters and temporary lists
            end = 0            
            entities, sentence = [], []
            current_annotation = None
    file.close()
    return training_data, unique_labels            
           
TRAIN_DATA, LABELS = load_data_spacy("data/train.txt")

In [12]:
print(*[x for x in TRAIN_DATA[1:10]], sep='\n')
print(LABELS)

['show me films with drew barrymore from the 1980s', {'entities': [(19, 33, 'ACTOR'), (43, 48, 'YEAR')]}]
['what movies starred both al pacino and robert deniro', {'entities': [(25, 34, 'ACTOR'), (39, 52, 'ACTOR')]}]
['find me all of the movies that starred harold ramis and bill murray', {'entities': [(39, 51, 'ACTOR'), (56, 67, 'ACTOR')]}]
['find me a movie with a quote about baseball in it', {'entities': []}]
['what movies have mississippi in the title', {'entities': [(17, 28, 'TITLE')]}]
['show me science fiction films directed by steven spielberg', {'entities': [(8, 29, 'GENRE'), (42, 58, 'DIRECTOR')]}]
['do you have any thrillers directed by sofia coppola', {'entities': [(16, 25, 'GENRE'), (38, 51, 'DIRECTOR')]}]
['what leonard cohen songs have been used in a movie', {'entities': [(5, 24, 'SONG')]}]
['show me films elvis films set in hawaii', {'entities': [(14, 19, 'ACTOR'), (26, 39, 'PLOT')]}]
['', 'ACTOR', 'YEAR', 'TITLE', 'GENRE', 'DIRECTOR', 'SONG', 'PLOT', 'REVIEW', 'CHARACTE

In [16]:
from spacy import displacy
import warnings
warnings.filterwarnings("ignore")

import en_core_web_sm
nlp = en_core_web_sm.load()
#nlp = spacy.load('en_core_web_sm')
TEST_DATA, _ = load_data_spacy("data/test.txt")

test_sentences = [x[0] for x in TEST_DATA[0:15]] # extract the sentences from [sentence, entity]
for x in test_sentences:
    doc = nlp(x)
    displacy.render(doc, jupyter = True, style = "ent")
warnings.filterwarnings("default")

In [17]:
# A simple decorator to log function processing time
def timer(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        print("Completed in {} seconds".format(int(te - ts)))
        return result
    return timed

# Data must be of the form (sentence, {entities: [start, end, label]})
@timer
def train_spacy(train_data, labels, iterations, dropout = 0.2, display_freq = 1):
    ''' Train a spacy NER model, which can be queried against with test data
   
    train_data : training data in the format of (sentence, {entities: [(start, end, label)]})
    labels : a list of unique annotations
    iterations : number of training iterations
    dropout : dropout proportion for training
    display_freq : number of epochs between logging losses to console
    '''
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
   
    # Add entity labels to the NER pipeline
    for i in labels:
        ner.add_label(i)

    # Disable other pipelines in SpaCy to only train NER
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        nlp.vocab.vectors.name = 'spacy_model' # without this, spaCy throws an "unnamed" error
        optimizer = nlp.begin_training()
        for itr in range(iterations):
            random.shuffle(train_data) # shuffle the training data before each iteration
            losses = {}
            batches = minibatch(train_data, size = compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(          
                    texts,
                    annotations,
                    drop = dropout,  
                    sgd = optimizer,
                    losses = losses)
            if itr % display_freq == 0:
                print("Iteration {} Loss: {}".format(itr + 1, losses))
    return nlp

# Train (and save) the NER model
ner = train_spacy(TRAIN_DATA, LABELS,6)
ner.to_disk("models/spacy_example")

  proc.begin_training(


Iteration 1 Loss: {'ner': 19431.911780422735}
Iteration 2 Loss: {'ner': 12924.012389415017}
Iteration 3 Loss: {'ner': 10953.757900691184}
Iteration 4 Loss: {'ner': 9912.242454589237}
Iteration 5 Loss: {'ner': 8942.424481595592}
Iteration 6 Loss: {'ner': 8423.218233748841}
Completed in 192 seconds


In [19]:
from spacy import displacy

# reload the trained model weights, and load_data is called to load and transform the test data. 
# The spacy function displacy is used to visualize the predictions of the first 15 test sentences. 

def load_model(model_path):
    ''' Loads a pre-trained model for prediction on new test sentences
   
    model_path : directory of model saved by spacy.to_disk
    '''
    nlp = spacy.blank('en')
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    ner = nlp.from_disk(model_path)
    return ner

ner = load_model("models/spacy_example")

TEST_DATA, _ = load_data_spacy("data/test.txt")

test_sentences = [x[0] for x in TEST_DATA[0:15]] # extract the sentences from [sentence, entity]
for x in test_sentences:
    doc = ner(x)
    displacy.render(doc, jupyter = True, style = "ent")

In [20]:
TRAIN_ENTITIES = {}
# test the trained model
for text, _ in TRAIN_DATA:
  doc = ner(text)
  for ent in doc.ents:
    ent_text, lbl = ent.text, ent.label_
    if lbl in TRAIN_ENTITIES:
      TRAIN_ENTITIES[lbl].append(ent_text)
    else:
      TRAIN_ENTITIES[lbl] = [ent_text]
  #print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
  #print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

In [21]:
# Training dataset: Get all entities for each categories

print(len(TRAIN_ENTITIES))
for k, v in TRAIN_ENTITIES.items():
  unique_values = set(v)
  print(k, ': ', len(unique_values), '\n', sorted(list(unique_values), reverse=False))

12
DIRECTOR :  921 
 ['a f silver', 'a nolan brother', 'adam barnick', 'adam fields', 'adam warren', 'adam wimpenny', 'adrian vitoria', 'akira kurosawa', 'akira kurusawas', 'alan g parker', 'alberto cavalcanti', 'alex chapple', 'alex de rakoff', 'alex gibney', 'alex proyas', 'alex steyermark', 'alexander hall', 'alexander williams', 'alfred hitchcock', 'alfred hitchcocks', 'alfred vohrer', 'alison murray', 'allan a goldstein', 'alvin rakoff', 'amos kollek', 'andrei tarkovsky', 'andrew bellware', 'andrew cymek', 'andrew gurland', 'andrew jacobs', 'andrew knight', 'andrew leman', 'andrew r jones', 'andrew stevens', 'andy fickman', 'andy wachowski', 'ang lee', 'anjelica huston', 'ann turner', 'anne norda', 'anthony asquith', 'anthony bell', 'anthony hopkins', 'anthony minghella', 'anthony russo', 'antoine fuqua', 'antoine thomas', 'ari taub', 'arnold schwarzenegger', 'arthur alston', 'arthur marks', 'arthur vincie', 'ashley horner', 'axel sand', 'b scott omalley', 'bam margera', 'barbara 

In [23]:
def calc_precision(pred, true):        
    precision = len([x for x in pred if x in true]) / (len(pred) + 1e-20) # true positives / total pred
    return precision

def calc_recall(pred, true):
    recall = len([x for x in true if x in pred]) / (len(true) + 1e-20)    # true positives / total test
    return recall

def calc_f1(precision, recall):
    f1 = 2 * ((precision * recall) / (precision + recall + 1e-20))
    return f1

from itertools import chain

# run the predictions on each sentence in the test dataset, and return the spacy object
preds = [ner(x[0]) for x in TEST_DATA]

precisions, recalls, f1s = [], [], []
c = 0

# iterate over predictions and test data and calculate precision, recall, and F1-score
for pred_data, true in zip(preds, TEST_DATA):
    true = [x[2] for x in list(chain.from_iterable(true[1].values()))] # x[2] = annotation, true[1] = (start, end, annot)    
    pred = [i.label_ for i in pred_data.ents] # i.label_ = annotation label, pred.ents = list of annotations
    c = c + 1
    if c in [29,30, 31]:
      print('\n', pred_data.text, pred_data.ents)
      print('True: ', true)
      print('Pred: ', pred)
    precision = calc_precision(true, pred)
    precisions.append(precision)
    recall = calc_recall(true, pred)
    recalls.append(recall)
    f1s.append(calc_f1(precision, recall))
   
print("Precision: {} \nRecall: {} \nF1-score: {}".format(np.around(np.mean(precisions), 3),
                                                         np.around(np.mean(recalls), 3),
                                                         np.around(np.mean(f1s), 3)))


 what is a bronx tale rated ()
True:  ['TITLE', 'RATING']
Pred:  []

 is there a pg 13 movie thats scary (pg 13, scary)
True:  ['RATING', 'GENRE']
Pred:  ['RATING', 'GENRE']

 what movies made in 2004 were pg (2004, pg)
True:  ['YEAR', 'RATING']
Pred:  ['YEAR', 'RATING']
Precision: 0.855 
Recall: 0.872 
F1-score: 0.855


In [24]:
# Test dataset: Get all entities for each categories
TEST_ENTITIES = {}

for text, _ in TEST_DATA:
  doc = ner(text)
  for ent in doc.ents:
    ent_text, lbl = ent.text, ent.label_
    if lbl in TEST_ENTITIES:
      TEST_ENTITIES[lbl].append(ent_text)
    else:
      TEST_ENTITIES[lbl] = [ent_text]

print(len(TEST_ENTITIES))
for k, v in TEST_ENTITIES.items():
  unique_values = set(v)
  print(k, ': ', len(unique_values), '\n', sorted(list(unique_values), reverse=False))

12
GENRE :  123 
 ['action', 'action adventure', 'action comedy', 'action thriller', 'adult animated', 'adult fantasy', 'adult horror', 'adventure', 'animated', 'animated childrens', 'animation', 'anime', 'avant garde', 'avante garde', 'bio pic', 'biographical', 'biography', 'biographys', 'british', 'canadian', 'cartoon', 'cartoons', 'cheech', 'chick', 'child', 'children', 'childrens', 'comdedies', 'comedies', 'comedy', 'comedys', 'confession', 'cowboy', 'crime', 'crime drama', 'crime noir', 'culture', 'dark', 'decently', 'disaster', 'disney', 'disney cartoon', 'documentaries', 'documentary', 'drama', 'dramas', 'emotional', 'entertaining', 'entertainment', 'family', 'fantasy', 'film noir', 'foreign romantic', 'fun action', 'funny', 'gags', 'gangster', 'girl', 'historical', 'history', 'horror', 'horror comedy', 'independent', 'independent comedy', 'isolation adventure', 'jesus', 'kid', 'kids', 'kidz', 'laugh', 'love', 'mafia', 'melodrama', 'military', 'mockumentary', 'musical', 'musical

In [25]:
f = open('data/test_prediction.txt', 'w')
# test the trained model
for text, true_lbl in TEST_DATA:
  doc = ner(text)
  print(text, true_lbl)
  for ent in doc: #.ents: 
    #print(ent.text)
    #ent_text, lbl = ent.text, ent.label_
    if ent.ent_iob_ == 'O':
      f.write('O ' + ent.text + '\n')
      #print('O', ent.text) #, '\t', ent.tag, ent.idx)  
    else:
      f.write(ent.ent_iob_ + '-' + ent.ent_type_ + ' ' + ent.text + '\n')
      #print(ent.ent_iob_, '-', ent.ent_type_, ent.text) #, '\t', ent.tag, ent.idx)
    ent_text, lbl = ent.text, ent.tag_
    if lbl in TEST_ENTITIES:
      TEST_ENTITIES[lbl].append(ent_text)
    else:
      TEST_ENTITIES[lbl] = [ent_text]
  f.write('\n')
  #print()
  #if c == 10:
  #  break
  #c = c + 1

f.close()

  f = open('data/test_prediction.txt', 'w')


are there any good romantic comedies out right now {'entities': [(19, 36, 'GENRE'), (41, 50, 'YEAR')]}
show me a movie about cars that talk {'entities': [(22, 36, 'PLOT')]}
list the five star rated movies starring mel gibson {'entities': [(9, 18, 'RATINGS_AVERAGE'), (41, 51, 'ACTOR')]}
what science fiction films have come out recently {'entities': [(5, 20, 'GENRE'), (41, 49, 'YEAR')]}
did the same director make all of the harry potter movies {'entities': [(38, 50, 'TITLE')]}
show me 1980s action movies {'entities': [(8, 13, 'YEAR'), (14, 20, 'GENRE')]}
what is the name of the third movie in the star trek series {'entities': [(43, 59, 'TITLE')]}
can you get a soundtrac for the harry potter films {'entities': [(14, 23, 'SONG'), (32, 50, 'TITLE')]}
find me science fiction movies since 2005 {'entities': [(8, 23, 'GENRE'), (31, 41, 'YEAR')]}
what is the most current movie featuring mat damon {'entities': [(17, 24, 'YEAR'), (41, 50, 'ACTOR')]}
show me films where jim carrey is a detective {'

In [26]:
json_doc = doc.to_json()
print(json_doc)

{'text': 'what s the title of the movie about captain jack sparrow', 'ents': [{'start': 36, 'end': 56, 'label': 'PLOT'}], 'tokens': [{'id': 0, 'start': 0, 'end': 4}, {'id': 1, 'start': 5, 'end': 6}, {'id': 2, 'start': 7, 'end': 10}, {'id': 3, 'start': 11, 'end': 16}, {'id': 4, 'start': 17, 'end': 19}, {'id': 5, 'start': 20, 'end': 23}, {'id': 6, 'start': 24, 'end': 29}, {'id': 7, 'start': 30, 'end': 35}, {'id': 8, 'start': 36, 'end': 43}, {'id': 9, 'start': 44, 'end': 48}, {'id': 10, 'start': 49, 'end': 56}]}


In [70]:
import json

def to_disk(filename, document):        
    with open(filename, "w", encoding="utf8") as f:
        f.write(json.dumps(document))

def from_disk(filename):
    # This will receive the directory path + /my_component    
    with open(filename, "r", encoding="utf8") as f:
        document = json.loads(f)
    return document