# MIT Movie Dataset - spaCy Models

## Directions: 

First, run every cell from the top of the notebook through the end of the "Load Spacy Formatted Data" section. 

Second, to check the performance of a model that has already been trained in this notebook, jump to that model's "Evaluate Model" section. There is no need to rerun the training - doing so will just overwrite the previously saved model.


In [None]:
import pandas as pd
import numpy as np

# Run this cell to mount your Google Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

os.chdir('/content/drive/My Drive/Colab Notebooks/DAAN888/data')
#os.chdir('/content/drive/My Drive/DAAN888/data')
os.getcwd()

'/content/drive/My Drive/Colab Notebooks/DAAN888/data'

In [None]:
model_dir = '/content/drive/My Drive/Colab Notebooks/DAAN888/models/'
data_dir = '/content/drive/My Drive/Colab Notebooks/DAAN888/data/'

## Load Spacy Formatted Data


In [None]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [None]:
!pip install -U spacy[cuda101] # change your version based on above 10.1 = 101

Collecting spacy[cuda101]
[?25l  Downloading https://files.pythonhosted.org/packages/50/b2/12466d3018bb84b039139ef76436ea7a01e98125c2aee6a81e527eb4ebe1/spacy-2.3.4-cp36-cp36m-manylinux2014_x86_64.whl (10.4MB)
[K     |████████████████████████████████| 10.4MB 9.9MB/s 
Collecting thinc<7.5.0,>=7.4.1
[?25l  Downloading https://files.pythonhosted.org/packages/1b/c9/ce2e03720a5647fd90da575325376ff258653a05f357aa970fd87e6c1a55/thinc-7.4.3-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 40.5MB/s 
Installing collected packages: thinc, spacy
  Found existing installation: thinc 7.4.0
    Uninstalling thinc-7.4.0:
      Successfully uninstalled thinc-7.4.0
  Found existing installation: spacy 2.2.4
    Uninstalling spacy-2.2.4:
      Successfully uninstalled spacy-2.2.4
Successfully installed spacy-2.3.4 thinc-7.4.3


In [None]:
!pip install spacy-lookups-data

Collecting spacy-lookups-data
[?25l  Downloading https://files.pythonhosted.org/packages/41/b7/d5c635e51718c874606fb08249c0fab710286240d54847dc60bad3dfceac/spacy_lookups_data-0.3.2.tar.gz (93.8MB)
[K     |████████████████████████████████| 93.8MB 110kB/s 
Building wheels for collected packages: spacy-lookups-data
  Building wheel for spacy-lookups-data (setup.py) ... [?25l[?25hdone
  Created wheel for spacy-lookups-data: filename=spacy_lookups_data-0.3.2-py2.py3-none-any.whl size=93807573 sha256=fbed1e8e68d2474b499f46cfe536441ba30e7fa356293fb4fbdb2e1310d76a13
  Stored in directory: /root/.cache/pip/wheels/5b/f4/d0/bf720a06127c95d9be2a81d197a3f1998ee5fc63410944e28f
Successfully built spacy-lookups-data
Installing collected packages: spacy-lookups-data
Successfully installed spacy-lookups-data-0.3.2


In [None]:
import spacy
activated = spacy.prefer_gpu()
print('GPU: ', activated)

GPU:  True


In [None]:
import json

with open('spacy_mitmovie_train.json', 'rb') as handle: 
  train = json.load(handle)

In [None]:
with open('spacy_mitmovie_test.json', 'rb') as handle: 
  test = json.load(handle)

In [None]:
train[0:2]

[['what movies star bruce willis', {'entities': [[17, 29, 'ACTOR']]}],
 ['show me films with drew barrymore from the 1980s',
  {'entities': [[19, 33, 'ACTOR'], [43, 48, 'YEAR']]}]]

In [None]:
#len(train)

In [None]:
#test[0]

In [None]:
movie_labels = ['ACTOR',
  'YEAR',
  'TITLE',
  'GENRE',
  'DIRECTOR',
  'SONG',
  'PLOT',
  'REVIEW',
  'CHARACTER',
  'RATING',
  'RATINGS_AVERAGE',
  'TRAILER']

In [None]:
len(movie_labels)

12

In [None]:
# custom tokenizer so that we can override Spacy's sophisticated one

from spacy.tokens import Doc

def custom_tokenizer(text):
    """Tokenize ``text`` on whitespace only, bypassing spaCy's rule-based tokenizer.

    The ``Doc`` is built from the vocab of the notebook-level ``nlp`` object, which
    is looked up at call time, so ``nlp`` must be defined before this is called.

    text: raw sentence string.
    returns: a ``Doc`` whose tokens are the pieces of ``text.split()``.
    """
    return Doc(nlp.vocab, words=text.split())



---



### Model 1: Blank spaCy Model

In [None]:
# https://s3.amazonaws.com/assets.datacamp.com/production/course_8392/slides/chapter4.pdf

# Build an empty English pipeline and route tokenization through the
# whitespace-only tokenizer defined earlier in the notebook.
nlp = spacy.blank('en', disable=['tagger', 'parser'])
nlp.tokenizer = custom_tokenizer

# Attach a new, untrained entity recognizer to the pipeline.
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

# Register each MIT Movie entity label with the recognizer.
for label in movie_labels:
    ner.add_label(label)

In [None]:
ner.labels

#### Train Model

In [None]:
#train_count = 38000
epochs = 10

In [None]:
import random
from datetime import datetime
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse

# https://spacy.io/usage/training#ner

start = datetime.now()

# Initialise the model weights; the returned optimizer is passed explicitly to every
# nlp.update() call below instead of being left unused.
optimizer = nlp.begin_training()

# Train for n epochs
for itn in range(epochs):

    random.shuffle(train) # reorders `train` in place so each epoch sees a new order
    losses = {} # per-epoch loss dictionary handed to nlp.update

    # batch sizes compound from 4 towards 32 (factor 1.001); the schedule restarts each epoch
    batches = minibatch(train, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
      texts, annotations = zip(*batch)
      # Update the model, using a dropout rate of 0.30
      nlp.update(texts, annotations, sgd=optimizer, drop=0.30, losses=losses)

    print("Losses", losses)

print('Time to train: ', datetime.now() - start)

In [None]:
nlp.meta['name'] = 'mitmovie_spacy_blank_drpt30'

In [None]:
# `exclude` expects a list of names; a bare string only worked through substring
# matching. The tokenizer is skipped because custom_tokenizer is a plain function and
# is re-attached after spacy.load() in the evaluation cell.
nlp.to_disk(model_dir + nlp.meta['name'], exclude=['tokenizer'])

#### Evaluate Model

In [None]:
# load the model trained above
nlp = spacy.load(model_dir + 'mitmovie_spacy_blank_drpt30')
nlp.tokenizer = custom_tokenizer

In [None]:
# Sanity check: predicted vs. annotated entities for the first three test sentences
for text, orig_ents in test[:3]:
  doc = nlp(text)
  extracted = [(ent.text, ent.label_) for ent in doc.ents]
  print("Extracted Entities:", extracted)
  print("Original Entities:", orig_ents)
  print('Original Sentence:', text)
  print()

In [None]:
scorer = nlp.evaluate(test, verbose=False)

In [None]:
scorer.ents_per_type

In [None]:
scorer.ents_p, scorer.ents_r, scorer.ents_f
 # (86.23627413858387, 85.31560217269151, 85.77346765841259) for 10 epochs
 # (85.22388059701493, 85.55909346319535, 85.39115805215441) for 30 epochs



---



### Model 2: MIT Movie Word Embedding

#### Train Embedding

This embedding is on the training data of the MIT Movie dataset.

In [None]:
import pickle 

with open('mitmovie.pickle', 'rb') as handle:
    dataset = pickle.load(handle)

In [None]:
import gensim
from datetime import datetime

# dimensionality of each word vector
embedding_dim = 200

# a word needs at least this many occurrences to receive a vector
min_word_count = 1

start = datetime.now()
embedding = gensim.models.Word2Vec(
    sentences=dataset['train_tokens'],
    size=embedding_dim,
    window=5,
    workers=4,
    min_count=min_word_count,
)
words = list(embedding.wv.vocab)
print('Vocabulary size in Word2Vec %d' % len(words))
print('\n')

print('Time to complete:', datetime.now() - start )

Vocabulary size in Word2Vec 6710


Time to complete: 0:00:02.303673


In [None]:
# save the embedding
filename = model_dir + 'MITMovie/embeddings/' + 'trainset_embedding_word2vec.txt'
embedding.wv.save_word2vec_format(filename, binary=False)

In [None]:
#import gensim
#from datetime import datetime

##embedding = gensim.models.Word2Vec.load(model_dir + 'MITMovie/embeddings/' + 'trainset_embedding_word2vec.txt.gz')

In [None]:
[word for (word, score) in embedding.wv.most_similar('starred')]

['during',
 'filmed',
 'star',
 'over',
 'this',
 'main',
 'it',
 'played',
 'co',
 'role']

In [None]:
os.chdir(model_dir + 'MITMovie/embeddings/')

In [None]:
# https://info.cambridgespark.com/latest/word-embeddings-in-python
!gzip trainset_embedding_word2vec.txt

In [None]:
!python -m spacy init-model en ./spacy.word2vec.model --vectors-loc trainset_embedding_word2vec.txt.gz

#### Train Model

In [None]:
os.chdir(model_dir + 'MITMovie/embeddings/')

In [None]:
nlp = spacy.load('./spacy.word2vec.model/')
nlp.tokenizer = custom_tokenizer

In [None]:
tokens = nlp(u'what actor starred in a film about jerry lee lewis')

In [None]:
for token in tokens:
  print(token.text, token.has_vector, token.vector_norm, token.is_oov) # oov is does the word have a vector

In [None]:
# Create blank entity recognizer and add it to the pipeline
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

In [None]:
# Add all new labels
for label in movie_labels:
  ner.add_label(label)

In [None]:
epochs = 10

In [None]:
import random
from datetime import datetime
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse


start = datetime.now()

# Initialise the model weights; the returned optimizer is passed explicitly to every
# nlp.update() call below instead of being left unused.
optimizer = nlp.begin_training()

# Train for n epochs
for itn in range(epochs):

    random.shuffle(train) # reorders `train` in place so each epoch sees a new order

    losses = {} # per-epoch loss dictionary handed to nlp.update

    # batch sizes compound from 4 towards 32 (factor 1.001); the schedule restarts each epoch
    batches = minibatch(train, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
      texts, annotations = zip(*batch)
      # Update the model, using a dropout rate of 0.30
      nlp.update(texts, annotations, sgd=optimizer, drop=0.30, losses=losses)

    print("Losses", losses)

print('Time to train: ', datetime.now() - start)

In [None]:
from pathlib import Path

# The model trained above lives in `nlp`. `custom_spacy` is only created later by
# spacy.load() in the evaluation section, so naming or saving through it fails on a
# fresh kernel (or re-saves a previously loaded model instead of the new one).
nlp.meta['name'] = 'mitmovie_spacy_customembed_drpt30'

output_dir = Path(model_dir + nlp.meta['name'])

# Always write the freshly trained model, replacing any earlier copy, in the same way
# as the save cells of the other two models. The tokenizer is excluded because
# custom_tokenizer is a plain function with no to_disk method.
nlp.to_disk(output_dir, exclude=['tokenizer'])

#### Evaluate Model

In [None]:
# load the model trained above
custom_spacy = spacy.load(model_dir + 'mitmovie_spacy_customembed_drpt30')

# custom_tokenizer builds its Doc from the global `nlp` vocab, so point `nlp` at the
# loaded model before re-attaching the whitespace tokenizer used during training
# (the evaluation cells of the other models do the same).
nlp = custom_spacy
nlp.tokenizer = custom_tokenizer

In [None]:
# quick test on the trained model, using held-out test sentences like the other
# models' quick tests (this cell previously sampled from the training set)
for text, orig_ents in test[0:3]:
  doc = custom_spacy(text)
  print("Extracted Entities:", [(ent.text, ent.label_) for ent in doc.ents])
  #print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
  print("Original Entities:", orig_ents)
  print('Original Sentence:', text)
  print()

In [None]:
# test on remaining samples
scorer = custom_spacy.evaluate(test, verbose=False)

In [None]:
scorer.ents_per_type

In [None]:
scorer.ents_p, scorer.ents_r, scorer.ents_f



---



### Model 3: IMDB Word Embedding

In [None]:
# retrieve large corpus of IMDB review for pre-training a word embedding
# http://ai.stanford.edu/~amaas/data/sentiment/
# http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.bib


# don't rerun this code it will re-download the file
#!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

#### Train Embedding

Don't rerun this section: training the word embedding takes a long time. Jump straight to the "Evaluate Model" section and run it instead. 

In [None]:
!pip install nlp



In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset

#don't rerun this code it will re-download the file
# #https://huggingface.co/datasets/imdb
dataset = load_dataset("imdb")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1902.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1004.0, style=ProgressStyle(description…


Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.06 MiB, post-processed: Unknown size, total: 207.28 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=84125825.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3. Subsequent calls will reuse this data.


In [None]:
# import pickle

# with open(data_dir + 'imdb.pickle', 'wb') as handle:
#     pickle.dump(dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# import pickle

# with open(data_dir + 'imdb.pickle', 'rb') as handle:
#   dataset = pickle.load(handle)

In [None]:
dataset.keys()

In [None]:
dataset['train']['text'][0]

In [None]:
dataset['test']['text'][0]

In [None]:
dataset['unsupervised']['text'][0]

In [None]:
import re

# Lower-case every review and collapse each run of non-alphanumeric characters into a
# single space, pooling the train, test and unsupervised splits into one list.
non_alnum = re.compile('[^A-Za-z0-9]+')
imdb_all = [
    non_alnum.sub(' ', review.lower())
    for split in ['train', 'test', 'unsupervised']
    for review in dataset[split]['text']
]

len(imdb_all)

100000

In [None]:
imdb_all = [sent.split() for sent in imdb_all]

In [None]:
imdb_all[0][0:10]

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the']

In [None]:
import gensim
from datetime import datetime

# dimensionality of each word vector
embedding_dim = 200

# a word needs at least this many occurrences to receive a vector
min_word_count = 3

start = datetime.now()
embedding = gensim.models.Word2Vec(
    sentences=imdb_all,
    size=embedding_dim,
    window=5,
    workers=4,
    min_count=min_word_count,
)
words = list(embedding.wv.vocab)
print('Vocabulary size in Word2Vec %d' % len(words))
print('\n')

print('Time to complete:', datetime.now() - start )

Vocabulary size in Word2Vec 68608


Time to complete: 0:03:36.920461


In [None]:
filename = model_dir + 'MITMovie/embeddings/' + 'imdb_embedding_word2vec.txt'
embedding.wv.save_word2vec_format(filename, binary=False)

In [None]:
import warnings
warnings.filterwarnings("ignore")

[word for (word, score) in embedding.wv.most_similar('starred')]

['starring',
 'appeared',
 'featured',
 'sang',
 'helmed',
 'outshines',
 'boasted',
 'participated',
 'collaborated',
 'wrote']

In [None]:
os.chdir(model_dir + 'MITMovie/embeddings/')

In [None]:
# https://info.cambridgespark.com/latest/word-embeddings-in-python
!gzip imdb_embedding_word2vec.txt

In [None]:
!python -m spacy init-model en ./spacy.word2vec.imdb.model --vectors-loc imdb_embedding_word2vec.txt.gz

#### Train Model

In [None]:
os.chdir(model_dir + 'MITMovie/embeddings/')

In [None]:
nlp = spacy.load('./spacy.word2vec.imdb.model/')
nlp.tokenizer = custom_tokenizer

In [None]:
tokens = nlp(u'what actor starred in a film about jerry lee lewis')

In [None]:
for token in tokens:
  print(token.text, token.has_vector, token.vector_norm, token.is_oov) # oov is does the word have a vector

In [None]:
# Create blank entity recognizer and add it to the pipeline
ner = nlp.create_pipe('ner')
nlp.add_pipe(ner)

In [None]:
# Add all new labels
for label in movie_labels:
  ner.add_label(label)

In [None]:
epochs = 10

In [None]:
import random
from datetime import datetime
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse


start = datetime.now()

# Initialise the model weights; the returned optimizer is passed explicitly to every
# nlp.update() call below instead of being left unused.
optimizer = nlp.begin_training()

# Train for n epochs
for itn in range(epochs):

    random.shuffle(train) # reorders `train` in place so each epoch sees a new order

    losses = {} # per-epoch loss dictionary handed to nlp.update

    # batch sizes compound from 4 towards 32 (factor 1.001); the schedule restarts each epoch
    batches = minibatch(train, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
      texts, annotations = zip(*batch)
      # Update the model, using a dropout rate of 0.30
      nlp.update(texts, annotations, sgd=optimizer, drop=0.30, losses=losses)

    print("Losses", losses)

print('Time to train: ', datetime.now() - start)

In [None]:
nlp.meta['name'] = 'mitmovie_spacy_customembed_imdb_drpt30'

In [None]:
# `exclude` expects a list of names; a bare string only worked through substring
# matching. The tokenizer is skipped because custom_tokenizer is a plain function and
# is re-attached after spacy.load() in the evaluation cell.
nlp.to_disk(model_dir + nlp.meta['name'], exclude=['tokenizer'])

#### Evaluate Model

In [None]:
# load the model trained above
nlp = spacy.load(model_dir + 'mitmovie_spacy_customembed_imdb_drpt30')
nlp.tokenizer = custom_tokenizer 

In [None]:
# Sanity check: predicted vs. annotated entities for the first three test sentences
for text, orig_ents in test[:3]:
  doc = nlp(text)
  extracted = [(ent.text, ent.label_) for ent in doc.ents]
  print("Extracted Entities:", extracted)
  print("Original Entities:", orig_ents)
  print('Original Sentence:', text)
  print()

In [None]:
# test on remaining samples
scorer = nlp.evaluate(test, verbose=False)

In [None]:
scorer.ents_per_type

In [None]:
scorer.ents_p, scorer.ents_r, scorer.ents_f



---



---



## Evaluate Any spaCy Model with BIO format

### Load BIO Format Test Data

In [None]:
os.chdir('/content/drive/My Drive/Colab Notebooks/DAAN888/data')

In [None]:
import pickle 

with open('mitmovie.pickle', 'rb') as handle:
    dataset = pickle.load(handle)

In [None]:
dataset['test_labels'][0]

['O', 'O', 'O', 'O', 'B-GENRE', 'I-GENRE', 'O', 'B-YEAR', 'I-YEAR']

### Load Chosen spaCy Model


In [None]:
#model = 'mitmovie_spacy_blank_drpt30'
#model = 'mitmovie_spacy_customembed_drpt30'
model = 'mitmovie_spacy_customembed_imdb_drpt30'

In [None]:
nlp = spacy.load(model_dir + model)

# force spacy to accept the tokenization we give it
nlp.tokenizer = custom_tokenizer 

### Get Predictions

In [None]:
# https://stackoverflow.com/questions/59200123/converting-spacy-training-data-format-to-spacy-cli-format-for-blank-ner/59209377#59209377
# https://towardsdatascience.com/extend-named-entity-recogniser-ner-to-label-new-entities-with-spacy-339ee5979044

import spacy
from spacy.gold import biluo_tags_from_offsets


def biluo_to_bio(tags):
    """Convert a list of BILUO tags to BIO: an 'L-' prefix becomes 'I-', 'U-' becomes 'B-'.

    Only the leading two-character prefix is rewritten, so a label that itself
    contains 'L-' or 'U-' is left intact. Every other tag is returned unchanged.
    """
    prefix_map = {'L-': 'I-', 'U-': 'B-'}
    return [prefix_map.get(tag[:2], tag[:2]) + tag[2:] for tag in tags]


# one BIO tag list per test sentence, aligned with the whitespace tokens
pred_tags = []
for text, annot in test:

    doc = nlp(text)
    offsets = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

    # derive one tag per token from the predicted entity spans, then map to BIO
    pred_tags.append(biluo_to_bio(biluo_tags_from_offsets(doc, offsets)))

In [None]:
pred_tags[0:5]

[['O', 'O', 'O', 'O', 'B-GENRE', 'I-GENRE', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'B-PLOT', 'O', 'O'],
 ['O', 'O', 'B-RATINGS_AVERAGE', 'O', 'O', 'O', 'O', 'B-ACTOR', 'I-ACTOR'],
 ['O', 'B-GENRE', 'I-GENRE', 'O', 'O', 'O', 'O', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TITLE', 'I-TITLE', 'O']]

In [None]:
dataset['test_labels'][0:5]

[['O', 'O', 'O', 'O', 'B-GENRE', 'I-GENRE', 'O', 'B-YEAR', 'I-YEAR'],
 ['O', 'O', 'O', 'O', 'O', 'B-PLOT', 'I-PLOT', 'I-PLOT'],
 ['O',
  'O',
  'B-RATINGS_AVERAGE',
  'I-RATINGS_AVERAGE',
  'O',
  'O',
  'O',
  'B-ACTOR',
  'I-ACTOR'],
 ['O', 'B-GENRE', 'I-GENRE', 'O', 'O', 'O', 'O', 'B-YEAR'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TITLE', 'I-TITLE', 'O']]

In [None]:
# confirm same number of docs in predicted and actual
len(pred_tags), len(dataset['test_labels'])

(2443, 2443)

In [None]:
# collect the positions where the predicted and true tag sequences differ in length
not_match = [
    idx
    for idx, (pred_doc, true_doc) in enumerate(zip(pred_tags, dataset['test_labels']))
    if len(pred_doc) != len(true_doc)
]

In [None]:
not_match

[]

In [None]:
# since tags are in list of list, stretch them out into one continuous list
preds_stretched = [label for doc in pred_tags for label in doc]
trues_stretched = [label for doc in dataset['test_labels'] for label in doc]

In [None]:
from sklearn.metrics import classification_report

# Token-level report over the flattened tag lists. It is computed before the output
# file is opened, so a failure here does not leave an empty report file behind.
class_report = classification_report(trues_stretched, preds_stretched)

# Store the report inside the saved model's directory; the context manager closes the
# file even if the write fails.
with open(model_dir + model + '/class_report_test.txt', 'w') as f:
    print(class_report, file=f)

print(class_report)

  _warn_prf(average, modifier, msg_start, len(result))


                   precision    recall  f1-score   support

          B-ACTOR       0.88      0.95      0.91       812
      B-CHARACTER       0.63      0.59      0.61        90
       B-DIRECTOR       0.94      0.81      0.87       456
          B-GENRE       0.94      0.96      0.95      1117
           B-PLOT       0.72      0.73      0.72       491
         B-RATING       0.97      0.97      0.97       500
B-RATINGS_AVERAGE       0.93      0.91      0.92       451
         B-REVIEW       0.33      0.09      0.14        56
           B-SONG       0.84      0.59      0.70        54
          B-TITLE       0.82      0.83      0.82       562
        B-TRAILER       0.81      0.87      0.84        30
           B-YEAR       0.95      0.95      0.95       720
          I-ACTOR       0.88      0.95      0.91       862
      I-CHARACTER       0.57      0.51      0.54        75
       I-DIRECTOR       0.94      0.80      0.86       496
          I-GENRE       0.94      0.70      0.80       

In [None]:
!pip install seqeval



In [None]:
from seqeval.metrics import classification_report as classification_report_seqeval

# Entity-level report on the per-sentence tag lists. It is computed before the output
# file is opened, so a failure here does not leave an empty report file behind.
seq_class_report = classification_report_seqeval(dataset['test_labels'], pred_tags)

# Store the report inside the saved model's directory; the context manager closes the
# file even if the write fails.
with open(model_dir + model + '/seq_class_report_test.txt', 'w') as f:
    print(seq_class_report, file=f)

print(seq_class_report)

                 precision    recall  f1-score   support

          ACTOR       0.87      0.94      0.90       812
      CHARACTER       0.61      0.57      0.59        90
       DIRECTOR       0.93      0.80      0.86       456
          GENRE       0.91      0.93      0.92      1117
           PLOT       0.67      0.68      0.67       491
         RATING       0.93      0.93      0.93       500
RATINGS_AVERAGE       0.87      0.86      0.86       451
         REVIEW       0.33      0.09      0.14        56
           SONG       0.61      0.43      0.50        54
          TITLE       0.80      0.80      0.80       562
        TRAILER       0.81      0.87      0.84        30
           YEAR       0.95      0.94      0.95       720

      micro avg       0.87      0.86      0.86      5339
      macro avg       0.77      0.74      0.75      5339
   weighted avg       0.86      0.86      0.86      5339



## Visualize Predictions

In [None]:
# test sentences at indices 80-88 plus index 90 (index 89 is skipped)
sample = [text for text, offsets in test[80:89]]
sample += [text for text, offsets in test[90:91]]
sample

['what was channing tatums first movie',
 'i would like a list of movies about dancing from the past 10 years',
 'who stars in project x',
 'find action movies featuring comic book characters',
 'what are some g rated movies with fairies',
 'name a movie starring britney spears',
 'what movie did rod serling write',
 'is there an animated adult fantasy movie',
 'the song sunshine on my shoulders was the soundtrack for what movie',
 'find me the name of the actor that played v in v for vendetta']

In [None]:
color = "linear-gradient(90deg, #aa9cfc, #fc9ce7)"
color_list = [
    'aquamarine', 'cyan', 'lightblue', 'lavender', 'teal', 'coral',
    'turquoise', 'beige', 'salmon', 'lightgreen', 'azure', 'silver',
]
# pair each entity label with the colour at the same position in color_list
colors = dict(zip(movie_labels, color_list))
options = {"ents": movie_labels, "colors": colors}

In [None]:
from spacy import displacy

for text in sample:
  doc = nlp(text)
  displacy.render(doc, style="ent", jupyter=True, options=options)

In [None]:
colors

{'ACTOR': 'aquamarine',
 'CHARACTER': 'azure',
 'DIRECTOR': 'teal',
 'GENRE': 'biege',
 'PLOT': 'turquoise',
 'RATING': 'lightgreen',
 'RATINGS_AVERAGE': 'salmon',
 'REVIEW': 'lavender',
 'SONG': 'coral',
 'TITLE': 'lightblue',
 'TRAILER': 'silver',
 'YEAR': 'cyan'}

In [None]:
movie_labels

['ACTOR',
 'YEAR',
 'TITLE',
 'GENRE',
 'DIRECTOR',
 'SONG',
 'PLOT',
 'REVIEW',
 'CHARACTER',
 'RATING',
 'RATINGS_AVERAGE',
 'TRAILER']

# Unused Code

In [None]:
from spacy.gold import GoldParse
from spacy.gold import biluo_tags_from_offsets, iob_to_biluo, offsets_from_biluo_tags
from spacy.tokens import Doc

def docs_golds(data):
  """Build spaCy ``Doc`` and ``GoldParse`` objects from pre-tokenized, IOB-tagged data.

  data: sequence of ``(tokens, tags)`` pairs; it is iterated twice.
  returns: ``(docs, golds)``, two parallel lists. Each ``Doc`` is created from the
    notebook-level ``nlp`` vocab, and each ``GoldParse`` carries that sentence's tags
    converted with ``iob_to_biluo``.
  """
  docs = [Doc(nlp.vocab, words=words) for words, _ in data]
  biluo_tags = [iob_to_biluo(iob_tags) for _, iob_tags in data]
  golds = [GoldParse(doc, entities=ents) for doc, ents in zip(docs, biluo_tags)]
  return docs, golds