In [2]:
import spacy
import random
import datetime
import json
import math
import dateutil.parser
from pathlib import Path
from spacy.util import minibatch, compounding

#is_using_gpu = spacy.prefer_gpu()
#print(is_using_gpu)

largenlp = spacy.load('en_core_web_lg')

### JSON Content utlities for the TMDB corpus

In [3]:
#Iterate through the movies
def rawTmdbMovies(filename):
    return json.load(open(filename))


def writeTmdmMovies(rawMoviesJson, path):
    with open(path, 'w') as f:
        json.dump(rawMoviesJson, f)

def tmdbMovies(filename="../tmdb.json"):
    tmdbMovies = rawTmdbMovies(filename)
    for movieId, tmdbMovie in tmdbMovies.items():
        yield (movieId, tmdbMovie)

def indexableMovies(filename="../tmdb.json"):
    """ Generates TMDB movies, similar to how ES Bulk indexing
        uses a generator to generate bulk index/update actions """
    for movieId, tmdbMovie in tmdbMovies(filename):
        try:
            releaseDate = None
            if 'release_date' in tmdbMovie and len(tmdbMovie['release_date']) > 0:
                releaseDate = tmdbMovie['release_date'] + 'T00:00:00Z'

            yield {'id': movieId,
                   'title': tmdbMovie['title'],
                   'overview': tmdbMovie['overview'],
                   'tagline': tmdbMovie['tagline'],
                   'directors': [director['name'] for director in tmdbMovie['directors']],
                   'cast': [castMember['name'] for castMember in tmdbMovie['cast']],
                   'genres': [genre['name'] for genre in tmdbMovie['genres']],
                   'release_date': releaseDate,
                   'vote_average': tmdbMovie['vote_average'] if 'vote_average' in tmdbMovie else None,
                   'vote_count': int(tmdbMovie['vote_count']) if 'vote_count' in tmdbMovie else None,
                   'location_entities': tmdbMovie['location_entities'] if 'location_entities' in tmdbMovie else [],
                   'location': tmdbMovie['location'] if 'location' in tmdbMovie else [],
                   'location_city': tmdbMovie['location_city'] if 'location_city' in tmdbMovie else [],
                   'location_state': tmdbMovie['location_state'] if 'location_state' in tmdbMovie else [],
                   'location_country': tmdbMovie['location_country'] if 'location_country' in tmdbMovie else [],
                   }
        except KeyError as k: # Ignore any movies missing these attributes
            continue

### Get the overview texts

We're going to use the overview text to train lowercase entities for our queries.

This is free training data!  We'll take the entities from the true-cased overview text, and apply them to the lower-cased text, and train a new model

In [4]:
def overviews(filename="../tmdb.json"):
    #Get all the overviews
    texts = []
    for movie_id, movie in tmdbMovies(filename=filename):
        if "overview" in movie and isinstance(movie["overview"], str) and len(movie["overview"]):
            texts.append(movie["overview"])
    return texts

In [5]:
overview_texts = overviews()

In [5]:
#For all the overviews in all the movies, find their entities, and test if they are found when lowercased
def get_ent_info(span):
    return [ent.text+'_'+str(ent[0].i)+'_'+str(ent[0].i+len(ent.text))+'_'+ent.label_ for ent in span.ents]
def show_lowercase_entity_differences(texts):
    for doc in largenlp.pipe(texts):
        ents_orig = get_ent_info(doc)
        lowertext = largenlp(doc.text.lower())
        ents_lower = get_ent_info(lowertext)
        if len(ents_orig) != len(ents_lower):
            print(doc.text)
            print(ents_orig,ents_lower)
        print('-----------------------------')
show_lowercase_entity_differences(overview_texts[0:100])        

This feature-length special consists of three interwoven stories. In a mysterious and remote snowy outpost, Matt and Potter share a Christmas meal, swapping creepy tales of their earlier lives in the outside world. Matt is a charismatic American trying to bring the reserved, secretive Potter out of his shell. But are both men who they appear to be? A woman gets thrust into a nightmarish world of 'smart' gadgetry. Plus a look at what would happen if you could 'block' people in real life.
['three_7_12_CARDINAL', 'Matt_19_23_PERSON', 'Christmas_24_33_DATE', 'Matt_39_43_PERSON', 'American_43_51_NORP'] ['three_7_12_CARDINAL', 'american_43_51_NORP']
-----------------------------
Raj is a rich, carefree, happy-go-lucky second generation NRI. Simran is the daughter of Chaudhary Baldev Singh, who in spite of being an NRI is very strict about adherence to Indian values. Simran has left for India to be married to her childhood fiancé. Raj leaves for India with a mission at his hands, to claim his

-----------------------------
Professional photographer L.B. "Jeff" Jeffries breaks his leg while getting an action shot at an auto race. Confined to his New York apartment, he spends his time looking out of the rear window observing the neighbors. He begins to suspect that a man across the courtyard may have murdered his wife. Jeff enlists the help of his high society fashion-consultant girlfriend Lisa Freemont and his visiting nurse Stella to investigate.
['L.B. "Jeff" Jeffries_2_22_PERSON', 'New York_23_31_GPE', 'Jeff_57_61_PERSON', 'Lisa Freemont_69_82_PERSON', 'Stella_75_81_PERSON'] []
-----------------------------
A former Prohibition-era Jewish gangster returns to the Lower East Side of Manhattan over thirty years later, where he once again must confront the ghosts and regrets of his old life.
['Jewish_5_11_NORP', 'the Lower East Side of Manhattan_9_41_LOC', 'thirty years later_16_34_DATE'] ['jewish_5_11_NORP', 'thirty years later_16_34_DATE']
-----------------------------
In th

A French actress filming an anti-war film in Hiroshima has an affair with a married Japanese architect as they share their differing perspectives on war.
['French_1_7_NORP', 'Hiroshima_10_19_GPE', 'Japanese_17_25_NORP'] ['japanese_17_25_NORP']
-----------------------------
FBI trainee, Clarice Starling ventures into a maximum-security asylum to pick the diseased brain of Hannibal Lecter, a psychiatrist turned homicidal cannibal. Starling needs clues to help her capture a serial killer. but her Faustian relationship with Lecter soon leads to his escape, and now two deranged killers are on the loose.
['FBI_0_3_ORG', 'Clarice Starling_3_19_PERSON', 'Hannibal Lecter_18_33_PERSON', 'Starling_27_35_PERSON', 'Faustian_40_48_PERSON', 'Lecter_43_49_PERSON', 'two_52_55_CARDINAL'] ['two_52_55_CARDINAL']
-----------------------------
-----------------------------
-----------------------------
-----------------------------
-----------------------------
The Pianist is a film adapted from the biograp

In [6]:
#For all the overviews in all the movies, find their entities, and assign them to the lowercase text.
#...this takes about 5 mins to run
def get_ents(span):
    return [(ent[0].i,ent[0].i+len(ent.text),ent.label_) for ent in span.ents]
def make_training_data(texts):
    training_data = []
    for doc in largenlp.pipe(texts):
        ents_orig = get_ents(doc)
        lowertext = doc.text.lower()
        training_data.append((lowertext, {"entities":ents_orig}))
    return training_data
training_data = make_training_data(overview_texts)

In [7]:
print(len(training_data))
training_data[0:100]

27449


[("this feature-length special consists of three interwoven stories. in a mysterious and remote snowy outpost, matt and potter share a christmas meal, swapping creepy tales of their earlier lives in the outside world. matt is a charismatic american trying to bring the reserved, secretive potter out of his shell. but are both men who they appear to be? a woman gets thrust into a nightmarish world of 'smart' gadgetry. plus a look at what would happen if you could 'block' people in real life.",
  {'entities': [(7, 12, 'CARDINAL'),
    (19, 23, 'PERSON'),
    (24, 33, 'DATE'),
    (39, 43, 'PERSON'),
    (43, 51, 'NORP')]}),
 ('raj is a rich, carefree, happy-go-lucky second generation nri. simran is the daughter of chaudhary baldev singh, who in spite of being an nri is very strict about adherence to indian values. simran has left for india to be married to her childhood fiancé. raj leaves for india with a mission at his hands, to claim his lady love under the noses of her whole family. th

In [8]:
training_set = training_data[:20000]
testing_set = training_data[20000:]

In [9]:
#Train the model, which takes about 10 minutes
def train_new_model(training_data):
    nlp = spacy.blank("en")
    optimizer = nlp.begin_training()
    n_iter=20
    for i in range(n_iter):
        print('Epoch:',i+1,'of',n_iter)
        random.shuffle(training_data)
        for text, annotations in training_data:
            nlp.update([text], [annotations], sgd=optimizer)
    nlp.to_disk("tmdb_toy_query_model")

In [10]:
train_new_model(training_set)

Epoch: 0 of 20
Epoch: 1 of 20
Epoch: 2 of 20
Epoch: 3 of 20
Epoch: 4 of 20
Epoch: 5 of 20
Epoch: 6 of 20
Epoch: 7 of 20
Epoch: 8 of 20
Epoch: 9 of 20
Epoch: 10 of 20
Epoch: 11 of 20
Epoch: 12 of 20
Epoch: 13 of 20
Epoch: 14 of 20
Epoch: 15 of 20
Epoch: 16 of 20
Epoch: 17 of 20
Epoch: 18 of 20
Epoch: 19 of 20


### Testing our new model
Load the model from disk that we created, and see if it can recognize lowercase entities (it doesn't, why not?):

In [13]:
#For all the overviews in all the movies, find their entities, and assign them to the lowercase text.
#...this takes about 5 mins to run
def test_lowercase_model(model,data):
    testnlp = spacy.load(model)
    print("Loaded model",model)
    texts = data[0:len(data)]
    for i in range(len(texts)):
        texts[i][1]['i']=i
    total = len(texts)
    print("Total examples:",total)
    yes=0
    no=0
    for doc, context in testnlp.pipe(texts,as_tuples=True):
        if len(doc.ents) == len(context['entities']):
            yes+=1
        else:
            no+=1
    print('Successful:',yes,'(',yes/total,'%)')
    print('Failed:',no,'(',no/total,'%)')
test_lowercase_model('./tmdb_toy_query_model',testing_set)

Loaded model ./tmdb_toy_query_model
Total examples: 7449
Successful: 718 ( 0.09638877701704927 %)
Failed: 6731 ( 0.9036112229829507 %)


In [10]:
querynlp = spacy.load('./tmdb_toy_query_model')
querytext = "boxing revenge in moscow"
print('Large model:',largenlp(querytext).ents)
print('New model:',querynlp(querytext).ents)

Large model: (moscow,)
New model: ()


### Transfer learning

We need to continue training using the existing *en_core_web_lg* model, so that we don't lose everything its already learned!

In [18]:
def prep_transfer_learning(model,training_data):
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
    #Get the ner pipe so we can add labels
    ner = nlp.get_pipe("ner")
    # add labels
    for _, annotations in training_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    return nlp,ner

In [19]:
def train_existing_model(nlp,ner,training_data):
    output_dir='./tmdb_query_model'
    n_iter=20    
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(training_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(training_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print('Epoch:',itn+1,'of',n_iter)
            print('Losses', losses)

    # save model to output directory
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

In [20]:
transnlp,transner = prep_transfer_learning("en_core_web_lg",training_set)

Loaded model 'en_core_web_lg'


In [23]:
train_existing_model(transnlp,transner,training_set[:10000])

Losses {'ner': 56252.97252117867}
Losses {'ner': 56207.940612408944}
Losses {'ner': 55151.57231537759}
Losses {'ner': 55216.83364187772}
Losses {'ner': 54943.65178064932}
Losses {'ner': 54966.76240671674}
Losses {'ner': 55865.07480183536}
Losses {'ner': 55489.659635768956}
Losses {'ner': 54562.96989896403}
Losses {'ner': 55109.67239970684}
Losses {'ner': 55334.162105280586}
Losses {'ner': 54967.77489028198}
Losses {'ner': 55034.96387214062}
Losses {'ner': 54469.56539990203}
Losses {'ner': 55288.294746215324}
Losses {'ner': 55195.8961237864}
Losses {'ner': 54704.64909127942}
Losses {'ner': 56133.93299452445}
Losses {'ner': 55438.86867170918}
Losses {'ner': 54492.018196559184}
Losses {'ner': 54787.59229738169}
Losses {'ner': 54845.52646047443}
Losses {'ner': 55003.23486702081}
Losses {'ner': 54884.91997823828}
Losses {'ner': 54985.74384881838}
Losses {'ner': 54663.25653532629}
Losses {'ner': 54908.935160860885}
Losses {'ner': 54759.01357696948}
Losses {'ner': 55861.84737028328}
Losses {'

In [15]:
test_lowercase_model('./tmdb_query_model',testing_set)

Loaded model ./tmdb_query_model
Total examples: 7449
Successful: 873 ( 0.11719693918646798 %)
Failed: 6576 ( 0.882803060813532 %)


In [14]:
test_lowercase_model('en_core_web_lg',testing_set)

Loaded model en_core_web_lg
Total examples: 7449
Successful: 1529 ( 0.20526245133574977 %)
Failed: 5920 ( 0.7947375486642503 %)


In [54]:
#For all the overviews in all the movies, find their entities, and assign them to the lowercase text.
#...this takes about 5 mins to run
def get_query_ents(span):
    return [(ent[0].i,ent[0].i+len(ent.text),ent.label_) for ent in span.ents]
def make_query_data(texts):
    query_data = []
    for doc in largenlp.pipe(texts):
        for sent in doc.sents:
            predicate = None
            ents_orig = []
            for tok in sent:
                if tok.dep_ == "ROOT":
                    predicate = tok
            for ent in sent.ents:
                for tok in ent:
                    if ent.label_ in ["GPE","PERSON"] and tok.head.text==predicate.text: #and tok.dep_ in ["dobj","attr"]:
                        ents_orig.append(ent)
            if(len(ents_orig)):
                lowertext = predicate.text.lower() # + '|' + sent.text
                query_data.append((lowertext, {"entities":ents_orig}))
    return query_data
query_data = make_query_data(overview_texts)

In [55]:
len(query_data)

20818

In [58]:
random.shuffle(query_data)
query_data

[('crosses', {'entities': [Jack Sparrow]}),
 ('is', {'entities': [Billy Wong]}),
 ('travels', {'entities': [Leo travels, Leo travels]}),
 ('is', {'entities': [Beethoven]}),
 ('watched', {'entities': [Marnie]}),
 ('is', {'entities': [Bernard Quatermass]}),
 ('is', {'entities': [Mary Stuart]}),
 ('thinks', {'entities': [Kitty]}),
 ('germany', {'entities': [Germany]}),
 ('has', {'entities': [Colin]}),
 ('living', {'entities': [Carrie Watts]}),
 ('charmed', {'entities': [Jeff Warren]}),
 ('lost', {'entities': [Grant MacLaine]}),
 ('be', {'entities': [Kurt]}),
 ('describes', {'entities': [Hasselbach]}),
 ('travels', {'entities': [Lee]}),
 ('overpower', {'entities': [Laurent]}),
 ('sends', {'entities': [Gerald, Alan]}),
 ('going', {'entities': [J.J. Horbart]}),
 ('making', {'entities': [Emily Hagins]}),
 ('enters', {'entities': [Almásy]}),
 ('takes', {'entities': [Hussin]}),
 ('find', {'entities': [Riddick]}),
 ('fend', {'entities': [Albert]}),
 ('is', {'entities': [Patience]}),
 ('recover',

## Use Lexico-syntactic patterns

In [29]:
from spacy.matcher import Matcher
matcher = Matcher(largenlp.vocab)
pattern = [{"POS": "VERB"}, {"LABEL": ["GPE","PERSON"], "POS":"PROPN"}, {"LABEL": ["GPE","PERSON"], "POS":"PROPN", "OP":"+"}]
matcher.add("VerbProperNoun", None, pattern)
matches = []
for doc in largenlp.pipe(overview_texts):
    for match in matcher(doc):
        example = [tok.lemma_ for tok in doc[match[1]:match[2]]]
        matches.append(example)
matches

[['calculate', 'Christine', 'Vole'],
 ['name', 'Walter', 'Kurtz'],
 ['retire', 'San', 'Francisco'],
 ['unsuspecte', 'Mr.', 'Dietrichson'],
 ['represent', 'Tom', 'Robinson'],
 ['keep', 'Princess', 'Leia'],
 ['call', 'John', 'Merrick'],
 ['occupy', 'Northern', 'France'],
 ['include', 'Sir', 'Bedevere'],
 ['enter', 'Police', 'Chief'],
 ['enter', 'Police', 'Chief', 'Marge'],
 ['schließt', 'Peppino', 'sich'],
 ['schließt', 'Peppino', 'sich', 'den'],
 ['schließt', 'Peppino', 'sich', 'den', 'Kommunisten'],
 ['find', 'King', 'Kong'],
 ['name', 'Augustus', 'Waters'],
 ['bring', 'Buzz', 'Lightyear'],
 ['capture', 'Eddie', 'Murphy'],
 ['name', 'Cole', 'Sear'],
 ['name', 'Malcolm', 'Crowe'],
 ['flag', 'MV', 'Maersk'],
 ['flag', 'MV', 'Maersk', 'Alabama'],
 ['meet', 'Leonardo', 'da'],
 ['meet', 'Leonardo', 'da', 'Vinci'],
 ['star', 'Libuse', 'Safrankova'],
 ['be', 'Stanley', 'Kubrick'],
 ['include', 'Dutch', 'Engstrom'],
 ['meet', 'Jimmy', 'Malone'],
 ['disturb', 'Blanche', 'DuBois'],
 ['wind', 'Mu