In [1]:
import spacy
import random
import datetime
import json
import math
import dateutil.parser
from pathlib import Path
from spacy.util import minibatch, compounding

largenlp = spacy.load('en_core_web_lg')

### JSON Content utlities for the TMDB corpus

In [2]:
#Iterate through the movies
def rawTmdbMovies(filename):
    return json.load(open(filename))


def writeTmdmMovies(rawMoviesJson, path):
    with open(path, 'w') as f:
        json.dump(rawMoviesJson, f)

def tmdbMovies(filename="../tmdb.json"):
    tmdbMovies = rawTmdbMovies(filename)
    for movieId, tmdbMovie in tmdbMovies.items():
        yield (movieId, tmdbMovie)

def indexableMovies(filename="../tmdb.json"):
    """ Generates TMDB movies, similar to how ES Bulk indexing
        uses a generator to generate bulk index/update actions """
    for movieId, tmdbMovie in tmdbMovies(filename):
        try:
            releaseDate = None
            if 'release_date' in tmdbMovie and len(tmdbMovie['release_date']) > 0:
                releaseDate = tmdbMovie['release_date'] + 'T00:00:00Z'

            yield {'id': movieId,
                   'title': tmdbMovie['title'],
                   'overview': tmdbMovie['overview'],
                   'tagline': tmdbMovie['tagline'],
                   'directors': [director['name'] for director in tmdbMovie['directors']],
                   'cast': [castMember['name'] for castMember in tmdbMovie['cast']],
                   'genres': [genre['name'] for genre in tmdbMovie['genres']],
                   'release_date': releaseDate,
                   'vote_average': tmdbMovie['vote_average'] if 'vote_average' in tmdbMovie else None,
                   'vote_count': int(tmdbMovie['vote_count']) if 'vote_count' in tmdbMovie else None,
                   'location_entities': tmdbMovie['location_entities'] if 'location_entities' in tmdbMovie else [],
                   'location': tmdbMovie['location'] if 'location' in tmdbMovie else [],
                   'location_city': tmdbMovie['location_city'] if 'location_city' in tmdbMovie else [],
                   'location_state': tmdbMovie['location_state'] if 'location_state' in tmdbMovie else [],
                   'location_country': tmdbMovie['location_country'] if 'location_country' in tmdbMovie else [],
                   }
        except KeyError as k: # Ignore any movies missing these attributes
            continue

### Get the overview texts

We're going to use the overview text to train lowercase entities for our queries.

This is free training data!  We'll take the entities from the true-cased overview text, and apply them to the lower-cased text, and train a new model

In [3]:
def overviews(filename="../tmdb.json"):
    #Get all the overviews
    texts = []
    for movie_id, movie in tmdbMovies(filename=filename):
        if "overview" in movie and isinstance(movie["overview"], str) and len(movie["overview"]):
            texts.append(movie["overview"])
    return texts

In [4]:
overview_texts = overviews()

In [5]:
#For all the overviews in all the movies, find their entities, and test if they are found when lowercased
def get_ent_info(span):
    return [ent.text+'_'+str(ent[0].i)+'_'+str(ent[0].i+len(ent.text))+'_'+ent.label_ for ent in span.ents]
def show_lowercase_entity_differences(texts):
    for doc in largenlp.pipe(texts):
        ents_orig = get_ent_info(doc)
        lowertext = largenlp(doc.text.lower())
        ents_lower = get_ent_info(lowertext)
        if len(ents_orig) != len(ents_lower):
            print(doc.text)
            print(ents_orig,ents_lower)
        print('-----------------------------')
show_lowercase_entity_differences(overview_texts[0:100])        

This feature-length special consists of three interwoven stories. In a mysterious and remote snowy outpost, Matt and Potter share a Christmas meal, swapping creepy tales of their earlier lives in the outside world. Matt is a charismatic American trying to bring the reserved, secretive Potter out of his shell. But are both men who they appear to be? A woman gets thrust into a nightmarish world of 'smart' gadgetry. Plus a look at what would happen if you could 'block' people in real life.
['three_7_12_CARDINAL', 'Matt_19_23_PERSON', 'Christmas_24_33_DATE', 'Matt_39_43_PERSON', 'American_43_51_NORP'] ['three_7_12_CARDINAL', 'american_43_51_NORP']
-----------------------------
Raj is a rich, carefree, happy-go-lucky second generation NRI. Simran is the daughter of Chaudhary Baldev Singh, who in spite of being an NRI is very strict about adherence to Indian values. Simran has left for India to be married to her childhood fiancé. Raj leaves for India with a mission at his hands, to claim his

-----------------------------
-----------------------------
Professional photographer L.B. "Jeff" Jeffries breaks his leg while getting an action shot at an auto race. Confined to his New York apartment, he spends his time looking out of the rear window observing the neighbors. He begins to suspect that a man across the courtyard may have murdered his wife. Jeff enlists the help of his high society fashion-consultant girlfriend Lisa Freemont and his visiting nurse Stella to investigate.
['L.B. "Jeff" Jeffries_2_22_PERSON', 'New York_23_31_GPE', 'Jeff_57_61_PERSON', 'Lisa Freemont_69_82_PERSON', 'Stella_75_81_PERSON'] []
-----------------------------
A former Prohibition-era Jewish gangster returns to the Lower East Side of Manhattan over thirty years later, where he once again must confront the ghosts and regrets of his old life.
['Jewish_5_11_NORP', 'the Lower East Side of Manhattan_9_41_LOC', 'thirty years later_16_34_DATE'] ['jewish_5_11_NORP', 'thirty years later_16_34_DATE']
-----

1930s Korea, in the period of Japanese occupation, a new girl (Sookee) is hired as a handmaiden to a Japanese heiress (Hideko) who lives a secluded life on a large countryside estate with her domineering Uncle (Kouzuki). But the maid has a secret. She is a pickpocket recruited by a swindler posing as a Japanese Count to help him seduce the Lady to elope with him, rob her of her fortune, and lock her up in a madhouse. The plan seems to proceed according to plan until Sookee and Hideko discover some unexpected emotions.
['Korea_1_6_GPE', 'Japanese_7_15_NORP', 'Sookee_14_20_NORP', 'Japanese_23_31_NORP', 'Uncle_41_46_PERSON', 'Kouzuki_43_50_PERSON', 'Japanese_64_72_NORP', 'Sookee and Hideko_100_117_FAC'] ['1930s_0_5_DATE', 'japanese_7_15_NORP', 'japanese_23_31_NORP', 'japanese_64_72_NORP']
-----------------------------
A French actress filming an anti-war film in Hiroshima has an affair with a married Japanese architect as they share their differing perspectives on war.
['French_1_7_NORP',

In [6]:
#For all the overviews in all the movies, find their entities, and assign them to the lowercase text.
#...this takes about 5 mins to run
def get_ents(span):
    return [(ent[0].i,ent[0].i+len(ent.text),ent.label_) for ent in span.ents]
def make_training_data(texts):
    training_data = []
    for doc in largenlp.pipe(texts):
        ents_orig = get_ents(doc)
        lowertext = doc.text.lower()
        training_data.append((lowertext, {"entities":ents_orig}))
    return training_data
training_data = make_training_data(overview_texts)

In [8]:
print(len(training_data))
training_data[0:100]

27449


[("this feature-length special consists of three interwoven stories. in a mysterious and remote snowy outpost, matt and potter share a christmas meal, swapping creepy tales of their earlier lives in the outside world. matt is a charismatic american trying to bring the reserved, secretive potter out of his shell. but are both men who they appear to be? a woman gets thrust into a nightmarish world of 'smart' gadgetry. plus a look at what would happen if you could 'block' people in real life.",
  {'entities': [(7, 12, 'CARDINAL'),
    (19, 23, 'PERSON'),
    (24, 33, 'DATE'),
    (39, 43, 'PERSON'),
    (43, 51, 'NORP')]}),
 ('raj is a rich, carefree, happy-go-lucky second generation nri. simran is the daughter of chaudhary baldev singh, who in spite of being an nri is very strict about adherence to indian values. simran has left for india to be married to her childhood fiancé. raj leaves for india with a mission at his hands, to claim his lady love under the noses of her whole family. th

In [10]:
training_set = training_data[:20000]
testing_set = training_data[20000:]

In [11]:
#Train the model, which takes about 10 minutes
def train_new_model(training_data):
    nlp = spacy.blank("en")
    optimizer = nlp.begin_training()
    n_iter=20
    for i in range(n_iter):
        print('Epoch:',i,'of',n_iter)
        random.shuffle(training_data)
        for text, annotations in training_data:
            nlp.update([text], [annotations], sgd=optimizer)
    nlp.to_disk("tmdb_toy_query_model")

In [12]:
train_new_model(training_set)

### Testing our new model
Load the model from disk that we created, and see if it can recognize lowercase entities (it doesn't, why not?):

In [16]:
#For all the overviews in all the movies, find their entities, and assign them to the lowercase text.
#...this takes about 5 mins to run
def test_lowercase_model(model,data):
    nlp = spacy.load(model)
    print("Model loaded.")
    texts = data[0:len(data)]
    for i in range(len(texts)):
        texts[i][1]['i']=i
    total = len(texts)
    print("Total examples:",total)
    yes=0
    no=0
    for doc, context in largenlp.pipe(texts,as_tuples=True):
        if len(doc.ents) == len(context['entities']):
            yes+=1
        else:
            no+=1
    print('Successful:',yes,'(',yes/total,'%)')
    print('Failed:',no,'(',no/total,'%)')
test_lowercase_model('./tmdb_toy_query_model',testing_set)

Model loaded.
Total examples: 7449
Successful: 1529 ( 0.20526245133574977 %)
Failed: 5920 ( 0.7947375486642503 %)


In [17]:
querynlp = spacy.load('./tmdb_toy_query_model')
querytext = "boxing revenge in moscow"
print('Large model:',largenlp(querytext).ents)
print('New model:',querynlp(querytext).ents)

Large model: (moscow,)
New model: ()


### Transfer learning

We need to continue training using the existing *en_core_web_lg* model, so that we don't lose everything its already learned!

In [22]:
def prep_transfer_learning(model,training_data):
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
    #Get the ner pipe so we can add labels
    ner = nlp.get_pipe("ner")
    # add labels
    for _, annotations in training_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    return nlp,ner

In [23]:
def train_existing_model(nlp,ner,training_data):
    output_dir="./tmdb_query_model"
    n_iter=20    
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(training_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(training_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # save model to output directory
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

In [24]:
transnlp,transner = prep_transfer_learning("en_core_web_lg",training_set)

Loaded model 'en_core_web_lg'


In [26]:
train_existing_model(transnlp,transner,training_set[0:10000])

Losses {'ner': 56029.58138613978}
Losses {'ner': 56195.976643764276}
Losses {'ner': 55411.140806146934}
Losses {'ner': 55533.25823736716}
Losses {'ner': 54933.09801269774}
Losses {'ner': 55623.70507423552}
Losses {'ner': 54717.66931052346}


KeyboardInterrupt: 

In [None]:
test_lowercase_model('./tmdb_query_model',testing_set)