In [1]:
import spacy
import random
import datetime
import json
import math
import dateutil.parser

nlp = spacy.load('en_core_web_lg') #Use the large model, it works the best

### JSON Content utlities for the TMDB corpus

In [2]:
#Iterate through the movies
def rawTmdbMovies(filename):
    return json.load(open(filename))


def writeTmdmMovies(rawMoviesJson, path):
    with open(path, 'w') as f:
        json.dump(rawMoviesJson, f)

def tmdbMovies(filename="../tmdb.json"):
    tmdbMovies = rawTmdbMovies(filename)
    for movieId, tmdbMovie in tmdbMovies.items():
        yield (movieId, tmdbMovie)

def indexableMovies(filename="../tmdb.json"):
    """ Generates TMDB movies, similar to how ES Bulk indexing
        uses a generator to generate bulk index/update actions """
    for movieId, tmdbMovie in tmdbMovies(filename):
        try:
            releaseDate = None
            if 'release_date' in tmdbMovie and len(tmdbMovie['release_date']) > 0:
                releaseDate = tmdbMovie['release_date'] + 'T00:00:00Z'

            yield {'id': movieId,
                   'title': tmdbMovie['title'],
                   'overview': tmdbMovie['overview'],
                   'tagline': tmdbMovie['tagline'],
                   'directors': [director['name'] for director in tmdbMovie['directors']],
                   'cast': [castMember['name'] for castMember in tmdbMovie['cast']],
                   'genres': [genre['name'] for genre in tmdbMovie['genres']],
                   'release_date': releaseDate,
                   'vote_average': tmdbMovie['vote_average'] if 'vote_average' in tmdbMovie else None,
                   'vote_count': int(tmdbMovie['vote_count']) if 'vote_count' in tmdbMovie else None,
                   'location_entities': tmdbMovie['location_entities'] if 'location_entities' in tmdbMovie else [],
                   'location': tmdbMovie['location'] if 'location' in tmdbMovie else [],
                   'location_city': tmdbMovie['location_city'] if 'location_city' in tmdbMovie else [],
                   'location_state': tmdbMovie['location_state'] if 'location_state' in tmdbMovie else [],
                   'location_country': tmdbMovie['location_country'] if 'location_country' in tmdbMovie else [],
                   }
        except KeyError as k: # Ignore any movies missing these attributes
            continue

### Get the overview texts

We're going to use the overview text to train lowercase entities for our queries.

This is free training data!  We'll take the entities from the true-cased overview text, and apply them to the lower-cased text, and train a new model

In [4]:
def overviews(filename="../tmdb.json"):
    #Get all the overviews
    texts = []
    for movie_id, movie in tmdbMovies(filename=filename):
        if "overview" in movie and isinstance(movie["overview"], str) and len(movie["overview"]):
            texts.append(movie["overview"])
    return texts

In [7]:
overview_texts = overviews()

In [23]:
#For all the overviews in all the movies, find their entities, and test if they are found when lowercased
def get_ent_info(span):
    return [ent.text+'_'+str(ent.start)+'_'+str(ent.end)+'_'+ent.label_ for ent in span.ents]
def show_lowercase_entity_differences(texts):
    for doc in nlp.pipe(texts):
        ents_orig = get_ent_info(doc)
        lowertext = nlp(doc.text.lower())
        ents_lower = get_ent_info(lowertext)
        if len(ents_orig) != len(ents_lower):
            print(doc.text)
            print(ents_orig,ents_lower)
        print('-----------------------------')
show_lowercase_entity_differences(overview_texts[0:100])        

This feature-length special consists of three interwoven stories. In a mysterious and remote snowy outpost, Matt and Potter share a Christmas meal, swapping creepy tales of their earlier lives in the outside world. Matt is a charismatic American trying to bring the reserved, secretive Potter out of his shell. But are both men who they appear to be? A woman gets thrust into a nightmarish world of 'smart' gadgetry. Plus a look at what would happen if you could 'block' people in real life.
['three_7_8_CARDINAL', 'Matt_19_20_PERSON', 'Christmas_24_25_DATE', 'Matt_39_40_PERSON', 'American_43_44_NORP'] ['three_7_8_CARDINAL', 'american_43_44_NORP']
-----------------------------
Raj is a rich, carefree, happy-go-lucky second generation NRI. Simran is the daughter of Chaudhary Baldev Singh, who in spite of being an NRI is very strict about adherence to Indian values. Simran has left for India to be married to her childhood fiancé. Raj leaves for India with a mission at his hands, to claim his l

-----------------------------
-----------------------------
Professional photographer L.B. "Jeff" Jeffries breaks his leg while getting an action shot at an auto race. Confined to his New York apartment, he spends his time looking out of the rear window observing the neighbors. He begins to suspect that a man across the courtyard may have murdered his wife. Jeff enlists the help of his high society fashion-consultant girlfriend Lisa Freemont and his visiting nurse Stella to investigate.
['L.B. "Jeff" Jeffries_2_7_PERSON', 'New York_23_25_GPE', 'Jeff_57_58_PERSON', 'Lisa Freemont_69_71_PERSON', 'Stella_75_76_PERSON'] []
-----------------------------
A former Prohibition-era Jewish gangster returns to the Lower East Side of Manhattan over thirty years later, where he once again must confront the ghosts and regrets of his old life.
['Jewish_5_6_NORP', 'the Lower East Side of Manhattan_9_15_LOC', 'thirty years later_16_19_DATE'] ['jewish_5_6_NORP', 'thirty years later_16_19_DATE']
--------

1930s Korea, in the period of Japanese occupation, a new girl (Sookee) is hired as a handmaiden to a Japanese heiress (Hideko) who lives a secluded life on a large countryside estate with her domineering Uncle (Kouzuki). But the maid has a secret. She is a pickpocket recruited by a swindler posing as a Japanese Count to help him seduce the Lady to elope with him, rob her of her fortune, and lock her up in a madhouse. The plan seems to proceed according to plan until Sookee and Hideko discover some unexpected emotions.
['Korea_1_2_GPE', 'Japanese_7_8_NORP', 'Sookee_14_15_NORP', 'Japanese_23_24_NORP', 'Uncle_41_42_PERSON', 'Kouzuki_43_44_PERSON', 'Japanese_64_65_NORP', 'Sookee and Hideko_100_103_FAC'] ['1930s_0_1_DATE', 'japanese_7_8_NORP', 'japanese_23_24_NORP', 'japanese_64_65_NORP']
-----------------------------
A French actress filming an anti-war film in Hiroshima has an affair with a married Japanese architect as they share their differing perspectives on war.
['French_1_2_NORP', '

In [40]:
#For all the overviews in all the movies, find their entities, and assign them to the lowercase text.
#...this takes about 5 mins to run
def get_ents(span):
    return [(ent.start,ent.end,ent.label_) for ent in span.ents]
def make_training_data(texts):
    training_data = []
    for doc in nlp.pipe(texts):
        ents_orig = get_ents(doc)
        lowertext = doc.text.lower()
        training_data.append((lowertext, {"entities":ents_orig}))
    return training_data
training_data = make_training_data(overview_texts)

In [41]:
training_data[0:100]

[("this feature-length special consists of three interwoven stories. in a mysterious and remote snowy outpost, matt and potter share a christmas meal, swapping creepy tales of their earlier lives in the outside world. matt is a charismatic american trying to bring the reserved, secretive potter out of his shell. but are both men who they appear to be? a woman gets thrust into a nightmarish world of 'smart' gadgetry. plus a look at what would happen if you could 'block' people in real life.",
  {'entities': [(7, 8, 'CARDINAL'),
    (19, 20, 'PERSON'),
    (24, 25, 'DATE'),
    (39, 40, 'PERSON'),
    (43, 44, 'NORP')]}),
 ('raj is a rich, carefree, happy-go-lucky second generation nri. simran is the daughter of chaudhary baldev singh, who in spite of being an nri is very strict about adherence to indian values. simran has left for india to be married to her childhood fiancé. raj leaves for india with a mission at his hands, to claim his lady love under the noses of her whole family. thu

In [42]:
def train_model(training_data):
    nlp = spacy.blank("en")
    optimizer = nlp.begin_training()
    for i in range(20):
        random.shuffle(training_data)
        for text, annotations in training_data:
            nlp.update([text], [annotations], sgd=optimizer)
    nlp.to_disk("tmdb_query_model")

In [None]:
train_model(training_data)