In [1]:
import spacy
import pandas
import random
import datetime
import json
import math
import dateutil.parser
from pathlib import Path
from spacy.util import minibatch, compounding

# Load the spaCy 'en_core_web_lg' pipeline once at module level;
# make_training_data() below streams the overview texts through it.
largenlp = spacy.load('en_core_web_lg')

### JSON Content utilities for the TMDB corpus

In [2]:
#Iterate through the movies
def rawTmdbMovies(filename):
    """Load the JSON document stored at ``filename`` and return it parsed.

    Args:
        filename: path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's content.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    # Use a context manager so the handle is closed deterministically,
    # instead of relying on garbage collection of an anonymous file object.
    with open(filename) as f:
        return json.load(f)


def writeTmdmMovies(rawMoviesJson, path):
    """Serialize ``rawMoviesJson`` as JSON into the file at ``path``.

    The file is opened in text write mode, so existing content is replaced.
    """
    with open(path, 'w') as sink:
        json.dump(rawMoviesJson, fp=sink)

def tmdbMovies(filename="../tmdb.json"):
    """Lazily yield ``(movieId, movie)`` pairs from the TMDB JSON file.

    The file is read via ``rawTmdbMovies`` when iteration starts, and its
    top-level mapping is walked with ``.items()``.
    """
    yield from rawTmdbMovies(filename).items()

def indexableMovies(filename="../tmdb.json"):
    """ Generates TMDB movies, similar to how ES Bulk indexing
        uses a generator to generate bulk index/update actions.

        Each yielded dict carries the movie id, its title/overview/tagline,
        the names of its directors, cast and genres, a timestamp-style
        release date, the vote fields and the optional location fields.

        Movies missing any of the required keys (title, overview, tagline,
        directors, cast, genres, or a 'name' inside those lists) are skipped.
        Optional fields fall back to None (release_date, vote_*) or []
        (location*). """
    for movieId, tmdbMovie in tmdbMovies(filename):
        try:
            # A missing, null or empty release date all map to None. The
            # previous len() check raised TypeError on a JSON null, which the
            # KeyError handler below does not catch.
            rawDate = tmdbMovie.get('release_date')
            releaseDate = rawDate + 'T00:00:00Z' if rawDate else None

            # Likewise, int(None) would raise TypeError on a null vote_count.
            voteCount = tmdbMovie.get('vote_count')

            doc = {'id': movieId,
                   'title': tmdbMovie['title'],
                   'overview': tmdbMovie['overview'],
                   'tagline': tmdbMovie['tagline'],
                   'directors': [director['name'] for director in tmdbMovie['directors']],
                   'cast': [castMember['name'] for castMember in tmdbMovie['cast']],
                   'genres': [genre['name'] for genre in tmdbMovie['genres']],
                   'release_date': releaseDate,
                   'vote_average': tmdbMovie.get('vote_average'),
                   'vote_count': int(voteCount) if voteCount is not None else None,
                   'location_entities': tmdbMovie.get('location_entities', []),
                   'location': tmdbMovie.get('location', []),
                   'location_city': tmdbMovie.get('location_city', []),
                   'location_state': tmdbMovie.get('location_state', []),
                   'location_country': tmdbMovie.get('location_country', []),
                   }
        except KeyError: # Ignore any movies missing these attributes
            continue
        yield doc

### Get the overview texts

We're going to use the overview text to train lowercase entities for our queries.

This is free training data! We'll take the entities from the true-cased overview text, apply them to the lower-cased text, and train a new model.

In [3]:
def overviews(filename="../tmdb.json"):
    """Return the overview text of every movie that has a non-empty one.

    A movie contributes only when it has an "overview" key whose value is
    a ``str`` of non-zero length; order follows ``tmdbMovies``.
    """
    def _has_overview(record):
        # Present, a string, and not empty.
        return ("overview" in record
                and isinstance(record["overview"], str)
                and len(record["overview"]) > 0)

    return [record["overview"]
            for _movie_id, record in tmdbMovies(filename=filename)
            if _has_overview(record)]

In [4]:
# Collect every non-empty overview string from the default ../tmdb.json corpus.
overview_texts = overviews()

In [6]:
#For all the overviews in all the movies, find their entities, and assign them to the lowercase text.
#...this takes about 5 mins to run
def make_training_data(texts):
    """Build a per-sentence label table from ``texts``.

    Every text is run through the module-level ``largenlp`` pipeline and
    split into sentences. Each sentence is stored lower-cased alongside two
    0/1 flags derived from the entities found in the original sentence:
    ``islocation`` is 1 when at least one entity is labelled "GPE", and
    ``isperson`` is 1 when at least one is labelled "PERSON".

    Returns:
        pandas.DataFrame with columns ``islocation``, ``isperson`` and
        ``sentence``, one row per sentence.
    """
    lowered = []
    has_gpe = []
    has_person = []
    for parsed in largenlp.pipe(texts):
        for span in parsed.sents:
            lowered.append(span.text.lower())
            # Only presence matters, so collapse the entity labels to a set.
            found = {entity.label_ for entity in span.ents}
            has_gpe.append(int("GPE" in found))
            has_person.append(int("PERSON" in found))
    return pandas.DataFrame({
        'islocation': has_gpe,
        'isperson': has_person,
        'sentence': lowered,
    })
# Build the labelled sentence table from all overview texts (the slow step noted above).
training_data = make_training_data(overview_texts)

In [7]:
# Report how many labelled sentences were produced, then preview the first 100 rows.
print(len(training_data))
training_data[0:100]

76029


Unnamed: 0,islocation,isperson,sentence
0,0,0,this feature-length special consists of three ...
1,0,1,"in a mysterious and remote snowy outpost, matt..."
2,0,1,matt is a charismatic american trying to bring...
3,0,0,but are both men who they appear to be?
4,0,0,a woman gets thrust into a nightmarish world o...
5,0,0,plus a look at what would happen if you could ...
6,0,0,"raj is a rich, carefree, happy-go-lucky second..."
7,0,1,simran is the daughter of chaudhary baldev sin...
8,1,1,simran has left for india to be married to her...
9,1,0,raj leaves for india with a mission at his han...


In [8]:
# Persist the labelled sentences as CSV, leaving out the DataFrame index column.
training_data.to_csv('spacy_lowercase_entity_labels.csv',index=False)