In [1]:
import spacy
import random
import datetime
import json
import math
import dateutil.parser
from spacy.matcher import Matcher

largenlp = spacy.load('en_core_web_lg')

### JSON Content utlities for the TMDB corpus

In [2]:
#Iterate through the movies
def rawTmdbMovies(filename):
    return json.load(open(filename))


def writeTmdmMovies(rawMoviesJson, path):
    with open(path, 'w') as f:
        json.dump(rawMoviesJson, f)

def tmdbMovies(filename="../tmdb.json"):
    tmdbMovies = rawTmdbMovies(filename)
    for movieId, tmdbMovie in tmdbMovies.items():
        yield (movieId, tmdbMovie)

def indexableMovies(filename="../tmdb.json"):
    """ Generates TMDB movies, similar to how ES Bulk indexing
        uses a generator to generate bulk index/update actions """
    for movieId, tmdbMovie in tmdbMovies(filename):
        try:
            releaseDate = None
            if 'release_date' in tmdbMovie and len(tmdbMovie['release_date']) > 0:
                releaseDate = tmdbMovie['release_date'] + 'T00:00:00Z'

            yield {'id': movieId,
                   'title': tmdbMovie['title'],
                   'overview': tmdbMovie['overview'],
                   'tagline': tmdbMovie['tagline'],
                   'directors': [director['name'] for director in tmdbMovie['directors']],
                   'cast': [castMember['name'] for castMember in tmdbMovie['cast']],
                   'genres': [genre['name'] for genre in tmdbMovie['genres']],
                   'release_date': releaseDate,
                   'vote_average': tmdbMovie['vote_average'] if 'vote_average' in tmdbMovie else None,
                   'vote_count': int(tmdbMovie['vote_count']) if 'vote_count' in tmdbMovie else None,
                   'location_entities': tmdbMovie['location_entities'] if 'location_entities' in tmdbMovie else [],
                   'location': tmdbMovie['location'] if 'location' in tmdbMovie else [],
                   'location_city': tmdbMovie['location_city'] if 'location_city' in tmdbMovie else [],
                   'location_state': tmdbMovie['location_state'] if 'location_state' in tmdbMovie else [],
                   'location_country': tmdbMovie['location_country'] if 'location_country' in tmdbMovie else [],
                   }
        except KeyError as k: # Ignore any movies missing these attributes
            continue

### Get the overview texts

We're going to use the overview text to train lowercase entities for our queries.

This is free training data!  We'll take the entities from the true-cased overview text, and apply them to the lower-cased text, and train a new model

In [3]:
def overviews(filename="../tmdb.json"):
    #Get all the overviews
    texts = []
    for movie_id, movie in tmdbMovies(filename=filename):
        if "overview" in movie and isinstance(movie["overview"], str) and len(movie["overview"]):
            texts.append(movie["overview"])
    return texts

In [4]:
overview_texts = overviews()

## Use Lexico-syntactic patterns

Lexico-syntactic patterns use parts-of-speech to look for common structures in text.  For example a verb followed by a proper noun.  They can give good results but are very basic and only work when you know the patterns ahead of time.  For more information on this topic, see 'Hearst Patterns' named after Marti Hearst, who first used the idea in the 1980s to expand early versions of wordnet.

In [17]:
def find_pattern(texts,name,pattern):
    matcher = Matcher(largenlp.vocab)
    matcher.add(name, None, pattern)
    matches = []
    for doc in largenlp.pipe(texts):
        for match in matcher(doc):
            example = [tok.text for tok in doc[match[1]:match[2]]]
            matches.append(example)
    return matches

In [18]:
verbpnouns = find_pattern(overview_texts,"VerbPNouns",[
        {"POS": "VERB"}, 
        {"LABEL": ["GPE","PERSON"]},
        {"LABEL": ["GPE","PERSON"],"OP":"+"}
    ]
)

In [19]:
verbpnouns

[['share', 'a', 'Christmas'],
 ['share', 'a', 'Christmas', 'meal'],
 ['share', 'a', 'Christmas', 'meal', ','],
 ['share', 'a', 'Christmas', 'meal', ',', 'swapping'],
 ['share', 'a', 'Christmas', 'meal', ',', 'swapping', 'creepy'],
 ['share', 'a', 'Christmas', 'meal', ',', 'swapping', 'creepy', 'tales'],
 ['swapping', 'creepy', 'tales'],
 ['share', 'a', 'Christmas', 'meal', ',', 'swapping', 'creepy', 'tales', 'of'],
 ['swapping', 'creepy', 'tales', 'of'],
 ['share',
  'a',
  'Christmas',
  'meal',
  ',',
  'swapping',
  'creepy',
  'tales',
  'of',
  'their'],
 ['swapping', 'creepy', 'tales', 'of', 'their'],
 ['share',
  'a',
  'Christmas',
  'meal',
  ',',
  'swapping',
  'creepy',
  'tales',
  'of',
  'their',
  'earlier'],
 ['swapping', 'creepy', 'tales', 'of', 'their', 'earlier'],
 ['share',
  'a',
  'Christmas',
  'meal',
  ',',
  'swapping',
  'creepy',
  'tales',
  'of',
  'their',
  'earlier',
  'lives'],
 ['swapping', 'creepy', 'tales', 'of', 'their', 'earlier', 'lives'],
 ['sh

In [15]:
pnounsverb = find_pattern(overview_texts,"PNounsVerb",[
        {"LABEL": ["GPE","PERSON"]},
        {"LABEL": ["GPE","PERSON"]},
        {"POS": "VERB"}    
    ]
)

In [16]:
pnounsverb

[['and', 'Potter', 'share'],
 ['meal', ',', 'swap'],
 ['.', 'Matt', 'be'],
 ['charismatic', 'American', 'try'],
 ['try', 'to', 'bring'],
 ['bring', 'the', 'reserve'],
 ['.', 'but', 'be'],
 ['who', '-PRON-', 'appear'],
 ['appear', 'to', 'be'],
 ['a', 'woman', 'get'],
 ['at', 'what', 'would'],
 ['what', 'would', 'happen'],
 ['if', '-PRON-', 'could'],
 ['could', "'", 'block'],
 ['happy', '-', 'go'],
 ['.', 'Simran', 'be'],
 ['spite', 'of', 'be'],
 ['an', 'NRI', 'be'],
 ['.', 'Simran', 'have'],
 ['Simran', 'have', 'leave'],
 ['India', 'to', 'be'],
 ['.', 'Raj', 'leave'],
 [',', 'to', 'claim'],
 ['.', 'thus', 'begin'],
 ['Andy', 'Dufresne', 'begin'],
 ['where', '-PRON-', 'put'],
 ['skill', 'to', 'work'],
 [',', 'Dufresne', 'come'],
 ['come', 'to', 'be'],
 ['to', 'be', 'admire'],
 ['inmate', '--', 'include'],
 ['old', 'prisoner', 'name'],
 ['and', 'Taki', 'be'],
 ['complete', 'stranger', 'live'],
 ['-PRON-', 'suddenly', 'switch'],
 ['.', 'Mitsuha', 'wake'],
 ['bizarre', 'occurrence', 'contin

## Using the Root dependency

First, find the root predicate, then find the entities that are directly referenced by the root.  This can be more reliable than basic pattern matching, when the sentence structure is simple, but doesn't work well for long sentences with lots of modifiers and clauses.

In [10]:
#For all the overviews in all the movies, find their entities, and assign them to the lowercase text.
#...this takes about 5 mins to run
def get_query_ents(span):
    return [(ent[0].i,ent[0].i+len(ent.text),ent.label_) for ent in span.ents]
def root_dependents(texts):
    matches = []
    for doc in largenlp.pipe(texts):
        for sent in doc.sents:
            predicate = None
            ents_orig = []
            for tok in sent:
                if tok.dep_ == "ROOT":
                    predicate = tok
            for ent in sent.ents:
                for tok in ent:
                    if ent.label_ in ["GPE","PERSON"] and tok.head.text==predicate.text:
                        ents_orig.append(ent)
            if(len(ents_orig)):
                lowertext = predicate.text.lower() # + '|' + sent.text
                matches.append((lowertext, {"entities":ents_orig}))
    return matches

In [11]:
predictate_objects = root_dependents(overview_texts)
predictate_objects

[('share', {'entities': [Matt]}),
 ('is', {'entities': [Matt]}),
 ('is', {'entities': [Simran]}),
 ('left', {'entities': [Simran]}),
 ('begins', {'entities': [Andy Dufresne]}),
 ('comes', {'entities': [Dufresne]}),
 ('are', {'entities': [Mitsuha]}),
 ('wakes', {'entities': [Mitsuha]}),
 ('steps', {'entities': [Michael]}),
 ('is', {'entities': [Dangal]}),
 ('is', {'entities': [Oscar]}),
 ('grows', {'entities': [Vito Corleone]}),
 ('attempts', {'entities': [Michael Corleone]}),
 ('face', {'entities': [Sherlock]}),
 ('sets', {'entities': [Jim Gordon]}),
 ('is', {'entities': [Jumpei Niki]}),
 ('inspires', {'entities': [Randle Patrick McMurphy]}),
 ('is', {'entities': [Forrest Gump']}),
 ('is', {'entities': [Darth Vader]}),
 ('thrown', {'entities': [Princess Leia]}),
 ('cursed', {'entities': [Ashitaka]}),
 ('encounters', {'entities': [San, Lady Eboshi]}),
 ('find', {'entities': [Ashitaka]}),
 ('paroled', {'entities': [Derek Vineyard]}),
 ('severs', {'entities': [Derek]}),
 ('breaks', {'enti