In [1]:
import datetime
import json
import math
import urllib.parse

import dateutil.parser
import pysolr
import requests
import spacy

# Load the spaCy pipeline once at module level; extract_chunked_locations()
# calls this shared `nlp` object for every text it processes.
nlp = spacy.load('en_core_web_lg') #Use the large model, it works the best

### Utilities

In [2]:
#Deduplicates a list, keeping the first occurrence of each item in its original position
def dedup(arr):
    """Return the distinct items of ``arr`` as a new list.

    Items are kept in first-seen order (dict keys preserve insertion order),
    so the same input always produces the same output. Going through ``set``
    returned the items in an arbitrary order that can differ between runs.

    Args:
        arr: Iterable of hashable items.

    Returns:
        list: The unique items of ``arr`` in order of first appearance.
    """
    return list(dict.fromkeys(arr))

#Snap a geocoded location to its nearest whole-number lat/lng pair
def near(loc):
    """Return ``(lat, lng)`` of ``loc["latLng"]``, each rounded with ``round()``.

    Args:
        loc: Mapping with a "latLng" entry holding "lat" and "lng" numbers.

    Returns:
        tuple: The rounded latitude followed by the rounded longitude.
    """
    coords = loc["latLng"]
    rounded_lat = round(coords["lat"])
    rounded_lng = round(coords["lng"])
    return rounded_lat, rounded_lng

#Builds a normalized City|State|Country string from a mapquest geocoded location
def norm(loc):
    """Normalize a geocoded location into a "City|State|Country" string.

    The adminArea0..adminArea7 fields are scanned; whenever the matching
    "adminArea<N>Type" value is "City", "State" or "Country", that field is
    remembered for the type (a higher N replaces a lower one). The non-empty
    values are then joined with "|" in City, State, Country order and every
    space is replaced by "_".

    Args:
        loc: Geocoded location dictionary.

    Returns:
        str: The normalized string, or "_UNKNOWN_" when no part was found.
    """
    field_for_type = {"City": "", "State": "", "Country": ""}
    for level in range(8):
        area_key = "adminArea" + str(level)
        type_key = area_key + "Type"
        if type_key in loc and loc[type_key] in field_for_type:
            field_for_type[loc[type_key]] = area_key

    # Keep only the parts that are present in loc and non-empty.
    parts = [
        loc[field]
        for field in field_for_type.values()
        if field in loc and len(loc[field])
    ]
    joined = "|".join(parts) if parts else "_UNKNOWN_"
    return joined.replace(" ", "_")

# Builds a City, State, Country dictionary from a mapquest geocoded location
def city_state_country(loc):
    """Extract the city, state and country names from a geocoded location.

    The adminArea0..adminArea7 fields are scanned; whenever the matching
    "adminArea<N>Type" value is "City", "State" or "Country", that field is
    remembered for the type (a higher N replaces a lower one).

    Args:
        loc: Geocoded location dictionary.

    Returns:
        dict: Keys "City", "State" and "Country"; a part that is missing or
        empty in ``loc`` is returned as "".
    """
    wanted_types = ("City", "State", "Country")
    field_for_type = dict.fromkeys(wanted_types, "")
    for level in range(8):
        area_key = "adminArea" + str(level)
        type_key = area_key + "Type"
        if type_key in loc and loc[type_key] in field_for_type:
            field_for_type[loc[type_key]] = area_key

    result = {}
    for kind in wanted_types:
        field = field_for_type[kind]
        has_value = field in loc and len(loc[field])
        result[kind] = loc[field] if has_value else ""
    return result

### Makes requests to the Mapquest service

In [3]:
#MapQuest Geocode service.  More info: https://developer.mapquest.com/documentation/geocoding-api/
# SECURITY NOTE(review): the API key is hard-coded in this URL, so it is exposed to
# anyone who can read the notebook; consider loading it from the environment instead.
mapquest_address_url="https://www.mapquestapi.com/geocoding/v1/address?key=AuqdPFEWYhm7rZRN5hX5HeWSKgaO2u7d&location="
def geocode(text):
    """Geocode one free-text location with the MapQuest address endpoint.

    Args:
        text: Location string to look up.

    Returns:
        dict: The first entry of ``results[0]["locations"]`` in the JSON response.

    Raises:
        requests.HTTPError: If the service answers with an HTTP error status.
        requests.Timeout: If the service does not answer within the timeout.
        KeyError, IndexError: If the response contains no result/location.
    """
    # Percent-encode the text so characters such as '&', '#' or '+' cannot
    # truncate or alter the query string.
    url = mapquest_address_url + urllib.parse.quote(text, safe="")
    # Without a timeout a stalled connection would block the notebook indefinitely.
    req = requests.get(url, timeout=30)
    req.raise_for_status()
    jsn = req.json()
    loc = jsn["results"][0]["locations"][0]
    return loc

# Send batches to the MapQuest Geocode API in up to 100 locations at a time
# Saves on API monthly rate (15000 calls per month in the free tier)
# SECURITY NOTE(review): the API key is hard-coded below, so it is exposed to anyone
# who can read the notebook; consider loading it from the environment instead.
#mapquest_batch_url="http://www.mapquestapi.com/geocoding/v1/batch?&inFormat=json&outFormat=json&key=AuqdPFEWYhm7rZRN5hX5HeWSKgaO2u7d"
def geocode_batch(batch):
    """Geocode a batch of movie locations with one MapQuest batch request.

    Args:
        batch: Iterable of dicts, each with an "id" (movie id) and a
            "location" (text to geocode).

    Returns:
        dict: Maps every movie id in ``batch`` to the list of geocoded
        location dicts found for it (possibly empty). Results that resolved
        only to the country "US" are dropped.

    Raises:
        requests.HTTPError: If the service answers with an HTTP error status.
        requests.Timeout: If the service does not answer within the timeout.
        KeyError: If the JSON response has no "results" entry.
    """
    reverse_lookup = {}
    response = {}

    #Create reverse lookup table for the API response
    for movie in batch:
        movie_id = movie["id"]
        location = movie["location"]
        response.setdefault(movie_id, [])
        reverse_lookup.setdefault(location, []).append(movie_id)

    # Nothing to look up: do not spend an API call on an empty request.
    if not reverse_lookup:
        return response

    # https keeps the API key out of clear-text traffic.
    mapquest_batch_url="https://www.mapquestapi.com/geocoding/v1/batch?key=AuqdPFEWYhm7rZRN5hX5HeWSKgaO2u7d&maxResults=1&thumbMaps=false"
    # Each distinct location is sent once (its result is fanned out to every
    # movie that mentioned it) and percent-encoded so that characters such as
    # '&' or '#' cannot break the query string.
    for location in reverse_lookup:
        mapquest_batch_url += "&location=" + urllib.parse.quote(location, safe="")
    req = requests.get(mapquest_batch_url, timeout=60)
    req.raise_for_status()
    jsn = req.json()

    # Map the API location results to the movie IDs
    for result in jsn["results"]:
        locations = result.get("locations")
        if "providedLocation" not in result or not locations:
            continue
        provided = result["providedLocation"]["location"]
        location = locations[0]
        # Skip matches that are only the country "US"; .get() tolerates
        # results that lack one of the two fields.
        if location.get("geocodeQuality") == "COUNTRY" and location.get("adminArea1") == "US":
            continue
        if provided in reverse_lookup:
            for movie_id in reverse_lookup[provided]:
                response[movie_id].append(location)
        else:
            print("Not found",provided)

    return response

### Get entities with SpaCy

In [4]:
# Chunk proper noun GPEs into single locations with a small finite state machine
# This had the best F1 score from our tests in the test_location_entities notebook
def extract_chunked_locations(text):
    """Extract location strings from ``text`` using the module-level ``nlp`` pipeline.

    Within each sentence, a run of tokens tagged PROPN with entity type GPE is
    joined with single spaces into one location. A comma inside a run keeps the
    run open but is not included in the result. A run ends at the first other
    token or at the end of the sentence.

    Args:
        text: The text to analyse.

    Returns:
        list: The location strings in order of appearance (duplicates kept).
    """
    locations = []
    for sentence in nlp(text).sents:
        chunk = []
        in_gpe = False
        for token in sentence:
            is_gpe_token = token.pos_ == 'PROPN' and token.ent_type_ == 'GPE'
            is_joining_comma = in_gpe and token.text == ','
            if is_gpe_token or is_joining_comma:
                in_gpe = True
                if token.text != ',':
                    chunk.append(token.text)
                continue
            # Any other token closes the run that was open, if there was one.
            if in_gpe:
                locations.append(' '.join(chunk))
                chunk = []
            in_gpe = False
        # Flush a run that reached the end of the sentence.
        if chunk:
            locations.append(' '.join(chunk))
    return locations

### JSON Content utilities for the TMDB corpus

In [5]:
#Load the raw TMDB movies from a JSON file
def rawTmdbMovies(filename):
    """Parse the JSON file ``filename`` and return its content.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, and it is read as UTF-8 rather than the platform default
    encoding.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON document.
    """
    with open(filename, encoding="utf-8") as f:
        return json.load(f)


def writeTmdmMovies(rawMoviesJson, path):
    """Write ``rawMoviesJson`` to ``path`` as JSON, replacing any existing file.

    Args:
        rawMoviesJson: JSON-serializable object to store.
        path: Destination file path.
    """
    with open(path, 'w') as out_file:
        json.dump(rawMoviesJson, fp=out_file)

def tmdbMovies(filename="../tmdb.json"):
    """Yield ``(movieId, tmdbMovie)`` pairs from the JSON file ``filename``.

    The file is loaded with rawTmdbMovies() when iteration starts, and its
    top-level items are yielded in order.

    Args:
        filename: Path of the movies JSON file.
    """
    yield from rawTmdbMovies(filename).items()

def indexableMovies(filename="../tmdb.json"):
    """ Generates TMDB movies, similar to how ES Bulk indexing
        uses a generator to generate bulk index/update actions

    Each yielded dict has the keys id, title, overview, tagline, directors,
    cast, genres, release_date, vote_average, vote_count, location_entities,
    location, location_city, location_state and location_country.

    A movie missing any of title, overview, tagline, directors, cast or genres
    (or a "name" inside the last three) is skipped. A release_date that is
    missing, null or empty, and a vote_count that is missing or null, are
    yielded as None instead of raising TypeError.

    Args:
        filename: Path of the movies JSON file read through tmdbMovies().
    """
    for movieId, tmdbMovie in tmdbMovies(filename):
        try:
            # "YYYY-MM-DD" -> "YYYY-MM-DDT00:00:00Z"; None when there is no date.
            releaseDate = None
            rawReleaseDate = tmdbMovie.get('release_date')
            if rawReleaseDate:
                releaseDate = rawReleaseDate + 'T00:00:00Z'

            voteCount = tmdbMovie.get('vote_count')

            yield {'id': movieId,
                   'title': tmdbMovie['title'],
                   'overview': tmdbMovie['overview'],
                   'tagline': tmdbMovie['tagline'],
                   'directors': [director['name'] for director in tmdbMovie['directors']],
                   'cast': [castMember['name'] for castMember in tmdbMovie['cast']],
                   'genres': [genre['name'] for genre in tmdbMovie['genres']],
                   'release_date': releaseDate,
                   'vote_average': tmdbMovie.get('vote_average'),
                   'vote_count': int(voteCount) if voteCount is not None else None,
                   'location_entities': tmdbMovie.get('location_entities', []),
                   'location': tmdbMovie.get('location', []),
                   'location_city': tmdbMovie.get('location_city', []),
                   'location_state': tmdbMovie.get('location_state', []),
                   'location_country': tmdbMovie.get('location_country', []),
                   }
        except KeyError: # Ignore any movies missing these attributes
            continue

### Gets all the GPEs in the corpus movies' titles and overviews

In [6]:
#Finds the location entities in every movie's title and overview and stores them on the movie record
def entitize_movies():
    """Add a "location_entities" list to every movie and save the corpus.

    Reads ../tmdb.json, runs extract_chunked_locations() on each non-empty
    string "title" and "overview", stores the de-duplicated union of both
    results under "location_entities", and writes all movies to
    ../tmdb_spacy.json.
    """
    def locations_in(movie, field):
        # Only non-empty string fields are analysed; anything else yields no entities.
        value = movie.get(field)
        if isinstance(value, str) and len(value):
            return extract_chunked_locations(value)
        return []

    entitized = {}
    for movie_id, movie in tmdbMovies(filename="../tmdb.json"):
        from_title = locations_in(movie, "title")
        from_overview = locations_in(movie, "overview")
        movie["location_entities"] = dedup(from_title + from_overview)
        entitized[movie_id] = movie
    writeTmdmMovies(entitized,'../tmdb_spacy.json')

### Geocodes all the GPEs for all movies in batches, and enriches the corpus with the results

In [7]:
#For all the movies, lookup the entities in mapquest, and enrich the record for Solr
def enrich_movies():
    """Geocode every movie's location entities and save the enriched corpus.

    Reads ../tmdb_spacy.json. Movies that have location entities get empty
    "location", "location_city", "location_state" and "location_country" lists,
    which are then filled from geocode_batch() results: "location" receives
    "lat,lng" strings, the other three the non-empty parts returned by
    city_state_country(). The lists are de-duplicated and all movies are
    written to ../tmdb_enriched.json.

    Movies without location entities are written unchanged. The final
    de-duplication skips them; indexing "location" on every movie raised
    KeyError for any movie that had no entities.
    """
    location_fields = ("location", "location_city", "location_state", "location_country")
    batches = []
    tmdb_enriched = {}

    #Prep the batches for the mapquest API, and init the movies
    for movie_id, movie in tmdbMovies(filename="../tmdb_spacy.json"):
        location_entities = movie.get("location_entities")
        if location_entities:
            #Movie has GPE entities - add them to the mapquest batch
            for field in location_fields:
                movie[field] = []
            for entity in location_entities:
                batches.append({"id":movie_id,"location":entity})
        tmdb_enriched[movie_id]=movie

    #Enrich all the movies in batches of 100, using the mapquest batch API
    batch_step = 100
    for start in range(0, len(batches), batch_step):
        # Slicing clamps at the end of the list, so the last batch may be shorter.
        geocoded = geocode_batch(batches[start:start + batch_step])
        for movie_id, locs in geocoded.items():
            enriched = tmdb_enriched[movie_id]
            for loc in locs:
                csc = city_state_country(loc)
                enriched["location"].append(str(loc["latLng"]["lat"]) + "," + str(loc["latLng"]["lng"]))
                if csc["City"]:
                    enriched["location_city"].append(csc["City"])
                if csc["State"]:
                    enriched["location_state"].append(csc["State"])
                if csc["Country"]:
                    enriched["location_country"].append(csc["Country"])

    # Only movies that went through geocoding carry these fields, so skip
    # fields that are absent (or empty) instead of indexing them blindly.
    for movie in tmdb_enriched.values():
        for field in location_fields:
            if movie.get(field):
                movie[field] = dedup(movie[field])

    writeTmdmMovies(tmdb_enriched,'../tmdb_enriched.json')

### Deduplicates each movie's location field data in the TMDB corpus

In [8]:
def dedup_movie_locations():
    """De-duplicate the location fields of every movie and save the result.

    Reads ../tmdb_enriched.json, passes each present and non-empty "location",
    "location_city", "location_state" and "location_country" list through
    dedup(), and writes all movies to ../tmdb_enriched_deduped.json.
    """
    location_fields = ("location", "location_city", "location_state", "location_country")
    deduped_movies = {}
    for movie_id, movie in tmdbMovies(filename="../tmdb_enriched.json"):
        for field in location_fields:
            if field in movie and len(movie[field]):
                movie[field] = dedup(movie[field])
        deduped_movies[movie_id] = movie
    writeTmdmMovies(deduped_movies,'../tmdb_enriched_deduped.json')
# Run the de-duplication pass: reads ../tmdb_enriched.json and writes ../tmdb_enriched_deduped.json
dedup_movie_locations()

In [9]:
# Extract and geocode the location entities in the corpus (done offline)
#entitize_movies()
#enrich_movies()

In [None]:
def print_tmdb_entities():
    """Print the title and location entities of each movie that has any.

    Movies are read from ../tmdb_spacy.json through indexableMovies(). For
    every movie with a non-empty "location_entities" list a separator line,
    the title and the entity list are printed.
    """
    for movie in indexableMovies(filename='../tmdb_spacy.json'):
        if "location_entities" in movie and len(movie["location_entities"]):
            print("-----------------")
            print(movie["title"])
            print(movie["location_entities"])
            
def print_tmdb_locations():
    """Print the title and geocoded location fields of each movie that has any.

    Movies are read from ../tmdb_enriched_deduped.json through
    indexableMovies(). For every movie with a non-empty "location" list a
    separator line, the title and the "location", "location_city",
    "location_state" and "location_country" lists are printed.
    """
    for movie in indexableMovies(filename='../tmdb_enriched_deduped.json'):
        if "location" in movie and len(movie["location"]):
            print("-----------------")
            print(movie["title"])
            print(movie["location"])
            print(movie["location_city"])
            print(movie["location_state"])
            print(movie["location_country"])