In [1]:
import pysolr
import spacy
import requests
import datetime
import json
import dateutil.parser
nlp = spacy.load('en_core_web_lg')

### Make a request to the Mapquest service

In [40]:
#MapQuest Geocode service.  More info: https://developer.mapquest.com/documentation/geocoding-api/
mapquest_address_url="https://www.mapquestapi.com/geocoding/v1/address?key=AuqdPFEWYhm7rZRN5hX5HeWSKgaO2u7d&location="
def geocode(text):
    req = requests.get(mapquest_address_url+text)
    jsn = req.json()
    loc = jsn["results"][0]["locations"][0]
    return loc

# Send batches to the MapQuest Geocode API in up to 100 locations at a time
# Saves on API monthly rate (15000 calls per month in the free tier)
#mapquest_batch_url="http://www.mapquestapi.com/geocoding/v1/batch?&inFormat=json&outFormat=json&key=AuqdPFEWYhm7rZRN5hX5HeWSKgaO2u7d"
def geocode_batch(batch):
    mapquest_batch_url="http://www.mapquestapi.com/geocoding/v1/batch?key=AuqdPFEWYhm7rZRN5hX5HeWSKgaO2u7d&maxResults=1&thumbMaps=false"
    request_locations = []
    reverse_lookup = {}
    response = {}
    
    #Create reverse lookup table for the API response
    for movie in batch:
        movie_id = movie["id"]
        location = movie["location"]
        if movie_id not in response:
            response[movie_id] = []
        if location not in reverse_lookup:
            reverse_lookup[location] = []
        reverse_lookup[location].append(movie_id)
        mapquest_batch_url += "&location=" + location
    print(mapquest_batch_url)
    req = requests.get(mapquest_batch_url)
    jsn = req.json()

    # Map the API location results to the movie IDs
    for result in jsn["results"]:
        if "providedLocation" in result and "locations" in result and len(result["locations"]):
            provided = result["providedLocation"]["location"]
            location = result["locations"][0]
            if location["geocodeQuality"] != "COUNTRY" or location["adminArea1"] != "US":
                if provided in reverse_lookup:
                    for movie_id in reverse_lookup[provided]:
                        response[movie_id].append(location)
                else:
                    print("Not found",provided)
                
    return response

enriched_movie_ids = geocode_batch([
    {"id":123,"location":"New York"},
    {"id":123,"location":"Manhattan"},
    {"id":123,"location":"Thailand"},
])

enriched_movie_ids

http://www.mapquestapi.com/geocoding/v1/batch?key=AuqdPFEWYhm7rZRN5hX5HeWSKgaO2u7d&maxResults=1&thumbMaps=false&location=New York&location=Manhattan&location=Thailand


{123: [{'street': '',
   'adminArea6': '',
   'adminArea6Type': 'Neighborhood',
   'adminArea5': 'New York',
   'adminArea5Type': 'City',
   'adminArea4': 'New York County',
   'adminArea4Type': 'County',
   'adminArea3': 'NY',
   'adminArea3Type': 'State',
   'adminArea1': 'US',
   'adminArea1Type': 'Country',
   'postalCode': '',
   'geocodeQualityCode': 'A5XAX',
   'geocodeQuality': 'CITY',
   'dragPoint': False,
   'sideOfStreet': 'N',
   'linkId': '282040974',
   'unknownInput': '',
   'type': 's',
   'latLng': {'lat': 40.713054, 'lng': -74.007228},
   'displayLatLng': {'lat': 40.713054, 'lng': -74.007228}},
  {'street': '',
   'adminArea6': '',
   'adminArea6Type': 'Neighborhood',
   'adminArea5': 'Manhattan',
   'adminArea5Type': 'City',
   'adminArea4': 'New York County',
   'adminArea4Type': 'County',
   'adminArea3': 'NY',
   'adminArea3Type': 'State',
   'adminArea1': 'US',
   'adminArea1Type': 'Country',
   'postalCode': '',
   'geocodeQualityCode': 'A5XAX',
   'geocodeQual

In [3]:
#us-map-with-latitude-longitude-united-states-latitude-longitude.jpg

#Disambiguate to the nearest lat/lng whole numbers:
def near(loc):
    latlng = loc["latLng"]
    return round(latlng["lat"]),round(latlng["lng"])

#Normalize to City|State|Country:
def norm(loc):
    loctypes = {"City":"","State":"","Country":""}
    location = None
    for i in range(8):
        aai = "adminArea"+str(i)
        aat = aai+"Type"
        if aat in loc.keys() and loc[aat] in loctypes.keys():
            loctypes[loc[aat]] = aai
    for typ in loctypes.keys():
        fld = loctypes[typ]
        if fld in loc.keys() and len(loc[fld]):
            if not location:
                location = loc[fld]
            else:
                location += "|" + loc[fld]
    if not location:
        location = "_UNKNOWN_"
    return location.replace(" ", "_")

### Get entities with SpaCy

In [20]:
def extract_independent_locations(text):
    doc = nlp(text)
    locs = []
    #Fun fact, "GPE" means Geo-Political Entity
    for gpe in filter(lambda w: w.ent_type_ in ['GPE'], doc):
        locs.append(gpe.text)
    return locs
extract_independent_locations("I left the New York, NY to visit Thailand.")

['New', 'York', 'NY', 'Thailand']

### Enrich a title with a location using SpaCy and Mapquest

In [12]:
def enrich_content_locations(text):
    enrichment = {"lat":None,"lng":None,"location":None}
    entities = extract_independent_locations(text)
    if len(entities):
        #found a location - look it up in mapquest and disambiguate
        entity = " ".join(entities)
        geoloc = geocode(entity)
        if geoloc:
            latlng = near(geoloc)
            location = norm(geoloc)
            if latlng and len(latlng)==2:
                # It's a valid location! 
                # ...add the filter query and remove the text from the query:
                enrichment["lat"] = latlng[0]
                enrichment["lng"] = latlng[1]
            if location:
                enrichment["location"] = location
    return enrichment
enrich_content_location("Kevin alone in NYC")

{'lat': 41, 'lng': -74, 'location': 'New_York|NY|US'}

In [14]:
#Iterate through the movies
def rawTmdbMovies(filename):
    return json.load(open(filename))


def writeTmdmMovies(rawMoviesJson, path):
    with open(path, 'w') as f:
        json.dump(rawMoviesJson, f)

def tmdbMovies(filename):
    tmdbMovies = rawTmdbMovies(filename)
    for movieId, tmdbMovie in tmdbMovies.items():
        yield (movieId, tmdbMovie)


def indexableMovies(filename="../tmdb.json"):
    """ Generates TMDB movies, similar to how ES Bulk indexing
        uses a generator to generate bulk index/update actions """
    for movieId, tmdbMovie in tmdbMovies(filename):
        try:
            releaseDate = None
            if 'release_date' in tmdbMovie and len(tmdbMovie['release_date']) > 0:
                releaseDate = tmdbMovie['release_date'] + 'T00:00:00Z'

            if 'title_entities' in tmdbMovie and len(tmdbMovie['title_entities']) > 0:                
                yield {'id': movieId,
                       'title': tmdbMovie['title'],
                       'overview': tmdbMovie['overview'],
                       'tagline': tmdbMovie['tagline'],
                       'directors': [director['name'] for director in tmdbMovie['directors']],
                       'cast': [castMember['name'] for castMember in tmdbMovie['cast']],
                       'genres': [genre['name'] for genre in tmdbMovie['genres']],
                       'release_date': releaseDate,
                       'vote_average': tmdbMovie['vote_average'] if 'vote_average' in tmdbMovie else None,
                       'vote_count': int(tmdbMovie['vote_count']) if 'vote_count' in tmdbMovie else None,
                       'title_entities': tmdbMovie['title_entities'] if 'title_entities' in tmdbMovie else [],                   
                       'overview_entities': tmdbMovie['overview_entities'] if 'overview_entities' in tmdbMovie else [],
                       }
            else:
                yield {'id': movieId,
                       'title': tmdbMovie['title'],
                       'overview': tmdbMovie['overview'],
                       'tagline': tmdbMovie['tagline'],
                       'directors': [director['name'] for director in tmdbMovie['directors']],
                       'cast': [castMember['name'] for castMember in tmdbMovie['cast']],
                       'genres': [genre['name'] for genre in tmdbMovie['genres']],
                       'release_date': releaseDate,
                       'vote_average': tmdbMovie['vote_average'] if 'vote_average' in tmdbMovie else None,
                       'vote_count': int(tmdbMovie['vote_count']) if 'vote_count' in tmdbMovie else None,
                       }
        except KeyError as k: # Ignore any movies missing these attributes
            continue

In [41]:
def enrich_movie():
    geoloc = geocode(entity)
    if geoloc:
        latlng = near(geoloc)
        location = norm(geoloc)
        if latlng and len(latlng)==2:
            # It's a valid location! 
            # ...add the filter query and remove the text from the query:
            enrichment["lat"] = latlng[0]
            enrichment["lng"] = latlng[1]
        if location:
            enrichment["location"] = location
    return enrichment

def entitize_movies():
    total = 0
    batch = []
    movies = {}
    tmdb_spacy = []
    tmdb_json = []
    for movie in indexableMovies():
        title_entities = extract_independent_locations(movie["title"])
        overview_entities = extract_independent_locations(movie["overview"])
        movie["title_entities"] = title_entities
        movie["overview_entities"] = overview_entities
        tmdb_spacy.append(movie)
    writeTmdmMovies(tmdb_spacy,'../tmdb_spacy.json')
    

def enrich_movies():
    total = 0
    batch = []
    movies = {}
    tmdb_spacy = []
    tmdb_enriched = []
    for movie in indexableMovies():
        overview_entities = movie["overview_entities"]
        title_entities = movie["title_entities"]
        if len(title_entities) or len(overview_entities):
            #found a location - add it to the mapquest batch
            movies["id"] = movie
            for entity in title_entities:
                batch.append({"id":movie["id"],"location":entity})
                total +=1
            for entity in overview_entities:
                batch.append({"id":movie["id"],"location":entity})
                total += 1
            if (total>=98):
                geocodes = geocode_batch(batch)
                if "location" not in movie:
                    movie["location"] = []
                for movie_id in geocodes.keys():
                    latlon = movie
                    movie["location"].append()
                index_movies(enriched)
                #locations = geocode_batch(batch)
                batch = []
                movies = {}
                total = 0
        movie.pop('title_entities', None)
        movie.pop('overview_entities', None)
        tmdb_enriched.append(movie)
        
    writeTmdmMovies(tmdb_enriched,'../tmdb_enriched.json')
    
enrich_movies()

NameError: name 'title_entities' is not defined

In [9]:
#Solr Client
solr = pysolr.Solr('http://localhost:8983/solr/tmdb')

#Print the Title, Release Date, and Overview from TMDB
def printresults(res):
    for r in res:
        print('\n---')
        print('\n\t'.join([r["title"][0],r["lat"],r["lng"],r["overview"][0]]))

def solrquery(q):
    return "title_en:("+q+")^1.2 overview_en:("+q+")"
        
#Enrich and Search a text query
def search(query):
    enriched = enrich_query_location(query)
    q = solrquery(enriched["q"])
    print("Searching for `" + query + "` ...")
    print("  q = " + q)
    if "fq" in enriched and isinstance(enriched["fq"], str):
        fq = enriched["fq"]
        print("  fq = " + fq)
        res = solr.search(q=q,fq=fq,fl="title,overview,lat,lng",rows=3)
    else:
        print("  ...no enrichments")
        res = solr.search(q=q,fl="title,overview,release_date",rows=3)

    return res

### Let's try it with some good examples

In [1]:
printresults(search("harry potter last 5 years"))

NameError: name 'printresults' is not defined

In [11]:
printresults(search("indiana jones 1/1/1980 to 12/31/1987"))

Searching for `indiana jones 1/1/1980 to 12/31/1987` ...
  q = title_en:(indiana jones )^1.2 overview_en:(indiana jones )
  fq = release_date:[1980-01-01T00:00:00Z TO 1988-01-01T00:00:00Z]

---
Indiana Jones and the Temple of Doom
	1984-05-23T00:00:00Z
	After arriving in India, Indiana Jones is asked by a desperate village to find a mystical stone. He agrees – and stumbles upon a secret cult plotting a terrible plan in the catacombs of an ancient palace.

---
Raiders of the Lost Ark
	1981-06-12T00:00:00Z
	When Dr. Indiana Jones – the tweed-suited professor who just happens to be a celebrated archaeologist – is hired by the government to locate the legendary Ark of the Covenant, he finds himself up against the entire Nazi regime.

---
Guyana Tragedy: The Story of Jim Jones
	1980-04-15T00:00:00Z
	The story of the Peoples Temple cult led by Jim Jones and the events leading up to one of the largest mass suicides in history.


In [12]:
printresults(search("harry potter goblet"))

Searching for `harry potter goblet` ...
  q = title_en:(harry potter goblet)^1.2 overview_en:(harry potter goblet)
  ...no enrichments

---
Harry Potter and the Goblet of Fire
	2005-11-05T00:00:00Z
	Harry starts his fourth year at Hogwarts, competes in the treacherous Triwizard Tournament and faces the evil Lord Voldemort. Ron and Hermione help Harry manage the pressure – but Voldemort lurks, awaiting his chance to destroy Harry and all that he stands for.

---
Harry Potter and the Philosopher's Stone
	2001-11-16T00:00:00Z
	Harry Potter has lived under the stairs at his aunt and uncle's house his whole life. But on his 11th birthday, he learns he's a powerful wizard -- with a place waiting for him at the Hogwarts School of Witchcraft and Wizardry. As he learns to harness his newfound powers with the help of the school's kindly headmaster, Harry uncovers the truth about his parents' deaths -- and about the villain who's to blame.

---
A Very Potter Musical
	2009-04-09T00:00:00Z
	In Apri