In [1]:
import datetime
import json
from urllib.parse import quote

import requests
import spacy
nlp = spacy.load('en_core_web_lg')

### Make a request to the MapQuest service

In [2]:
#MapQuest Geocode service.  More info: https://developer.mapquest.com/documentation/geocoding-api/
# NOTE(review): the API key is hardcoded in this URL. It should be loaded from
# configuration (for example an environment variable) and the committed key rotated.
mapquest_url='https://www.mapquestapi.com/geocoding/v1/address?key=FVfj6GGCXVEGUjXvokn4IbXdNAHIbbH0&location='
def geocode(text, timeout=10):
    """Look up free-form location text with the MapQuest geocoding service.

    Args:
        text: The location string to geocode, e.g. "Raleigh, NC".
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The first entry of the first result's "locations" list, as decoded
        from the JSON response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        IndexError: If the response contains no locations for ``text``.
    """
    # Percent-encode the text so characters such as '&', '#' or '+' cannot
    # terminate or alter the "location" query parameter.
    req = requests.get(mapquest_url + quote(text, safe=""), timeout=timeout)
    req.raise_for_status()
    jsn = req.json()
    locations = jsn["results"][0]["locations"]
    if not locations:
        raise IndexError("MapQuest returned no locations for %r" % (text,))
    return locations[0]

In [3]:
#us-map-with-latitude-longitude-united-states-latitude-longitude.jpg

def near(loc):
    """Disambiguate a location to the nearest whole-number (lat, lng) pair.

    Reads ``loc["latLng"]["lat"]`` and ``loc["latLng"]["lng"]`` and returns
    both values rounded with the built-in ``round`` as a 2-tuple.
    """
    coords = loc["latLng"]
    lat, lng = (round(coords[axis]) for axis in ("lat", "lng"))
    return lat, lng

def norm(loc):
    """Normalize a location dict to a "City|State|Country" string.

    Scans the keys "adminArea0Type" .. "adminArea7Type" to find which
    "adminAreaN" field holds the City, State and Country values. When the
    same type appears more than once, the highest-numbered field wins.
    Non-empty values are joined with "|" in City, State, Country order and
    spaces are replaced by underscores. Returns "_UNKNOWN_" when none of
    the three is present.
    """
    field_for = {"City": "", "State": "", "Country": ""}
    for level in range(8):
        area_key = "adminArea" + str(level)
        type_key = area_key + "Type"
        if type_key in loc and loc[type_key] in field_for:
            field_for[loc[type_key]] = area_key

    # Unmatched types still map to "", which is skipped unless loc has that key.
    parts = [
        loc[field]
        for field in field_for.values()
        if field in loc and len(loc[field])
    ]
    if not parts:
        return "_UNKNOWN_"
    return "|".join(parts).replace(" ", "_")

### Try a couple of examples

In [4]:
#Normalize to City|State|Country
# Geocode each sample query and print its normalized form, one per line.
for place in (
    "Raleigh, NC",
    "Raleigh",
    "Chapel Hill",
    "Durham",
    "RTP, NC",
    "Charlotte, NC",
    "Bangkok",
):
    print(norm(geocode(place)))

Raleigh|NC|US
Raleigh|NC|US
Chapel_Hill|NC|US
Durham|NC|US
Durham|NC|US
Charlotte|NC|US
Bangkok|TH


In [5]:
# Geocode each sample query and print its rounded (lat, lng) pair.
# In the recorded output the first five queries all give (36, -79) and
# Charlotte gives (35, -81).
for place in (
    "Raleigh, NC",
    "Raleigh",
    "Chapel Hill",
    "Durham",
    "RTP, NC",
    "Charlotte, NC",
    "Bangkok",
):
    print(near(geocode(place)))

(36, -79)
(36, -79)
(36, -79)
(36, -79)
(36, -79)
(35, -81)
(14, 100)


### Entity spans with SpaCy

In [6]:
def extract_independent_locations(text):
    """Print and collect every token in ``text`` tagged as a GPE or LOC entity.

    Each token is considered on its own (no merging of multi-token entities),
    so a multi-word place name yields one entry per token.

    Args:
        text: The query string to analyse with the module-level ``nlp`` pipeline.

    Returns:
        A list with the text of each matching token, in document order.
        (Previously the ``locs`` list was created but never filled or returned.)
    """
    doc = nlp(text)
    locs = []
    #Fun fact, "GPE" means Geo-Political Entity
    for gpe in doc:
        if gpe.ent_type_ in ('GPE', 'LOC'):
            print("Text:",text,"\t|\tLocation:",gpe)
            locs.append(gpe.text)
    return locs

In [7]:
# Run the token-level extractor over a few sample queries; each call prints its matches.
for query in (
    "Kevin alone in NYC",
    "Indiana Jones in India",
    "action hero movie in LA",
    "Boxing Revenge in Moscow",
):
    extract_independent_locations(query)

Text: Kevin alone in NYC 	|	Location: NYC
Text: Indiana Jones in India 	|	Location: India
Text: action hero movie in LA 	|	Location: LA
Text: Boxing Revenge in Moscow 	|	Location: Moscow


### Subject/Object dependency examples

In [10]:
def extract_dependent_locations(text):
    """Pair each GPE/LOC entity in ``text`` with the token it depends on.

    Named entities are first merged into single tokens, then every token
    whose entity type is GPE or LOC is classified by its dependency label:

    * 'attr' / 'dobj': paired with the first 'nsubj' to the left of its head
      (nothing is emitted if there is no such subject);
    * 'pobj' under a 'prep': paired with the preposition's head;
    * anything else: paired with ``None``.

    Args:
        text: The query string to analyse with the module-level ``nlp`` pipeline.

    Returns:
        A list of ``(related_token_or_None, location_token)`` tuples.
    """
    #debug here:
    # https://explosion.ai/demos/displacy?text=Kevin%20McCallister%20in%20New%20York%20NY&model=en_core_web_lg&cpu=1&cph=1

    #merge entities into one token (noun chunks are intentionally not merged)
    doc = nlp(text)
    spans = list(doc.ents)
    if hasattr(doc, "retokenize"):
        # Span.merge() is deprecated and absent from spaCy 3; the retokenizer
        # context manager applies all merges together when the block exits.
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
    else:
        # Older spaCy releases without Doc.retokenize.
        for span in spans:
            span.merge()

    relations = []
    for gpe in filter(lambda w: w.ent_type_ in ['GPE','LOC'], doc):
        if gpe.dep_ in ('attr', 'dobj'):
            subjects = [w for w in gpe.head.lefts if w.dep_ == 'nsubj']
            if subjects:
                relations.append((subjects[0], gpe))
        elif gpe.dep_ == 'pobj' and gpe.head.dep_ == 'prep':
            relations.append((gpe.head.head, gpe))
        else:
            relations.append((None,gpe))

    return relations

In [11]:
# Print the (related token, location) pairs found in each sample query.
for query in (
    "Kevin McCallister in New York NY",
    "Kevin alone in NYC",
    "Indiana Jones in India",
    "California action hero",
    "boxing revenge in moscow",
    "Riddick in the underverse",
):
    print(extract_dependent_locations(query))

[(Kevin McCallister, New York NY)]
[(alone, NYC)]
[(Indiana Jones, India)]
[(None, California)]
[(revenge, moscow)]
[]


### State Machine GPE chunk examples

In [14]:
def extract_chunked_locations(text):
    """Chunk runs of proper-noun GPE tokens into single location strings.

    A small two-state machine walks each sentence. A token that is both a
    proper noun and a GPE opens or extends the current chunk. A comma met
    while a chunk is open keeps it open without being added. Any other
    token closes the chunk. A chunk still open at the end of a sentence is
    closed there, so chunks never span sentences.

    Returns:
        A list of strings, each the space-joined token texts of one chunk.
    """
    doc = nlp(text)
    chunks = []
    for sentence in doc.sents:
        words = []
        in_chunk = False
        for token in sentence:
            is_comma = token.text == ','
            is_gpe_name = token.pos_ == 'PROPN' and token.ent_type_ == 'GPE'
            if is_gpe_name or (in_chunk and is_comma):
                # Stay in (or enter) the chunk; commas are separators only.
                in_chunk = True
                if not is_comma:
                    words.append(token.text)
                continue
            if in_chunk:
                # First token after the run: emit what was collected.
                chunks.append(' '.join(words))
                words = []
            in_chunk = False
        # Flush a chunk that ran to the end of the sentence.
        if words:
            chunks.append(' '.join(words))
    return chunks

In [17]:
# Print the chunked GPE locations for each sample query.
# "moscow" and "Moscow" are both included to show the effect of capitalization.
for query in (
    "Kevin McCallister in New York NY",
    "Kevin alone in NYC",
    "Indiana Jones in India",
    "California action hero",
    "boxing revenge in moscow",
    "boxing revenge in Moscow",
    "Riddick in the underverse",
):
    print(extract_chunked_locations(query))

['New York NY']
['NYC']
['India']
['California']
[]
['Moscow']
[]


### Enrich a query with a location using SpaCy and Mapquest

In [19]:
def enrich_query_location(q):
    """Enrich a query with a lat/lng filter derived from its first location.

    The first location found by ``extract_dependent_locations`` is geocoded
    and rounded with ``near``. On success the result gains an "fq" entry of
    the form "lat:<lat> lng:<lng>" and the location text is removed from "q".

    Args:
        q: The raw query string.

    Returns:
        A dict with key "q" (the query, minus the location text if one was
        resolved) and, when a location was resolved, key "fq".
    """
    enrichment = {"q":q}
    entities = extract_dependent_locations(q)
    if len(entities) and len(entities[0])==2:
        #found a location - look it up in mapquest and disambiguate
        location_token = entities[0][1]
        entity = location_token.text
        latlng = near(geocode(entity))
        if latlng and len(latlng)==2:
            # It's a valid location!
            # ...add the filter query and remove the text from the query:
            fq = "lat:"+str(latlng[0]) + " lng:"+str(latlng[1])
            enrichment["fq"] = fq
            # Cut out only the matched token, located by its character offset.
            # str.replace would also strip the same letters from other words,
            # e.g. "India" inside "Indiana Jones in India".
            start = location_token.idx
            enrichment["q"] = q[:start] + q[start+len(entity):]
    return enrichment
enrich_query_location("Kevin alone in NYC")

{'q': 'Kevin alone in ', 'fq': 'lat:41 lng:-74'}