## Google Place Search API Scrape

The purpose of this script is to generate a list of destinations in and near the city from which to choose the basket of destinations for each blockgroup. This should be a scalable, repeatable process that can be refreshed from time to time as the amenities in the city change.

This script accesses the Google Maps Places API. Note that there are usage limits and terms of service (such as displaying a Google map); see https://developers.google.com/maps/documentation/distance-matrix/usage-limits
https://developers.google.com/places/web-service/search

In [None]:
import pandas as pd
from pandas.io.json import json_normalize
import json
from datetime import datetime
import os.path
import time

# Import the HTTP helpers from wherever the running interpreter provides them.
try:
    from urllib.request import Request, urlopen  # Python 3
except ImportError:
    # Only a missing module should trigger the fallback; a bare except would
    # also swallow unrelated errors such as KeyboardInterrupt.
    from urllib2 import Request, urlopen  # Python 2
    
import string

# Whitelist of characters: a few punctuation marks and the space, followed by
# all ASCII letters and digits.
valid_chars = "-_.() " + string.ascii_letters + string.digits

In [None]:
# Load the Google Places API key from a local text file.
# The raw string keeps the Windows backslashes literal (no invalid-escape
# warnings), the context manager closes the file, and strip() removes a
# trailing newline that would otherwise end up inside the request URL.
with open(r".\Variables\google_place_query.txt", 'r') as key_file:
    API_Key = key_file.read().strip()

In [None]:
def searchPlaces(seachtype,location,pagetoken):
    """Run a Google Places nearby search and append the results to the places CSV.

    Each page of results is reduced to the columns name, address, city, lat,
    lng, place_id, types, rating and class, then appended to
    GoogleMatrix_Places.csv (the file is created with a header row if it does
    not exist yet). Follow-up pages are fetched for as long as the response
    carries a ``next_page_token``.

    Args:
        seachtype: Place type to search for; sent as the ``types`` query
            parameter and stored in the ``class`` column. (The misspelled
            parameter name is kept so existing callers keep working.)
        location: Search centre, passed verbatim as the ``location`` query
            parameter (the caller in this file builds it as "lat,lng").
        pagetoken: "none" to start a new search, otherwise the
            ``next_page_token`` of a previous response.

    Raises:
        RuntimeError: If the response has no results and reports a status
            other than OK or ZERO_RESULTS.
    """
    out_path = "V:\\Asset Management Program\\Data Science\\Data\\GoogleMatrix_Places.csv"
    endpoint = "https://maps.googleapis.com/maps/api/place/nearbysearch/json?"

    while True:
        # A page token replaces the search parameters on follow-up requests.
        if pagetoken == "none":
            URL = endpoint + "&location=" + location + \
                  "&rankby=distance&types=" + seachtype + "&key=" + API_Key
        else:
            URL = endpoint + "&key=" + API_Key + "&pagetoken=" + pagetoken

        # NOTE(review): this echoes the API key into the output; consider
        # redacting it before sharing or committing the notebook.
        print (URL)
        response = urlopen(Request(URL))
        try:
            data = json.loads(response.read())
        finally:
            response.close()

        results = data.get('results') or []
        if not results:
            # Nothing to store. An empty page is normal for ZERO_RESULTS, but
            # any other status means the request itself failed.
            status = data.get('status')
            if status not in (None, 'OK', 'ZERO_RESULTS'):
                raise RuntimeError("Places API request failed with status " + str(status) +
                                   ": " + str(data.get('error_message', '')))
            return

        # Build the frame straight from the raw records so that nested fields
        # such as 'geometry' stay dict-valued columns; json_normalize flattens
        # them into dotted column names on newer pandas versions.
        dfPlaces = pd.DataFrame(results)
        print (dfPlaces)
        dfPlaces['lat'] = dfPlaces['geometry'].apply(lambda g: g['location']['lat'])
        dfPlaces['lng'] = dfPlaces['geometry'].apply(lambda g: g['location']['lng'])

        # 'vicinity' is split on ", ": the last piece is taken as the city and
        # the piece before it (if any) as the address.
        vicinity_parts = dfPlaces['vicinity'].apply(lambda v: v.split(', '))
        dfPlaces['city'] = vicinity_parts.apply(lambda parts: ''.join(parts[-1:]))
        dfPlaces['address'] = vicinity_parts.apply(lambda parts: ''.join(parts[-2:-1]))

        # Copy so the column assignments below do not write into a slice.
        dfOut = dfPlaces[['name','address','city','lat','lng','place_id','types']].copy()

        # Ratings are not present in every response; keep the column regardless.
        if 'rating' in dfPlaces.columns:
            dfOut['rating'] = dfPlaces['rating']
        else:
            dfOut['rating'] = ""

        dfOut['class'] = seachtype

        # Create the CSV with a header on first use, append without one afterwards.
        file_exists = os.path.exists(out_path)
        dfOut.to_csv(out_path, mode='a' if file_exists else 'w',
                     header=not file_exists, index=False)

        # Keep searching if there are more records, there is a 3 page limit
        if 'next_page_token' not in data:
            return
        # Short pause before requesting the next page.
        time.sleep(2)
        pagetoken = data['next_page_token']



In [None]:
# The nearby search is ranked by distance from the query point, so it is
# repeated from every urban village origin to blanket the whole city. The
# overlapping hits this produces are de-duplicated in the cleaning step, which
# also filters the returned places down to a clean, repeatable list.

types = ['supermarket','library','hospital','pharmacy','post_office','school','cafe','store']

df_origins = pd.read_csv('V:\\Asset Management Program\\Data Science\\Geographies\\UV_Origins.csv')

for index, row in df_origins.iterrows():

    # Search centre for this origin, formatted as "lat,lng"
    location = ",".join([str(row['origin_lat']), str(row['origin_lng'])])

    # Every place type starts a fresh (first-page) search at this origin
    pagetoken = "none"
    for searchtype in types:
        searchPlaces(searchtype,location,pagetoken)


In [None]:
## Clean and filter search results. Unfortunately a necessary step
#
# Every filter below has the form (class != X) | (condition): rows of other
# classes always pass, rows of class X pass only if the condition holds.
# Comparisons against a missing (NaN) rating are False, so unrated rows of the
# filtered class are dropped. na=False makes rows with a missing name/types
# value count as "does not contain" instead of leaking NaN into the mask.

dfPlaces = pd.read_csv('V:\\Asset Management Program\\Data Science\\Data\\GoogleMatrix_Places.csv')

#dfPlaces = dfPlaces[dfPlaces['city'] == "Seattle"]

# drop all supermarkets with no ratings
dfPlaces = dfPlaces[(dfPlaces['class'] != "supermarket") | (dfPlaces['rating'] > 0)]

# drop all libraries that do not contain "Seattle Public Library"
dfPlaces = dfPlaces[(dfPlaces['class'] != "library") |
                    (dfPlaces['name'].str.contains("Seattle Public Library", na=False))]

# drop pharmacies with no rating or a rating of two or lower
dfPlaces = dfPlaces[(dfPlaces['class'] != "pharmacy") | (dfPlaces['rating'] > 2)]

# drop hospitals with no rating or a rating of three or lower
dfPlaces = dfPlaces[(dfPlaces['class'] != "hospital") | (dfPlaces['rating'] > 3)]

# drop post offices with no rating
dfPlaces = dfPlaces[(dfPlaces['class'] != "post_office") | (dfPlaces['rating'] > 0)]

# drop all schools whose name does not contain "Elementary", "High" or "Middle"
dfPlaces = dfPlaces[(dfPlaces['class'] != "school") |
                    (dfPlaces['name'].str.contains("High|Middle|Elementary", na=False))]

# drop all stores whose types do not contain "supermarket", then relabel the
# remaining stores as class supermarket. The copy plus a single .loc call
# replaces the chained assignment, which may write to a temporary object.
dfPlaces = dfPlaces[(dfPlaces['class'] != "store") |
                    (dfPlaces['types'].str.contains("supermarket", na=False))].copy()
dfPlaces.loc[dfPlaces['class'] == 'store', 'class'] = 'supermarket'

# drop all supermarkets (including the relabelled stores) that contain "furniture_store"
dfPlaces = dfPlaces[(dfPlaces['class'] != "supermarket") |
                    (~dfPlaces['types'].str.contains("furniture_store", na=False))]

# drop duplicate place IDs, keeping the first occurrence
dfPlaces = dfPlaces.drop_duplicates(subset=['place_id'], keep='first', inplace=False)

dfPlaces = dfPlaces.reset_index(drop=True)

print (dfPlaces)
# Overwrite the scraped file with the cleaned list
dfPlaces.to_csv("V:\\Asset Management Program\\Data Science\\Data\\GoogleMatrix_Places.csv", mode='w', header=True, index=False)

