This notebook collects Google Places API results for a set of coordinates and writes them to a JSON file, 'taxi_blocks_google_places_all.txt'. It also creates a CSV file that summarizes, for each block_id, the count of each type of place found near that block. The notebook requires the libraries imported in the cell below and the summarized dataset 'taxi_blocks.csv', which must be available in the working directory. That dataset contains the most frequently used coordinates in the NYC taxi dataset. The script is kept in notebook format because the API occasionally times out, and a notebook makes it easiest to restart and collect the complete data.

The API returns its results as JSON, ordered by prominence.

In [1]:
import pandas as pd
import json
import math
from googleplaces import GooglePlaces, types, lang ##pip install python-google-places

In [2]:
def coord_distance(origin, destination):
    """Return the haversine (great-circle) distance between two points.

    origin, destination -- (latitude, longitude) pairs in degrees.

    The distance is returned in kilometres, truncated to three decimals.
    """
    earth_radius_km = 6371
    (start_lat, start_lon), (end_lat, end_lon) = origin, destination

    # Sines of half the latitude / longitude differences (in radians).
    half_dlat_sin = math.sin(math.radians(end_lat - start_lat) / 2)
    half_dlon_sin = math.sin(math.radians(end_lon - start_lon) / 2)

    haversine = half_dlat_sin * half_dlat_sin + math.cos(math.radians(start_lat)) \
        * math.cos(math.radians(end_lat)) * half_dlon_sin * half_dlon_sin
    central_angle = 2 * math.atan2(math.sqrt(haversine), math.sqrt(1 - haversine))

    # Truncate (not round) at metre resolution, then express in km.
    return int(earth_radius_km * central_angle * 1000) / 1000.0

In [31]:
##Load the block coordinates; 'taxi_blocks.csv' must be in the working directory
data = pd.read_csv('taxi_blocks.csv')

##Place types used for the API calls, each wrapped in its own single-item list
types_to_query = [[place_type] for place_type in (
    'point_of_interest', 'restaurant', 'bar', 'store', 'lodging', 'subway_station')]

##Paste the API key obtained from Google Places here
API_KEY = ''
google_places = GooglePlaces(API_KEY)

##Dictionary that accumulates the results, keyed by block_id
google_places_result = {}

## Use the cell below if the API times out.

In [59]:
##If the API times out, replace 485 with the last block_id number that was reached,
##run this cell to keep only the later blocks, and then re-run the loop cell.
data = data[data['block_id'] > 485].reset_index(drop=True) ##renumber the rows from 0

In [62]:
def _store_places(block_places, places, origin):
    """Record every place not yet stored for a block, with its details.

    block_places -- dict of place_id -> place info for one block (updated in place)
    places       -- places returned by a single nearby search
    origin       -- (latitude, longitude) of the block, used to compute 'dist'
    """
    for place in places:
        if place.place_id in block_places: ##unique place id is already included
            continue
        ##insert name, types and distance (km) for the unique place id
        place_info = {'name': place.name, 'type': place.types,
                      'dist': coord_distance((float(place.geo_location['lat']),
                                              float(place.geo_location['lng'])),
                                             origin)}
        block_places[place.place_id] = place_info
        place.get_details() ##additional API call to retrieve more info
        details = place.details
        ##insert opening hours only when the weekday text is actually present
        opening_hours = details.get('opening_hours') or {}
        if 'weekday_text' in opening_hours:
            place_info['opening_hours'] = opening_hours['weekday_text']
        ##insert neighborhood if address components are available, else None
        if 'address_components' in details:
            place_info['neighborhood'] = next(
                (item['short_name'] for item in details['address_components']
                 if item['types'] == ['neighborhood', 'political']), None)


##Loop through each of the block_id coordinates
for _, row in data.iterrows():
    ##read the values from the yielded row so the loop does not depend on the
    ##index labels matching row positions
    block_id = int(row['block_id'])
    block_places = google_places_result[block_id] = {} ##initialize dict for this block_id
    origin = (row['latitude'], row['longitude'])
    lat_lng = {'lat': row['latitude'], 'lng': row['longitude']} ##input for API call
    for place_types in types_to_query: ##one search per type declared above
        query_result = google_places.nearby_search(lat_lng=lat_lng, radius=500, types=place_types)
        _store_places(block_places, query_result.places, origin)
    ##One last call without types
    query_result = google_places.nearby_search(lat_lng=lat_lng, radius=500)
    _store_places(block_places, query_result.places, origin)

##Output JSON file
##json.dump writes the integer block_id keys as strings on its own, so the results
##are serialized straight to the file without building an intermediate copy.
with open('taxi_blocks_google_places_all.txt', 'w') as outfile:
    json.dump(google_places_result, outfile, indent=2)

### If you already have the file taxi_blocks_google_places_all.txt, start here.

In [2]:
##Reload previously collected results from disk if necessary
with open('taxi_blocks_google_places_all.txt') as results_file:
    google_places_result = json.loads(results_file.read())

In [4]:
def _primary_type(place_types, ignored):
    """Return the first type not in ``ignored``; fall back to the first type.

    ``place_types`` must be non-empty.
    """
    return next((t for t in place_types if t not in ignored), place_types[0])


##This builds the count of the various place types per block_id
block_id_type = {} ##block_id -> {type: count}
type_list = [] ##keeps tab of all the types, in first-seen order (becomes the columns)
ignore_type = ['locality', 'sublocality_level_1', 'point_of_interest', 'sublocality', 'political', 'establishment'] ##ignore these
##.items() works on both Python 2 and 3 (.iteritems() exists only on Python 2)
for block_id, places in google_places_result.items():
    counts = block_id_type[block_id] = {} ##initialize dict for block_id
    for place in places.values():
        place_types = place['type']
        if not place_types: ##no types at all: nothing to count this place under
            continue
        primary = _primary_type(place_types, ignore_type)
        if primary not in counts: ##first time this type shows up in the block
            counts[primary] = 0
            if primary not in type_list: ##first time this type shows up anywhere
                type_list.append(primary)
        counts[primary] += 1
##initialize the dataframe: one row per block_id, one column per type
block_id_type_pd = pd.DataFrame(index=list(block_id_type), columns=type_list)

for block_id, counts in block_id_type.items(): ##fill each row; types not seen stay NaN
    block_id_type_pd.loc[block_id] = pd.Series(counts)

In [5]:
##Write the per-block place type counts to CSV (block_id is the row index)
block_id_type_pd.to_csv('place_type_count_by_blockid.csv')