# FEMA Lifeline

### Google places nearby EDA

Gathering Google data via the Google Places Nearby API and the Google Elevation API

By: John Wertz

In [20]:
#Import libraries
import pandas as pd
import numpy as np

import googlemaps
import requests
import time
import ast
import math

# SECURITY: an API key committed to source control should be treated as leaked.
# The key is read from the GOOGLE_API_KEY environment variable when it is set;
# the literal below is kept only as a fallback so existing runs keep working.
import os

apikey = os.environ.get('GOOGLE_API_KEY') or 'ZIzaSyBZGKnHrgP7yNXmzxy8'

In [2]:
#Instantiate google maps 
#gmaps = googlemaps.Client(key=apikey)

#gmaps.places_nearby(location = loc,radius=rad, type=place_types,page_token=None) # Template
#This idea came from (https://github.com/meldev00/FEMA_disaster_tool)

# List Definition Sections


In [3]:
# Place types that can be returned by / filtered on in the places_nearby API. Complete list.
# NOTE(review): this list is not referenced again in the visible notebook; the
#     search below uses place_type_list, which is loaded from a data file.

google_place_type_list = ['accounting', 'airport', 'amusement_park', 'aquarium', 'art_gallery', 'atm', 'bakery',
                          'bank', 'bar', 'beauty_salon', 'bicycle_store', 'book_store', 'bowling_alley',
                          'bus_station', 'cafe', 'campground', 'car_dealer', 'car_rental', 'car_repair', 
                          'car_wash', 'casino', 'cemetery', 'church', 'city_hall', 'clothing_store', 
                          'convenience_store', 'courthouse', 'dentist', 'department_store', 'doctor', 'drugstore', 
                          'electrician', 'electronics_store', 'embassy', 'fire_station', 'florist', 'funeral_home',
                          'furniture_store', 'gas_station', 'grocery_or_supermarket', 'gym', 'hair_care', 
                          'hardware_store', 'hindu_temple', 'home_goods_store', 'hospital', 'insurance_agency', 
                          'jewelry_store', 'laundry', 'lawyer', 'library', 'light_rail_station', 'liquor_store', 
                          'local_government_office', 'locksmith', 'lodging', 'meal_delivery', 'meal_takeaway', 
                          'mosque', 'movie_rental', 'movie_theater', 'moving_company', 'museum', 'night_club', 
                          'painter', 'park', 'parking', 'pet_store', 'pharmacy', 'physiotherapist', 'plumber', 
                          'police', 'post_office', 'primary_school', 'real_estate_agency', 'restaurant', 
                          'roofing_contractor', 'rv_park', 'school', 'secondary_school', 'shoe_store', 
                          'shopping_mall', 'spa', 'stadium', 'storage', 'store', 'subway_station', 'supermarket', 
                          'synagogue', 'taxi_stand', 'tourist_attraction', 'train_station', 'transit_station', 
                          'travel_agency', 'university', 'veterinary_care', 'zoo'
                          ]


In [5]:
# Generic tags to strip from each place's types data.
#    note: these tags are not found in the google master list
exclude_list = ['point_of_interest', 'establishment']

In [6]:
# Load the dictionary fema_lifeline_dictionary from the data file.
# The file text is parsed with ast.literal_eval, which only accepts Python
# literal syntax. Later cells index the result by lifeline number (starting
# at 1) and iterate each entry's items() as (category, list of place types).

with open("../data/fema_lifeline_dictionary.txt","r") as data:
    fema_lifeline_dictionary = ast.literal_eval(data.read())


In [7]:
# Load the list place_type_list from the data file.
# The file text is parsed with ast.literal_eval, which only accepts Python
# literal syntax. The list is later passed to get_place_nearby, which makes
# one API call per entry.

with open("../data/place_type_list.txt","r") as data:
    place_type_list = ast.literal_eval(data.read())


# Function Sections

In [8]:
# Function: get_place_nearby

def get_place_nearby(user_location, user_radius, place_type_list, user_apikey):
    """Call the Google places_nearby API once per place type and combine the rows.

    Parameters:
        user_location: string holding the lat, long geo location,
            e.g. '39.7392, -104.9903'.
        user_radius: search radius in meters (5 miles = 8,046.72 m).
        place_type_list: list of google defined place types to retrieve on.
        user_apikey: the Google API key.

    Returns:
        A pandas DataFrame holding the 'results' entries of every call, with
        the columns named in columns_to_drop_list removed. An empty DataFrame
        is returned when no call was made.

    NOTE(review): only the first page of each response is read (no page
    token is passed on), so types with more matches than one page holds are
    truncated.
    """
    # Instantiate google maps
    gmaps = googlemaps.Client(key=user_apikey)

    # Columns that are not needed downstream
    columns_to_drop_list = ['icon', 'id',  'plus_code', 'opening_hours', 'photos', 'place_id', 'rating', 'reference',
                            'scope', 'user_ratings_total', 'price_level']

    # Per-call frames are collected and concatenated once at the end.
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every iteration.
    collected_frames = []
    ndx_count = 0   # total rows collected so far
    ndx_scale = 1   # next progress threshold, in hundreds of rows

    # loop through place_type_list and perform one places_nearby call per type
    for ndx_call, type_description in enumerate(place_type_list, start=1):

        places_nearby_info = gmaps.places_nearby(location=user_location,
                                                 radius=user_radius,
                                                 type=type_description)

        ndx_count += len(places_nearby_info['results'])

        # create data frame to store info
        results = pd.DataFrame(places_nearby_info['results'])

        # Only drop the columns that are present; a response does not always
        # contain every optional field and drop() would raise otherwise.
        columns_to_drop = [x for x in columns_to_drop_list if x in results.columns]
        collected_frames.append(results.drop(columns=columns_to_drop))

        # print a progress message each time another hundred rows is passed
        if ndx_count > (100 * ndx_scale):
            print(f'{ndx_call} calls and {ndx_count} rows of data collected')
            ndx_scale = int(round(ndx_count/100, 0)) + 1

        # to prevent overloading the call to google places_nearby
        time.sleep(3)

    # Report the final row count
    print(f'{ndx_count} rows of data collected')

    # pd.concat raises on an empty list, so keep the empty-frame result explicit
    if not collected_frames:
        return pd.DataFrame()

    return pd.concat(collected_frames, ignore_index=True, sort=False)

In [9]:
# Function get_distance

# Haversine formula for finding the distance between two lat, long. This code follows the example from Wayne Dyck.
#   Link: https://gist.github.com/rochacbruno/2883505 

def get_distance(lat_orgin, long_orgin, lat_destination, long_destination, radius=6378):
    """Return the great-circle distance between two points, rounded to 4 decimals.

    Parameters:
        lat_orgin, long_orgin: latitude and longitude of the origin, in degrees.
        lat_destination, long_destination: latitude and longitude of the
            destination, in degrees.
        radius: sphere radius; the result is in the same unit. Defaults to
            6378 (km, Earth's equatorial radius; the mean radius is about
            6371 km).

    Returns:
        The haversine distance as a float rounded to 4 decimal places.
    """
    length_lat = math.radians(lat_destination - lat_orgin)
    length_long = math.radians(long_destination - long_orgin)

    a = (math.sin(length_lat / 2) ** 2
         + math.cos(math.radians(lat_orgin))
         * math.cos(math.radians(lat_destination))
         * math.sin(length_long / 2) ** 2)

    # Floating-point rounding can push `a` a hair outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise
    # "math domain error". Clamp it back into range.
    a = min(1.0, max(0.0, a))

    distance = radius * (2 * math.atan2(math.sqrt(a), math.sqrt(1 - a)))

    return round(distance, 4)
    

In [10]:
# Function: assign_lifeline

def assign_lifeline(types, list_data, number):
    """Return `number` when any entry of `types` is found in `list_data`, else 0.

    types     ==> iterable of place types for one row
    list_data ==> place types belonging to the lifeline being tested
    number    ==> lifeline number to hand back on a match
    """
    has_match = any(entry in list_data for entry in types)
    return number if has_match else 0

In [11]:
# Function: cleanup_types

def cleanup_types(types, exclude_list):
    """Turn a row's types data into a list with the excluded entries removed.

    Parameters:
        types: the types data, either the string form of a list as read back
            from a CSV file (e.g. "['bank', 'establishment']") or a list/tuple
            that is already parsed.
        exclude_list: entries to leave out of the result.

    Returns:
        A new list of the remaining entries, in their original order. The
        input is never modified.
    """
    if isinstance(types, str):
        try:
            parsed = ast.literal_eval(types)
        except (ValueError, SyntaxError, TypeError):
            parsed = None

        if isinstance(parsed, (list, tuple)):
            type_list = list(parsed)
        else:
            # Not a list literal: fall back to the plain-text split, but do
            # not turn an empty body into [''].
            inner = types[1:len(types) - 1].replace("'", "")
            type_list = inner.split(", ") if inner else []
    else:
        type_list = list(types)

    # Filtering removes every occurrence of an excluded entry, not only the first
    return [x for x in type_list if x not in exclude_list]

In [12]:
# Function: set_duplicate_flag

def set_duplicate_flag(ndx, duplicate_list):
    """Flag one value as duplicate ('D') or single ('S').

    ndx            ==> the value to look up (the row's geometry data)
    duplicate_list ==> collection of values known to be duplicated

    Returns 'D' when ndx is found in duplicate_list, otherwise 'S'.
    """
    return 'D' if ndx in duplicate_list else 'S'
            

In [13]:
# Function: analyze_duplicate

# Decides, for a row flagged as duplicate, whether it is the copy to keep.
# The decision follows a hierarchy of type information and lifeline number.
# Returns the flag as follows:
#    'S' : single / stay (keep the row)
#    'R' : remove
#
# Hierarchy (types that must be present ==> lifeline number that is kept):
#    1) 'gas_station' over all  ==> 4
#    2) 'drugstore' over 'food' ==> 3
#    3) 'hospital' over 'airport' ==> 3
#    4) 'hospital' over 'local_government_office' ==> 3
#    5) 'hospital' over 'university' ==> 3
#    6) 'hardware_store' over 'car_dealer' ==> 2
#    7) 'restaurant' over 'car_repair' ==> 2
#
# pass in row ==> one row of df_data; reads 'duplicate_flag', 'types' and
#                 'lifeline_number'

def analyze_duplicate(row):

    # rows that are not duplicates always stay
    if row['duplicate_flag'] != 'D':
        return 'S'

    types = row['types']

    # (types that must all be present, lifeline number that wins), in priority order
    hierarchy = (
        (('gas_station',), 4),
        (('drugstore', 'food'), 3),
        (('hospital', 'airport'), 3),
        (('hospital', 'local_government_office'), 3),
        (('hospital', 'university'), 3),
        (('hardware_store', 'car_dealer'), 2),
        (('restaurant', 'car_repair'), 2),
    )

    for required_types, winning_number in hierarchy:
        # the lifeline number is only looked at once the types have matched
        if all(tag in types for tag in required_types) and row['lifeline_number'] == winning_number:
            return 'S'

    return 'R'


In [14]:
# Function: double_check_duplicate_keys

# Double check that every duplicate marked for removal still has a copy that stays.
# row ==> one row of df_data under evaluation
# df  ==> df_check, the subset of data where the duplicate flag is set and the
#         removal flag says stay
#
# Returns:
#    'S' : the row is not a duplicate marked for removal, or a row with the
#          same geometry is found in df
#    'M' : missed; the row is marked for removal but no row in df shares its geometry

def double_check_duplicate_keys(row, df):

    marked_for_removal = row['duplicate_flag'] == 'D' and row['removal_flag'] == 'R'
    if not marked_for_removal:
        return 'S'

    geo = row['geometry']
    keeper_found = any(geo == candidate for candidate in df['geometry'])

    return 'S' if keeper_found else 'M'


In [15]:
# Function: combine_lat_lng_info

def combine_lat_lng_info(row):
    """Return the row's 'latitude' and 'longitude' values as a two-item list."""
    return [row['latitude'], row['longitude']]
        

In [16]:
# Function: get_elevation_data

def get_elevation_data(row, user_apikey):
    """Look up the elevation for one row through the Google Elevation API.

    row         ==> row holding 'latitude' and 'longitude'
    user_apikey ==> Google API key

    A new googlemaps client is created on every call. Returns the
    'elevation' value of the first result, converted with int().
    """
    client = googlemaps.Client(key=user_apikey)

    coordinates = combine_lat_lng_info(row)
    elevation_results = client.elevation(locations=[coordinates])

    first_result = elevation_results[0]
    return int(first_result['elevation'])

# Error suppression

In [17]:
# I keep getting warning messages, which would tie up my EDA cleaning processing.
#     This stops every warning from being shown, but it should not be used on a regular basis. Not a good practice.

import warnings
warnings.filterwarnings('ignore')

# Gathering Data via Google places_nearby API

In [18]:
user_location = '39.7392358, -104.990251' # Denver geo location lat:39.7392358  long:-104.990251
user_radius = 50000                       # In meters: 5 miles = 8,046.72 m (use 1000 for test purpose) Max: 50,000
                         

In [21]:
# Collect the raw places_nearby rows for every type in place_type_list
saved_results = get_place_nearby(
    user_location=user_location,
    user_radius=user_radius,
    place_type_list=place_type_list,
    user_apikey=apikey,
)

6 calls and 120 rows of data collected
11 calls and 220 rows of data collected
16 calls and 320 rows of data collected
21 calls and 401 rows of data collected
27 calls and 509 rows of data collected
32 calls and 608 rows of data collected
37 calls and 708 rows of data collected
42 calls and 808 rows of data collected
49 calls and 905 rows of data collected
945 rows of data collected


In [22]:
# Save the raw results. Note: use the date at the end of the file name...
# index = False leaves the DataFrame index out of the CSV file.

saved_results.to_csv('../data/raw_data/places_nearby_results_rawdata_denver_co_20200220.csv', index = False)

# EDA: Build FEMA Lifeline Database

Processing Step:

- Remove duplicates and reset index
- Extract latitude and longitude info from geometry column and create latitude and longitude columns
- Clean up types data. Remove unnecessary data and cast data from string to list
- Clean up plus_type data. Cast data from string to dict
- Assign lifeline and category to data
- Find and remove duplicate data across database
- Call Google Elevation API to get elevation data for each business
- Save database as clean data


In [28]:
# Retrieve the raw data from the '../data/raw_data/places_nearby_results_rawdata_denver_co_20200220.csv' file.
# The geometry and types columns are treated as text by the following cells
# (parsed with ast.literal_eval and cleanup_types).
df_place = pd.read_csv('../data/raw_data/places_nearby_results_rawdata_denver_co_20200220.csv')

In [29]:
# Sort by geometry, keep the first row of each geometry, then renumber the index
df_place = (
    df_place
    .sort_values('geometry')
    .drop_duplicates(subset='geometry', keep='first')
    .reset_index(drop=True)
)

In [30]:
# Split the geometry column into latitude and longitude, and calculate the
# distance of every place from the point of origin.

origin_parts = user_location.split(",")
user_location_lat = float(origin_parts[0])
user_location_lng = float(origin_parts[1])

# Parse each geometry string once and reuse the resulting location for all
# three new columns.
locations = [ast.literal_eval(geo)['location'] for geo in df_place['geometry']]

df_place['latitude'] = [loc['lat'] for loc in locations]
df_place['longitude'] = [loc['lng'] for loc in locations]

df_place['distance'] = [get_distance(user_location_lat,
                                     user_location_lng,
                                     loc['lat'],
                                     loc['lng'])
                        for loc in locations]


In [31]:
# Clean up the types data: every value goes through cleanup_types with exclude_list
df_place['types'] = df_place['types'].apply(cleanup_types, args=(exclude_list,))


### Assign lifeline number and category section:

In [32]:
# Assign lifeline number and category to the data.
#
# For every lifeline entry, the rows of df_place whose types match the
# entry's place-type list are copied into df_data with lifeline_number and
# lifeline_category attached. A row that matches several lifelines appears
# once per match; those duplicates are resolved in the duplicate-rows section.

ndx = 1
df_sub = pd.DataFrame()
lifeline_frames = []

while ndx <= len(fema_lifeline_dictionary):
    for key, value in fema_lifeline_dictionary[ndx].items():
        # Boolean mask of matching rows; df_place itself is left untouched,
        # so no temporary column has to be added and dropped again.
        is_match = df_place['types'].apply(lambda x: assign_lifeline(x, value, ndx)) == ndx

        # assign() builds a new frame, so nothing is written to a slice of
        # df_place (the original column assignment on the slice triggers
        # pandas' SettingWithCopyWarning).
        df_sub = df_place[is_match].assign(lifeline_number=ndx, lifeline_category=key)
        lifeline_frames.append(df_sub)

        # NOTE(review): ndx advances once per (category, types) item, which
        # assumes each lifeline entry holds exactly one category - confirm
        # against the data file.
        ndx += 1

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
if lifeline_frames:
    df_data = pd.concat(lifeline_frames, sort=False, ignore_index=True)
else:
    df_data = pd.DataFrame()

In [33]:
# Save the data with lifeline number and category assigned (index left out of the file)
df_data.to_csv('../data/raw_data/data_with_lifeline_info_rawdata_20200220.csv', index = False)

# EDA: Duplicate rows

The following routine will identify duplicate rows and remove them

In [36]:
# Retrieve the data from the '../data/raw_data/data_with_lifeline_info_rawdata_20200220.csv' file
df_data = pd.read_csv('../data/raw_data/data_with_lifeline_info_rawdata_20200220.csv')

In [37]:
# get the subset of rows whose geometry repeats an earlier row
df_dup = df_data[df_data.duplicated(subset='geometry', keep='first')]

# Build the lookup once, as a set. Rebuilding a list for every row and
# searching it linearly makes this step quadratic in the number of rows.
duplicate_geometries = set(df_dup['geometry'])

# set the duplicate flags: 'D' for every row sharing a duplicated geometry, 'S' otherwise
df_data['duplicate_flag'] = df_data['geometry'].apply(lambda x: set_duplicate_flag(x, duplicate_geometries))

# analyze the duplicate rows: 'S' to keep the row, 'R' to remove it
df_data['removal_flag'] = df_data.apply(analyze_duplicate, axis = 1)

# capture a subset of data where the rows have been marked duplicate and removal set to 'S'
df_check = df_data[(df_data['duplicate_flag'] == 'D') & (df_data['removal_flag'] == 'S')]

# double check that every duplicate marked for removal has a kept counterpart ('M' = missed)
df_data['check_flag'] = df_data.apply(lambda x: double_check_duplicate_keys(x, df_check), axis = 1)

In [38]:
# Final check: show any duplicate rows marked 'M' (missed). No rows shown means none were missed.
df_data[df_data['check_flag'] == 'M']

Unnamed: 0,geometry,name,types,vicinity,latitude,longitude,distance,lifeline_number,lifeline_category,duplicate_flag,removal_flag,check_flag


In [39]:
# keep the rows marked 'M' (missed) in their own data frame for inspection
df_missed = df_data[df_data['check_flag'] == 'M']

In [40]:
# renumber the missed rows from zero, discarding the old index
df_missed = df_missed.reset_index(drop=True)

In [41]:
# preview the first missed rows, if any
df_missed.head()

Unnamed: 0,geometry,name,types,vicinity,latitude,longitude,distance,lifeline_number,lifeline_category,duplicate_flag,removal_flag,check_flag


# EDA: Removal of duplicate rows

If no rows are marked with 'M', it is safe to remove the duplicate rows flagged for removal.


In [43]:
# Drop the rows flagged 'R' across the lifelines, renumber the index, and
# discard the temporary flag columns used by the duplicate analysis.
df_data = (
    df_data[df_data['removal_flag'] != 'R']
    .reset_index(drop=True)
    .drop(columns=['duplicate_flag', 'removal_flag', 'check_flag'])
)

# Get Geo info: Altitude/Elevation

In [44]:
# One Elevation API lookup per row; apikey is handed to get_elevation_data as its second argument
df_data['elevation'] = df_data.apply(get_elevation_data, axis=1, args=(apikey,))

In [45]:
# Keep only the final columns, in the order used for the clean data file
df_data = df_data.loc[:, [
    'geometry',
    'latitude',
    'longitude',
    'distance',
    'elevation',
    'name',
    'types',
    'vicinity',
    'lifeline_number',
    'lifeline_category',
]]


In [46]:
# Save the cleaned data frame (index left out of the file).
df_data.to_csv('../data/clean_data/fema_lifeline_info_clean_20200220.csv', index = False)


In [48]:
# (rows, columns) of the cleaned data frame
df_data.shape

(814, 10)