In [None]:
import pandas as pd
import numpy as np

import json

import matplotlib.cm as cm
import matplotlib.colors as colors
import folium

import credentials
import requests
import traceback

from os import mkdir
from os import listdir

from datetime import datetime
from time import sleep

#from selenium import webdriver
#from bs4 import BeautifulSoup as bs

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn import preprocessing

from geopy import Point
from geopy.geocoders import ArcGIS
from geopy.distance import geodesic
from geopy.distance import VincentyDistance

from IPython.core.display import display, HTML

In [None]:
display(HTML("<style>.container { width:80% !important; }</style>"))

In [None]:
CLIENT_ID = credentials.credentials['CLIENT_ID'] 
CLIENT_SECRET = credentials.credentials['CLIENT_SECRET']
ACCESS_TOKEN = credentials.credentials['ACCESS_TOKEN']
#VERSION = '20180604'
VERSION = '20200527'

<h3>Import and clean data from census.gov</h3>

I used this video to help get demographic data from census.gov: https://www.youtube.com/watch?v=K0-ifZS0mQI.

Since the columns were awkwardly labeled, and I was unfamiliar with the contents of the file, it made sense to do a lot of the cleaning manually in MS Excel.

The CSV file contains socio-economic data from 519 metro/micropolitan areas in the United States.

In [None]:
metro_df = pd.read_csv('DP03_All_Metros.csv')
metro_df.head()

<h5>Make sure all Series are correct data type</h5>

In [None]:
metro_df.dtypes

<h5>The API returns all null values as -999999999.0. Convert them to NaN, then fill each column's missing values with that column's mean.</h5>

In [None]:
metro_df[metro_df['geographic area name'] == 'Fort Payne']

In [None]:
metro_df.replace(-999999999.0, np.nan, inplace = True)

In [None]:
metro_df[metro_df['geographic area name'] == 'Fort Payne']

In [None]:
# Replace the remaining NaNs in every data column (column 4 onwards) with that column's mean.
for column in metro_df.columns[4:]:
    # Assign the filled column back explicitly: an in-place method called on
    # metro_df[column] works on a temporary selection and is not guaranteed
    # to write through to metro_df itself.
    metro_df[column] = metro_df[column].fillna(metro_df[column].mean())

In [None]:
metro_df[metro_df['geographic area name'] == 'Fort Payne']

In [None]:
for column in metro_df:
    print((metro_df[column] == -999999999.0).value_counts())

<h3>Add coordinates to metro data</h3>

In [None]:
metro_df[['geographic area name', 'state']].head()

In [None]:
geolocator = ArcGIS(user_agent = 'IBM_DS_Capstone')

In [None]:
def locateMetroCities(cities, states, geocoder = None):
    '''
    Geocode every city named in a metro area.

    cities: str, name of metro. Eg: "Seattle-Tacoma-Bellevue", or "Idaho Falls"
    states: str, state codes. Eg: "ID-WA", or "OR"
    geocoder: optional object exposing geocode(query); defaults to the
              notebook-level `geolocator`

    Returns a list with one [latitude, longitude] pair per city, in the order
    the cities appear in `cities`.

    Raises ValueError when a city cannot be geocoded in any of the given states.
    '''
    if geocoder is None:
        geocoder = geolocator

    city_list = cities.split('-')
    state_list = states.split('-')

    lat_long_list = []

    for city in city_list:

        first_hit = None    # first non-empty result, used if no state gives an exact match
        exact_hit = None    # result whose address starts with the requested city name

        for state in state_list:

            location = geocoder.geocode(city + ', ' + state)
            if location is None:
                continue

            if first_hit is None:
                first_hit = location

            # makes sure we have the right city in the right state
            if location.address.split(', ')[0] == city:
                exact_hit = location
                break

        location = exact_hit if exact_hit is not None else first_hit
        if location is None:
            raise ValueError('Could not geocode "' + city + '" in any of the states: ' + states)

        lat_long_list.append([location.latitude, location.longitude])

    return lat_long_list

<h5>Fetching all lat-long data in one go often times out, so collect the data in batches. Remember that <code>.loc</code> slicing is inclusive of both endpoints.</h5>

In [None]:
lat_long = metro_df.loc[0:74].apply(lambda x: locateMetroCities(x['geographic area name'], x['state']), axis = 1)

In [None]:
lat_long2 = metro_df.loc[75:148].apply(lambda x: locateMetroCities(x['geographic area name'], x['state']), axis = 1)

In [None]:
lat_long3 = metro_df.loc[149:222].apply(lambda x: locateMetroCities(x['geographic area name'], x['state']), axis = 1)

In [None]:
lat_long4 = metro_df.loc[223:296].apply(lambda x: locateMetroCities(x['geographic area name'], x['state']), axis = 1)

In [None]:
lat_long5 = metro_df.loc[297:370].apply(lambda x: locateMetroCities(x['geographic area name'], x['state']), axis = 1)

In [None]:
lat_long6 = metro_df.loc[371:444].apply(lambda x: locateMetroCities(x['geographic area name'], x['state']), axis = 1)

In [None]:
lat_long7 = metro_df.loc[445:519].apply(lambda x: locateMetroCities(x['geographic area name'], x['state']), axis = 1)

In [None]:
print(len(lat_long), len(lat_long2), len(lat_long3), len(lat_long4), len(lat_long5), len(lat_long6), len(lat_long7))

<h5>Save lat_long as csv</h5>

In [None]:
lat_long = pd.concat([lat_long, lat_long2, lat_long3, lat_long4, lat_long5, lat_long6, lat_long7])

In [None]:
lat_long.to_csv('lat_long.csv', header = False, index = False)

<h5>Load lat_long from csv</h5>

In [None]:
lat_long = pd.read_csv('lat_long.csv', header = None)
lat_long.head()

In [None]:
metro_df.insert(5, 'lat_long', lat_long)
metro_df.head()

In [None]:
def lat_long_str_to_list_of_floats(lat_long_str):
    '''
    Turn the text form of a list of coordinate pairs back into floats.

    lat_long_str: str such as "[[47.6, -122.3], [47.2, -122.4]]"

    Returns a list of [lat, long] lists of floats. A trailing unpaired value,
    if any, is ignored.
    '''
    # Drop every bracket so only a flat, comma-separated run of numbers remains
    tokens = lat_long_str.replace('[', '').replace(']', '').split(',')

    pair_total = len(tokens) // 2

    return [[float(tokens[2 * n]), float(tokens[2 * n + 1])] for n in range(pair_total)]

In [None]:
metro_df['lat_long'] = pd.Series(map(lat_long_str_to_list_of_floats, metro_df['lat_long']))

<h3>Pick a region of interest</h3>

<h5>Pacific Northwest (WA, OR, ID). Needed the plain WA, OR, ID state codes AND the wildcards to get all the relevant metros</h5>

In [None]:
# Keep every metro whose hyphen-separated state codes include WA, OR or ID
# (e.g. "WA", "OR-WA", "ID-UT"). A single word-bounded pattern is used instead of
# a chain of `==` and `|` tests: `|` binds tighter than `==`, so mixing them
# without full parenthesisation silently drops some of the conditions.
PNW_metro_df = metro_df[metro_df['state'].str.contains(r'\b(?:WA|OR|ID)\b', na = False)]
# reset_index returns a new frame, so later column inserts do not touch a slice of metro_df
PNW_metro_df = PNW_metro_df.reset_index(drop = True)
PNW_metro_df.head()

<h3>Get restaurant type frequencies for each metro</h3>

<h5>Get category JSON from Foursquare</h5>

In [None]:
url = 'https://api.foursquare.com/v2/venues/categories?&client_id={}&client_secret={}&v={}'.format(
                CLIENT_ID, 
                CLIENT_SECRET,
                VERSION)

categories_response = requests.get(url)
# Fail here with the HTTP status rather than later with an obscure KeyError
categories_response.raise_for_status()
categories = categories_response.json()

# Pick the top-level 'Food' category by name instead of relying on its position in the list
food_category_dict = next(
    (category for category in categories['response']['categories'] if category['name'] == 'Food'),
    None)
if food_category_dict is None:
    raise LookupError("No top-level 'Food' category in the Foursquare response")

<h5>Get all 'id' values from returned JSON</h5>

In [None]:
def gen_dict_extract(key, var):
    '''
    https://stackoverflow.com/questions/9807634/find-all-occurrences-of-a-key-in-nested-dictionaries-and-lists
    Title: Find all occurrences of a key in nested dictionaries and lists
    by user: hexerei software
    28 MAY 2020

    Generator: yields every value stored under `key` anywhere in a nested dictionary.
    'var' is the dict. Dict values are searched recursively, and so is each
    element of a list value. Anything without an .items() method yields nothing.
    '''
    if not hasattr(var, 'items'):
        return

    for name, value in var.items():
        if name == key:
            yield value

        # A matching value that is itself a dict is yielded above AND searched here
        if isinstance(value, dict):
            yield from gen_dict_extract(key, value)
        elif isinstance(value, list):
            for element in value:
                yield from gen_dict_extract(key, element)

In [None]:
# Collect every category id / name under the food category, then
# remove generic 'Food' category (the first entry of each list)
id_list = list(gen_dict_extract('id', food_category_dict))[1:]
name_list = list(gen_dict_extract('name', food_category_dict))[1:]

In [None]:
# Two-way lookup between category names and category ids (the lists are parallel)
name_id_dict = {name: id_list[position] for position, name in enumerate(name_list)}
id_name_dict = {id_list[position]: name for position, name in enumerate(name_list)}

In [None]:
def getRestaurantCoordsAndType(lat_long_list, category_list, limit = 50, radius = 25000, redo50 = False):
    '''
    Query Foursquare "explore" for every (coordinate, category) combination and
    return the venues found as one de-duplicated DataFrame.

    lat_long_list: list of [lat, long] pairs to search around
    category_list: list of Foursquare category ids
    limit: value sent as the `limit` query parameter of each request
    radius: value sent as the `radius` query parameter of each request
    redo50: True when the call is one of the smaller follow-up searches made after
            a search came back full; follow-up searches are not split again

    Returns a DataFrame with (at least) the columns name, category, category_id,
    lat, long and the venue id. The id column is called 'venue_id' for a normal
    call and 'id' when redo50 is True. Venues in non-restaurant categories
    (gas stations, grocery stores, ...) are removed.

    KeyError / IndexError / TypeError raised while unpacking one response are
    printed with a traceback and that response is skipped.
    '''
    # Foursquare will only return a max of 50 venues per request
    foursquare_max_results = 50
    full_response_size = min(limit, foursquare_max_results)

    excluded_categories = ['Gas Station', 'Grocery Store', 'Food Court', 'Food',
                           'Fast Food Restaurant', 'Coffee Shop', 'Shopping Plaza']
    # Raw venue fields that are not kept; not every response contains all of them
    unused_columns = ['categories', 'location', 'photos', 'venuePage', 'delivery', 'events']

    # Start from an empty frame so the expected columns exist even when nothing is found.
    # Frames are collected in a list and concatenated once, instead of re-copying the
    # growing result on every request.
    venue_frames = [pd.DataFrame(columns = ['name', 'category', 'category_id', 'id', 'lat', 'long'])]

    for lat_long in lat_long_list:

        lat = lat_long[0]
        long = lat_long[1]

        for category in category_list:

            url = 'https://api.foursquare.com/v2/venues/explore?&openNow=0&time=any&sortByPopularity=1&categoryId=' + category + '&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
                        CLIENT_ID, 
                        CLIENT_SECRET, 
                        VERSION, 
                        lat, 
                        long, 
                        radius, 
                        limit)

            result = requests.get(url).json()

            #Keep relevant data only
            try:
                response = result['response']
                items = response['groups'][0]['items'] if 'groups' in response else []

                #If the max is reached, this section splits the initial radial search into several smaller searches
                if items and len(items) >= full_response_size and not redo50:

                    origin = Point(lat, long)
                    lat_long_expanded_list = []
                    for degree in [0, 45, 90, 135, 180, 225, 270, 315]:
                        # N/E/S/W centres sit at half the radius, the diagonal ones at 2/5 of it
                        if degree % 90 == 0:
                            distance = radius/2
                        else:
                            distance = radius * (2/5)
                        lat_long_expanded_list.append(list(geodesic(meters = distance).destination(origin, degree))[0:2])

                    venue_frames.append(getRestaurantCoordsAndType(lat_long_expanded_list, [category], limit = limit, radius = radius/2, redo50 = True))

                elif items:

                    venues_df = pd.DataFrame(list(gen_dict_extract('venue', response['groups'][0])))

                    # Venues without any category cannot be classified; leave them out
                    venues_df = venues_df[venues_df['categories'].map(len) > 0].copy()
                    venues_df['category'] = venues_df['categories'].map(lambda x: x[0]['name'])
                    venues_df['category_id'] = venues_df['categories'].map(lambda x: x[0]['id'])

                    venues_df['lat'] = venues_df['location'].map(lambda x: x['lat'])
                    venues_df['long'] = venues_df['location'].map(lambda x: x['lng'])

                    # errors = 'ignore': a response lacking one of these optional fields
                    # must not cause the whole response to be thrown away
                    venues_df = venues_df.drop(columns = unused_columns, errors = 'ignore')

                    venue_frames.append(venues_df)

            except (KeyError, IndexError, TypeError) as err:

                print(type(err).__name__ + ' for cat: ' + category + ' at ' + str(lat) + ', ' + str(long))
                traceback.print_exc()

    allVenues_df = pd.concat(venue_frames, sort = False)

    # Overlapping searches return the same venue more than once
    allVenues_df = allVenues_df.drop_duplicates(subset = 'id', keep = 'last')
    allVenues_df = allVenues_df[~allVenues_df['category'].isin(excluded_categories)]

    # Follow-up searches keep the raw 'id' column so the outer call can still de-duplicate on it
    if not redo50:
        allVenues_df = allVenues_df.rename(columns = {'id' : 'venue_id'})

    return allVenues_df.reset_index(drop = True)

<h5>Foursquare only allows 5000 API calls an hour, so batch the calls and delay between batches</h5>

In [None]:
# One dict per batch; the "Combine all dicts" cell below merges them.
# All seven must exist up front, otherwise a late batch fails with a NameError
# after hours of waiting.
metro_venue_dict = {}
metro_venue_dict2 = {}
metro_venue_dict3 = {}
metro_venue_dict4 = {}
metro_venue_dict5 = {}
metro_venue_dict6 = {}
metro_venue_dict7 = {}

batch_dicts = [metro_venue_dict, metro_venue_dict2, metro_venue_dict3, metro_venue_dict4,
               metro_venue_dict5, metro_venue_dict6, metro_venue_dict7]

BATCH_SIZE = 5                  # metros per batch
COOLDOWN_SECONDS = 60*45        # pause between batches

batch_starts = range(0, len(PNW_metro_df), BATCH_SIZE)

for batch_number, batch_start in enumerate(batch_starts, start = 1):

    print('Starting batch ' + str(batch_number) + ' at ' + datetime.now().strftime("%H:%M:%S") + '\n')

    # Batches beyond the seventh (if the region ever has more than 35 metros) go into the last dict
    batch_dict = batch_dicts[min(batch_number, len(batch_dicts)) - 1]

    # .loc slicing is inclusive, hence the - 1
    for i in PNW_metro_df.loc[batch_start : batch_start + BATCH_SIZE - 1].index:
        batch_dict[PNW_metro_df['geographic area name'][i]] = getRestaurantCoordsAndType(PNW_metro_df['lat_long'][i], id_list)

    # No cooldown needed after the last batch
    if batch_start != batch_starts[-1]:
        print('Batch complete. 45min cooldown started at ' + datetime.now().strftime("%H:%M:%S") + '\n\n')
        sleep(COOLDOWN_SECONDS)

print('Final batch complete at ' + datetime.now().strftime("%H:%M:%S"))

<h5>Combine all dicts</h5>

In [None]:
metro_venue_dict = {**metro_venue_dict, **metro_venue_dict2, **metro_venue_dict3, **metro_venue_dict4, **metro_venue_dict5, **metro_venue_dict6, **metro_venue_dict7}

In [None]:
# Create the output folder on a first run; carry on if it is already there
try:
    mkdir('./metro_venues')
except FileExistsError:
    pass

# One CSV per metro, named after the metro
for key in metro_venue_dict.keys():
    metro_venue_dict[key].to_csv('./metro_venues/' + key + '.csv', index = False)

<h5>Recreate dict from csv files</h5>

In [None]:
metro_venue_dict = {}

# listdir returns entries in arbitrary order; sort so the dict (and everything
# derived from its key order) is the same on every run
for file in sorted(listdir('./metro_venues')):

    # Skip anything that is not a saved venue table (hidden files, checkpoint folders, ...)
    if not file.endswith('.csv'):
        continue

    # The metro name is the file name without the '.csv' extension
    metro_venue_dict[file[0:-4]] = pd.read_csv('./metro_venues/' + file)

metro_venue_dict['Aberdeen'].head()

<h3>Cluster Metros</h3>

<h5>Calculate frequency of restaurant types in each metro</h5>

In [None]:
# Rows: every restaurant type from the category list. Columns: one per metro.
venue_frequency_df = pd.DataFrame(name_list)
venue_frequency_df = venue_frequency_df.set_index(0)
# Clear the index label by assignment; `del index.name` is rejected by current pandas
venue_frequency_df.index.name = None

for key in metro_venue_dict.keys():

    # value_counts is aligned on the restaurant-type index; types absent from a metro become NaN
    venue_frequency_df[key] = metro_venue_dict[key]['category'].value_counts()

# A restaurant type that never occurs in a metro has a count of 0
venue_frequency_df = venue_frequency_df.fillna(0)

venue_frequency_df.head()

<h5>Normalise frequencies</h5>

In [None]:
# Rescale the frequency table with scikit-learn's MinMaxScaler and wrap the
# result in a new DataFrame.
# NOTE(review): the new frame is built without index/columns arguments, so it
# presumably carries default integer labels rather than the restaurant types
# and metro names of venue_frequency_df — confirm before selecting by name.
min_max_scaler = preprocessing.MinMaxScaler()
frequency_normed_df = pd.DataFrame(min_max_scaler.fit_transform(venue_frequency_df))

frequency_normed_df.head()

<h5>Fit K-means model</h5>

In [None]:
k = 3

# Transpose so that each metro is one sample described by its restaurant-type frequencies.
# random_state pins the centroid initialisation so the cluster numbering and membership
# are the same on every run.
kmeans = KMeans(n_clusters = k, random_state = 0).fit(frequency_normed_df.transpose())
kmeans.labels_[0:5]#sneakpeek

<h5>Insert cluster labels into metro_df</h5>

In [None]:
# kmeans.labels_ is ordered like the columns of venue_frequency_df (one column per
# metro), which is not necessarily the row order of PNW_metro_df — so match the
# labels to the metros by name rather than by position.
cluster_by_metro = pd.Series(kmeans.labels_, index = venue_frequency_df.columns)
cluster_labels = PNW_metro_df['geographic area name'].map(cluster_by_metro)

unlabelled = PNW_metro_df['geographic area name'][cluster_labels.isna()]
if len(unlabelled) > 0:
    raise ValueError('No cluster label for: ' + ', '.join(unlabelled.astype(str)))

cluster_labels = cluster_labels.astype(int)

# Insert the column on a first run, overwrite it when the cell is re-run
if 'cluster' in PNW_metro_df.columns:
    PNW_metro_df['cluster'] = cluster_labels
else:
    PNW_metro_df.insert(0, 'cluster', cluster_labels)

PNW_metro_df.head()

In [None]:
PNW_metro_df['cluster'].value_counts()

<h3>Visualise the metro clusters</h3>

<h5>Colour-code and overlay the metros on a map</h5>

In [None]:
map_clusters = folium.Map(location = [45.3, -118],
                          zoom_start = 6,
                          tiles = "Stamen Toner",
                          width = '100%',
                          height = '100%')

# One colour per K-means cluster, evenly spaced along the rainbow colormap
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = list(map(colors.rgb2hex, colors_array))

# add markers to the map
markers_colors = []

for i in PNW_metro_df.index:

    # A metro is drawn at the coordinates of the first city in its name
    first_city_lat_long = PNW_metro_df.at[i, 'lat_long'][0]
    lat = first_city_lat_long[0]
    long = first_city_lat_long[1]

    metro = PNW_metro_df.at[i, 'geographic area name']
    pop = PNW_metro_df.at[i, 'population_16_up']
    cluster = PNW_metro_df.at[i, 'cluster']

    popup_text = '\n'.join([str(metro), 'Population: ' + str(pop), 'Cluster: ' + str(cluster)])
    label = folium.Popup(popup_text, parse_html=True)

    marker = folium.CircleMarker(
        [lat, long],
        radius=3,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7)
    marker.add_to(map_clusters)

map_clusters

The map above shows the various metro/micropolitan areas in Washington, Oregon, and Idaho. The areas have been K-means clustered according to the ratio of different restaurant types in each of them.

<h3>Calculate mean and std of each cluster</h3>

In [None]:
frequency_normed_df.columns = venue_frequency_df.columns

In [None]:
frequency_normed_df.head()

<h5>Calculate the mean and standard deviation of each cluster. Create a DF for each cluster</h5>

In [None]:
# cluster_mean_std[c] = [row-wise mean, row-wise std] over the metros labelled with cluster c
cluster_mean_std = []

for cluster_label in range(3):

    metros_in_cluster = PNW_metro_df[PNW_metro_df['cluster'] == cluster_label]['geographic area name']

    cluster_mean_std.append([
        frequency_normed_df[metros_in_cluster].mean(axis = 1),
        frequency_normed_df[metros_in_cluster].std(axis = 1),
    ])

In [None]:
# Names of the metros that fall in each cluster
cluster0_metros = PNW_metro_df[PNW_metro_df['cluster'] == 0]['geographic area name']
cluster1_metros = PNW_metro_df[PNW_metro_df['cluster'] == 1]['geographic area name']
cluster2_metros = PNW_metro_df[PNW_metro_df['cluster'] == 2]['geographic area name']

# Per restaurant type: mean and standard deviation across the metros of a cluster
cluster0_mean = frequency_normed_df[cluster0_metros].mean(axis = 1)
cluster0_std = frequency_normed_df[cluster0_metros].std(axis = 1)

cluster1_mean = frequency_normed_df[cluster1_metros].mean(axis = 1)
cluster1_std = frequency_normed_df[cluster1_metros].std(axis = 1)

cluster2_mean = frequency_normed_df[cluster2_metros].mean(axis = 1)
cluster2_std = frequency_normed_df[cluster2_metros].std(axis = 1)

In [None]:
# One frame per cluster holding only that cluster's metro columns ...
cluster0_df = frequency_normed_df[PNW_metro_df.loc[PNW_metro_df['cluster'] == 0, 'geographic area name']]
cluster1_df = frequency_normed_df[PNW_metro_df.loc[PNW_metro_df['cluster'] == 1, 'geographic area name']]
cluster2_df = frequency_normed_df[PNW_metro_df.loc[PNW_metro_df['cluster'] == 2, 'geographic area name']]

# ... with the cluster statistics as the first two columns: cluster_mean, then cluster_std
cluster0_df.insert(0, 'cluster_mean', cluster0_mean)
cluster0_df.insert(1, 'cluster_std', cluster0_std)

cluster1_df.insert(0, 'cluster_mean', cluster1_mean)
cluster1_df.insert(1, 'cluster_std', cluster1_std)

cluster2_df.insert(0, 'cluster_mean', cluster2_mean)
cluster2_df.insert(1, 'cluster_std', cluster2_std)

In [None]:
cluster0_df.head()

<h5>Index cluster DFs with restaurant types</h5>

In [None]:
# Label the rows of every cluster frame with the restaurant types
for cluster_df in (cluster0_df, cluster1_df, cluster2_df):
    cluster_df.index = venue_frequency_df.index

cluster0_df.head()

<h3>Observe what restaurant types are most under represented in what cities</h3>
    <h5>Calculate how many stds each restaurant type frequency is away from the mean in each city</h5>

In [None]:
std_list = [[],[],[]]

In [None]:
# z-score of every metro against its own cluster: (frequency - cluster mean) / cluster std.
# The first two columns of each cluster frame are 'cluster_mean' and 'cluster_std';
# every column after them is a metro, however many metros the cluster contains.
std0_df = cluster0_df[cluster0_df.columns[2:]].subtract(cluster0_df['cluster_mean'], axis = 0).div(cluster0_df['cluster_std'], axis = 0)
std1_df = cluster1_df[cluster1_df.columns[2:]].subtract(cluster1_df['cluster_mean'], axis = 0).div(cluster1_df['cluster_std'], axis = 0)
std2_df = cluster2_df[cluster2_df.columns[2:]].subtract(cluster2_df['cluster_mean'], axis = 0).div(cluster2_df['cluster_std'], axis = 0)

std_list[0] = std0_df
std_list[1] = std1_df
std_list[2] = std2_df

# Rows whose cluster std is 0 or undefined come out as NaN and are deliberately left as NaN

In [None]:
std0_df.head()

In [None]:
std_list[0].head()

<h3>Make recommendations on which cities would be best for which restaurants</h3>
<h5>Calculate population/restaurant</h5>

In [None]:
# People (aged 16 and up) per restaurant found, one value per row of PNW_metro_df
pop_venue_ratio_list = []

# Walk the two columns together so each row uses its own population, instead of
# re-filtering the frame by name and converting a one-row selection to float
for metro, population in zip(PNW_metro_df['geographic area name'], PNW_metro_df['population_16_up']):

    venue_count = len(metro_venue_dict[metro])

    # A metro with no venues has no meaningful ratio; NaN keeps it out of the ranking
    pop_venue_ratio_list.append(float(population) / venue_count if venue_count else np.nan)

In [None]:
PNW_metro_df.insert(5, 'pop_venue_ratio', pop_venue_ratio_list)

PNW_metro_df.head()

<h5>Find top 3 metros with highest population/venue ratios</h5>

In [None]:
top_pop_venue_ratio_df = PNW_metro_df.nlargest(3, ["pop_venue_ratio"], keep = 'all')[['cluster', 'geographic area name', 'pop_venue_ratio']]
top_pop_venue_ratio_df.reset_index(inplace = True, drop = True)
top_pop_venue_ratio_df

<h5>Find 3 least represented restaurant types in each above city</h5>

In [None]:
# For each of the three selected metros, print the three restaurant types with the lowest z-scores
for rank in range(3):

    top_metro = top_pop_venue_ratio_df.iloc[rank]
    metro_name = top_metro['geographic area name']

    print(std_list[top_metro['cluster']].nsmallest(3, [metro_name], keep = 'all')[metro_name], '\n')

<h5>Get the most underrepresented restaurant types in each of the three cities above</h5>

In [None]:
top_underrepresented = [[], [], []]

In [None]:
# Names (index values) of the three lowest-scoring restaurant types for each selected metro
for rank in range(3):

    top_metro = top_pop_venue_ratio_df.iloc[rank]
    metro_name = top_metro['geographic area name']

    lowest_scores = std_list[top_metro['cluster']].nsmallest(3, [metro_name], keep = 'all')[metro_name]
    top_underrepresented[rank] = lowest_scores.index.values

<h3>Calculate the best locations for the most underrepresented restaurants, within their respective metros</h3>

<h5>Give our selected metros an "intrametro cluster" column</h5>

In [None]:
# Add a placeholder 'intrametro cluster' column to the venue table of each selected metro
for rank in range(3):
    selected_metro = top_pop_venue_ratio_df.iloc[rank]['geographic area name']
    metro_venue_dict[selected_metro]['intrametro cluster'] = 'not yet'

metro_venue_dict[top_pop_venue_ratio_df.iloc[0]['geographic area name']].head()

In [None]:
metro_venue_dict[top_pop_venue_ratio_df.iloc[0]['geographic area name']]['intrametro cluster'].value_counts()['not yet']

<h5>Cluster Restaurants in our three metros<br/>Use DBSCAN because of non-random distribution of buildings along streets</h5>

In [None]:
# One (n_venues, 2) array of [lat, long] per selected metro
resto_lat_long_list = [
    metro_venue_dict[top_pop_venue_ratio_df.iloc[rank]['geographic area name']][['lat', 'long']].to_numpy()
    for rank in range(3)
]

In [None]:
eps = 0.004

# Fit one DBSCAN model per selected metro on its venue coordinates
dbsacn_list = [
    DBSCAN(eps = eps, min_samples = 5).fit(resto_lat_long_list[rank])
    for rank in range(3)
]

dbsacn_list[0].labels_[0:4]

In [None]:
# Store each venue's DBSCAN label in the venue table of its metro
for rank in range(3):
    selected_metro = top_pop_venue_ratio_df.iloc[rank]['geographic area name']
    metro_venue_dict[selected_metro]['intrametro cluster'] = dbsacn_list[rank].labels_

metro_venue_dict[top_pop_venue_ratio_df.iloc[0]['geographic area name']].head()

<h5>Visualise top 10 most populated clusters</h5>

In [None]:
def getMostPopulatedClusters(metro, topXclusters = 10, viewOutliers = False):
    '''
    Return the labels of the intrametro clusters that contain the most venues.

    metro: str, key into metro_venue_dict
    topXclusters: int, how many cluster labels to return at most
    viewOutliers: when False, the label -1 (outliers) is not returned; its slot is
                  given to the next most populated cluster if there is one

    Returns a list of cluster labels, most populated first.
    '''
    counts_df = pd.DataFrame(metro_venue_dict[metro]['intrametro cluster'].value_counts())
    top_labels = list(counts_df[0:topXclusters].index)

    # Nothing to fix up when outliers are wanted or did not make the cut anyway
    if viewOutliers != False or -1 not in top_labels:
        return top_labels

    #Remove -1 (outliers) from clusters
    outlier_position = top_labels.index(-1)

    if len(counts_df) > topXclusters:
        # Promote the first cluster that just missed the cut into the freed slot
        top_labels[outlier_position] = counts_df.index[topXclusters]
    else:
        del top_labels[outlier_position]

    return top_labels

In [None]:
# Most populated intrametro clusters of each selected metro
clusters_list = [
    getMostPopulatedClusters(top_pop_venue_ratio_df.iloc[rank]['geographic area name'])
    for rank in range(3)
]

In [None]:
# Number of clusters to colour in each metro's map
k_list = [len(clusters_list[rank]) for rank in range(3)]

In [None]:
def visualiseVenueClusters(metro, k, clusters, mapType = "Stamen Toner"):
    '''
    Draw the venues of one metro on a folium map, coloured by intrametro cluster.

    metro: str, metro name; key into metro_venue_dict and value of
           'geographic area name' in metro_df
    k: int, number of colours to generate; must be at least len(clusters)
    clusters: list of cluster labels to show; venues in other clusters are not drawn
    mapType: tile set passed to folium.Map

    Returns the folium.Map, centred on the first city of the metro.
    '''
    venues_df = metro_venue_dict[metro]

    map_ = folium.Map(width = '100%',
                      height = '100%',
                      location =  metro_df[metro_df['geographic area name'] == metro]['lat_long'].values[0][0],
                      zoom_start = 9,
                      tiles = mapType)

    # k colours evenly spaced along the rainbow colormap
    rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, k))]

    # add markers to the map
    for i in venues_df.index:

        cluster = venues_df['intrametro cluster'][i]
        if cluster not in clusters:
            continue

        lat = venues_df['lat'][i]
        long = venues_df['long'][i]
        venue = venues_df['name'][i]
        category = venues_df['category'][i]

        # Colour by the cluster's position in `clusters`, not by its raw label
        cluster_index = clusters.index(cluster)

        label = folium.Popup(str(venue) + ': ' + str(category) +'\n\nCluster: ' + str(cluster), parse_html=True)

        folium.CircleMarker(
            [lat, long],
            radius=3,
            popup=label,
            color=rainbow[cluster_index],
            fill=True,
            fill_color=rainbow[cluster_index],
            fill_opacity=0.5).add_to(map_)

    return map_
    

In [None]:
visualiseVenueClusters(top_pop_venue_ratio_df.iloc[0]['geographic area name'], k_list[0], clusters_list[0])

In [None]:
visualiseVenueClusters(top_pop_venue_ratio_df.iloc[1]['geographic area name'], k_list[1], clusters_list[1])

In [None]:
visualiseVenueClusters(top_pop_venue_ratio_df.iloc[2]['geographic area name'], k_list[2], clusters_list[2])

<h5>Find centroids of clusters</h5>

In [None]:
def findCentroid(an_array):
    '''
    Average the first two columns of a 2-D array of points.

    an_array: array whose column 0 holds latitudes and column 1 longitudes

    Returns (mean of column 0, mean of column 1).
    '''
    n_points = an_array.shape[0]

    lat_total, long_total = np.sum(an_array[:, 0]), np.sum(an_array[:, 1])

    return lat_total / n_points, long_total / n_points

In [None]:
def findClusterCentroid(metro_venue_df, metro_clusters):
    """Compute the centroid of every requested cluster of one metro.

    Parameters
    ----------
    metro_venue_df : pandas.DataFrame
        Venue table with 'intrametro cluster', 'lat' and 'long' columns.
    metro_clusters : list
        Cluster labels to compute centroids for.

    Returns
    -------
    list
        One entry per label in `metro_clusters`, in the same order. Each
        entry is a one-element list holding the (lat, long) tuple returned
        by findCentroid.
    """
    # Keep only venues that belong to one of the requested clusters.
    in_requested = metro_venue_df['intrametro cluster'].isin(metro_clusters)
    candidate_venues = metro_venue_df[in_requested]

    def centroidOf(cluster):
        # Coordinates of the venues assigned to this single cluster.
        members = candidate_venues['intrametro cluster'] == cluster
        return findCentroid(candidate_venues[members][['lat', 'long']].to_numpy())

    # Each centroid is wrapped in its own list, exactly as before.
    return [[centroidOf(cluster)] for cluster in metro_clusters]

In [None]:
# Cluster centroids for the metros in rows 0, 1 and 2 of top_pop_venue_ratio_df:
# centroid_list[rank] is the findClusterCentroid result for that metro.
centroid_list = [
    findClusterCentroid(
        metro_venue_dict[top_pop_venue_ratio_df.iloc[rank]['geographic area name']],
        clusters_list[rank],
    )
    for rank in range(3)
]

centroid_list[0][0:5]

In [None]:
# Build one centroid table per metro, pairing each cluster label with its
# centroid. Iterating over the two lists together (instead of over an external
# counter) keeps this cell independent of whatever value a global loop bound
# happens to hold when the cell is run.
centroidDF_list = []

for metro_clusters, metro_centroids in zip(clusters_list, centroid_list):
    # The centroids are used as the index and then moved into a regular
    # column, giving a frame with columns 'lat_long' and 'cluster'.
    centroid_df = (
        pd.DataFrame(metro_clusters, pd.Series(metro_centroids))
        .reset_index()
        .rename(columns = {'index' : 'lat_long', 0 : 'cluster'})
    )
    centroidDF_list.append(centroid_df)

centroidDF_list[0]

<h5>Find clusters that don't contain the underrepresented restaurant type<br/>Find clusters that are furthest away from that restaurant type</h5>

In [None]:
def _findRecommendedClusters(metro_venue_df, metro_clusters, centroid_df, category):
    """Choose candidate clusters for a new `category` venue in one metro.

    Parameters
    ----------
    metro_venue_df : pandas.DataFrame
        Venue table with 'intrametro cluster', 'category', 'lat' and 'long'
        columns.
    metro_clusters : list
        Cluster labels under consideration for this metro.
    centroid_df : pandas.DataFrame
        Table with a 'cluster' column and a 'lat_long' column whose cells
        are one-element lists holding a (lat, long) tuple.
    category : str
        Venue category to recommend.

    Returns
    -------
    tuple
        (venues_incluster, clusters_with_top, clusters_wout_top, top_venues,
        min_distance_list, min_distance_df) where

        * venues_incluster  - venues located in any of `metro_clusters`
        * clusters_with_top - cluster label of every `category` venue
          (a Series, one entry per venue, so labels may repeat)
        * clusters_wout_top - labels from `metro_clusters` with no such venue
        * top_venues        - the `category` venues themselves
        * min_distance_list - [cluster, distance] pairs
        * min_distance_df   - the same pairs, indexed by cluster, with a
          single 'distance' column

        Distances are geodesic metres from the cluster centroid to the
        nearest `category` venue, or -1 when there is no such venue.
    """
    venues_incluster = metro_venue_df[metro_venue_df['intrametro cluster'].isin(metro_clusters)]
    top_venues = venues_incluster[venues_incluster['category'] == category]
    clusters_with_top = top_venues['intrametro cluster']

    cluster_series = pd.Series(metro_clusters)
    clusters_wout_top = cluster_series[~cluster_series.isin(clusters_with_top)].values

    if len(clusters_wout_top) > 0:
        recommended = clusters_wout_top
    else:
        # Every cluster already has the category. Fall back to those clusters,
        # de-duplicated: clusters_with_top has one entry per venue, and repeated
        # labels would create duplicate index entries in min_distance_df.
        recommended = clusters_with_top.unique()

    min_distance_list = []
    for cluster in recommended:
        (lat, long) = centroid_df[centroid_df['cluster'] == cluster]['lat_long'].values[0][0]

        if top_venues.empty:
            # No venue of this category anywhere in the considered clusters;
            # -1 is the sentinel visualiseRecommendation checks for.
            distance_min = -1
        else:
            distance_min = top_venues.apply(
                lambda venue: geodesic((lat, long), (venue['lat'], venue['long'])).meters,
                axis = 1).min()

        min_distance_list.append([cluster, distance_min])

    # Built once, after the loop, so the frame exists even when no cluster was
    # recommended. The index keeps the name 0, as set_index(0) produced before.
    min_distance_df = pd.DataFrame(min_distance_list, columns = [0, 'distance']).set_index(0)

    return (venues_incluster, clusters_with_top, clusters_wout_top,
            top_venues, min_distance_list, min_distance_df)


# Metro in row 0 of top_pop_venue_ratio_df, category top_underrepresented[0][0].
(venues_incluster_0, clusters_with_top_0, clusters_wout_top_0,
 top_venues_0, min_distance_list_0, min_distance_df_0) = _findRecommendedClusters(
    metro_venue_dict[top_pop_venue_ratio_df.iloc[0]['geographic area name']],
    clusters_list[0],
    centroidDF_list[0],
    top_underrepresented[0][0])

# Metro in row 1 of top_pop_venue_ratio_df, category top_underrepresented[1][0].
(venues_incluster_1, clusters_with_top_1, clusters_wout_top_1,
 top_venues_1, min_distance_list_1, min_distance_df_1) = _findRecommendedClusters(
    metro_venue_dict[top_pop_venue_ratio_df.iloc[1]['geographic area name']],
    clusters_list[1],
    centroidDF_list[1],
    top_underrepresented[1][0])

# Metro in row 2 of top_pop_venue_ratio_df. This one uses entry [1] of its
# top_underrepresented list rather than [0]; that choice is kept as it was,
# and the reason for it is not visible in this cell.
(venues_incluster_2, clusters_with_top_2, clusters_wout_top_2,
 top_venues_2, min_distance_list_2, min_distance_df_2) = _findRecommendedClusters(
    metro_venue_dict[top_pop_venue_ratio_df.iloc[2]['geographic area name']],
    clusters_list[2],
    centroidDF_list[2],
    top_underrepresented[2][1])

In [None]:
def visualiseRecommendation(metro, topVenues_df, centroid_df, minDistance_df, category, mapType = "Stamen Toner"):
    """Map existing venues of a category together with the recommended clusters.

    Parameters
    ----------
    metro : str
        Metro name; used to look up the map centre in `metro_df` and the
        venue table in `metro_venue_dict`.
    topVenues_df : pandas.DataFrame
        Existing venues of `category`; only its index is read, and the
        coordinates and names are looked up in `metro_venue_dict[metro]`.
    centroid_df : pandas.DataFrame
        Table with a 'cluster' column and a 'lat_long' column whose cells
        are one-element lists holding a (lat, long) tuple.
    minDistance_df : pandas.DataFrame
        Indexed by cluster label, with a 'distance' column in metres;
        -1 means no venue of the category was found.
    category : str
        Category name shown in the popups.
    mapType : str
        Tile set passed to folium.Map.

    Returns
    -------
    folium.Map
        Map with a red marker per existing venue and a blue circle
        (radius = 500) on the centroid of each recommended cluster.
    """
    map_ = folium.Map(width = '100%',
                      height = '100%',
                      location =  metro_df[metro_df['geographic area name'] == metro]['lat_long'].values[0][0],
                      zoom_start = 9,
                      tiles = mapType)

    metro_venues = metro_venue_dict[metro]

    # Existing venues of the category: solid red markers.
    for i in topVenues_df.index:

        lat = metro_venues['lat'][i]
        long = metro_venues['long'][i]
        venue = metro_venues['name'][i]

        label = folium.Popup(str(venue) + ': ' + str(category), parse_html=True)

        folium.CircleMarker(
            [lat, long],
            radius = 5,
            popup = label,
            color = '#FF0000',
            fill = True,
            fill_color = '#FF0000',
            fill_opacity = 1).add_to(map_)

    # Recommended clusters: translucent blue circles on the cluster centroids.
    # Iterating the column's (label, value) pairs always yields a scalar
    # distance, whereas .loc[label] returns a Series when a label is repeated
    # and would make the `distance == -1` test ambiguous.
    for cluster, distance in minDistance_df['distance'].items():

        # Look the centroid up once and reuse it for both coordinates.
        centroid = centroid_df[centroid_df['cluster'] == cluster]['lat_long'].values[0][0]
        lat = centroid[0]
        long = centroid[1]

        position_text = 'Open ' + str(category) + ' here: ' + str(round(lat, 4)) + ', ' + str(round(long, 4))

        if distance == -1:
            label = folium.Popup(position_text + '<br>No ' + str(category) + ' in metro.', max_width = 400)
        else:
            # Distances are stored in metres; the popup shows kilometres.
            label = folium.Popup(position_text + '<br>Nearest ' + str(category) + ' is ' + str(round(distance / 1000, 3)) + 'km away.', max_width = 400)

        folium.Circle(
            [lat, long],
            radius = 500,
            popup = label,
            color = '#6EB5FF',
            fill = True,
            fill_color = '#6EB5FF',
            fill_opacity = 0.3).add_to(map_)

    return map_

In [None]:
# Recommendation map for the metro in row 0 of top_pop_venue_ratio_df.
visualiseRecommendation(
    metro = top_pop_venue_ratio_df.iloc[0]['geographic area name'],
    topVenues_df = top_venues_0,
    centroid_df = centroidDF_list[0],
    minDistance_df = min_distance_df_0,
    category = top_underrepresented[0][0],
)

In [None]:
# Recommendation map for the metro in row 1 of top_pop_venue_ratio_df.
visualiseRecommendation(
    metro = top_pop_venue_ratio_df.iloc[1]['geographic area name'],
    topVenues_df = top_venues_1,
    centroid_df = centroidDF_list[1],
    minDistance_df = min_distance_df_1,
    category = top_underrepresented[1][0],
)

In [None]:
# Recommendation map for the metro in row 2 of top_pop_venue_ratio_df
# (category taken from entry [1] of that metro's top_underrepresented list).
visualiseRecommendation(
    metro = top_pop_venue_ratio_df.iloc[2]['geographic area name'],
    topVenues_df = top_venues_2,
    centroid_df = centroidDF_list[2],
    minDistance_df = min_distance_df_2,
    category = top_underrepresented[2][1],
)