# Defining Hubs Location Based on the Store Location

## Imports

In [1]:
import pandas as pd
import numpy as np
import requests
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import matplotlib.cm as cm
import matplotlib.colors as colors

import folium
from folium.plugins import MarkerCluster

## Use Google Maps API Text Search to get all stores in Manhattan

In [2]:
shops = []
# possible keywords (based on supported types: https://developers.google.com/maps/documentation/places/web-service/supported_types)
keyw = ['store', 'shop', 'supermarket', 'market', 'restaurant', 'diner', 'fashion', 'boutique', 'drugstore', 'bakery', 'book_store', 'coffee%20shop', 'cafe', 'clothing%20store', 'market', 'market%20place', 'food', 'food%20farm', 'grocery_or_supermarket','bicycle_store', 'convenience_store','electronics_store','florist','hardware_store','home_goods_store','pet_store','shoe_store']

# loop through all possible keywords
for i in keyw:
    params = {}
    #limitation through keywords in the query (... manhattan) and radius (10200 m from the center of Manhattan)
    # define query and get data from Google API
    url = f"https://maps.googleapis.com/maps/api/place/textsearch/json?query={i}%20manhattan&location=40.7830603,-73.9712488&radius=10200&region=US&key=AIzaSyBEindiZbV6koN4ycLksmKt4wBkmxwUoAU"

    # save query results in json file
    res = requests.get(url, params = params)
    results = json.loads(res.content)

    # extend array
    shops.extend(results['results'])

    # go through all individual results in while-loop
    while "next_page_token" in results:
         params['pagetoken'] = results['next_page_token'],
         res = requests.get(url, params = params)
         results = json.loads(res.content)
         shops.extend(results['results'])

    #payload={}
    #headers = {}

    #display results directly
    #response = requests.request("GET", url, headers=headers, data=payload)

#output
#print(response.text)

In [3]:
shops

[{'business_status': 'OPERATIONAL',
  'formatted_address': '629 6th Ave, New York, NY 10011',
  'geometry': {'location': {'lat': 40.7403433, 'lng': -73.994933},
   'viewport': {'northeast': {'lat': 40.74164222989272,
     'lng': -73.99347882010728},
    'southwest': {'lat': 40.73894257010728, 'lng': -73.99617847989273}}},
  'icon': 'https://maps.gstatic.com/mapfiles/place_api/icons/v1/png_71/shopping-71.png',
  'icon_background_color': '#4B96F3',
  'icon_mask_base_uri': 'https://maps.gstatic.com/mapfiles/place_api/icons/v2/shopping_pinlet',
  'name': 'The Container Store',
  'opening_hours': {'open_now': False},
  'photos': [{'height': 2992,
    'html_attributions': ['<a href="https://maps.google.com/maps/contrib/110760471164144229254">Pichsinee Uansiri</a>'],
    'photo_reference': 'Aap_uEDAl6u0ZtHnLCyUgIRsZpYyI9qvYa0RghpyfG_zS1lQsN1Ku11_apnj3S9F-jgs5yQGzxSybWCr0lBO7DLJS37Winx3NwDgkOHXjc7pMM_G9TaRC-5by5Q-IDsu79oJp2L0U86Zf7Holt2tjAfmU97gP1eLB5fmYc06_td8MK-ieNhf',
    'width': 5312}],
 

In [4]:
len(shops)

484

Depending on the time when the query is run we get different number of stores but it is always around 500 (e.g. 484, 473, 478 are the actual numbers we got).

## Create a dataframe

In [5]:
# create empty arrays for longitude and latitude
shop_lat= []
shop_lon = []
shop_ids = []
shop_zip =[]

# go through all results and save them to lon and lat arrays

for i in shops:
    shop = i
    
    if shop['place_id'] not in shop_ids:
        shop_ids.append(shop['place_id'])
        try:
            shop_lat.append(shop['geometry']['location']['lat'])
        except:
            shop_lat.append('none')
        try:
            shop_lon.append(shop['geometry']['location']['lng'])
        except:
            shop_lon.append('none')
        try:
            shop_zip.append(shop['formatted_address'])
        except:
            shop_zip.append('none')
        

In [6]:
# define dataframe
df_dict= {'id':shop_ids,'longitude':shop_lon,'latitude':shop_lat, 'zip':shop_zip}

shop_df=pd.DataFrame(df_dict)

shop_df

Unnamed: 0,id,longitude,latitude,zip
0,ChIJlzpEM6NZwokRHEmZ4wh60aI,-73.994933,40.740343,"629 6th Ave, New York, NY 10011"
1,ChIJf3hmUQFZwokRI2WfbWaIJ0A,-73.977570,40.749724,"334 Lexington Ave, New York, NY 10016"
2,ChIJufARCUFZwokRVCTcUT7YFhs,-73.992589,40.744599,"115 W 25th St, New York, NY 10001"
3,ChIJX7ruPlRYwokRtPrnnFkQClQ,-73.987207,40.759620,"235 W 46th St, New York, NY 10036"
4,ChIJERgMok1ZwokRvqpg7_ou5d4,-73.947072,40.783807,"1848 2nd Ave, New York, NY 10128"
...,...,...,...,...
425,ChIJBaneg5hz_UYR9liUx6naTKg,18.648729,54.343938,"Żabi Kruk 10, 80-822 Gdańsk, Poland"
426,ChIJi_IYQtHO5zsRL16FQNvidIA,72.826048,19.004561,"Gate No 4, Trade View Building, Utopia City, P..."
427,ChIJjd8EoxYSdkgR_buZkt81U_s,-0.389204,51.912228,"Butterfield Business Park, 2A The Quad, Luton ..."
428,ChIJhyqEEoaxe0gRLjP_9vowfgE,-2.338281,53.463596,"24 Barton Dock Rd, Trafford Park, Stretford, M..."


In [7]:
# extract zip
for i in range(len(shop_df)):
    shop_df['zip'][i]=shop_df['zip'][i][-5:]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shop_df['zip'][i]=shop_df['zip'][i][-5:]


## Remove outliers

Remove outliers based on the zip code, shops that have their zip code out of range [10001 - 10282] will be removed

In [8]:
# create a list with indices to remove
rows_to_drop=[]

In [9]:
# change zip column type str to int - when not possible add these indexes to rows_to_drop 
for x in range(len(shop_df)):
    try:
        shop_df['zip'][x] = int(shop_df['zip'][x])
    except:
        print("not possible to change zip to int in line "+str(x))
        rows_to_drop.append(x)

not possible to change zip to int in line 354
not possible to change zip to int in line 355
not possible to change zip to int in line 357
not possible to change zip to int in line 358
not possible to change zip to int in line 360
not possible to change zip to int in line 361
not possible to change zip to int in line 362
not possible to change zip to int in line 423
not possible to change zip to int in line 424
not possible to change zip to int in line 425
not possible to change zip to int in line 426
not possible to change zip to int in line 427
not possible to change zip to int in line 428
not possible to change zip to int in line 429


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shop_df['zip'][x] = int(shop_df['zip'][x])


In [10]:
# drop string zips
shop_df.drop(rows_to_drop, axis = 0, inplace = True)

In [11]:
# initialise one more time a list with indices to remove
rows_to_drop=[]

In [12]:
# define indexes of shops that are not within the range 10001 - 10282
lower = shop_df['zip'] < 10001
lower_list = lower[lower].index.tolist()
for i in lower_list:
    rows_to_drop.append(i)

higher = shop_df['zip'] > 10282
higher_list = higher[higher].index.tolist()
for i in higher_list:
    rows_to_drop.append(i)    

In [13]:
# print the number of outliers
len(rows_to_drop)

15

In [14]:
# print indexes of outliers
rows_to_drop

[349, 352, 359, 418, 350, 351, 353, 356, 397, 416, 417, 419, 420, 421, 422]

In [15]:
# drop shops not in Manhattan
shop_df.drop(rows_to_drop, axis = 0, inplace = True)

In [16]:
shop_df

Unnamed: 0,id,longitude,latitude,zip
0,ChIJlzpEM6NZwokRHEmZ4wh60aI,-73.994933,40.740343,10011
1,ChIJf3hmUQFZwokRI2WfbWaIJ0A,-73.977570,40.749724,10016
2,ChIJufARCUFZwokRVCTcUT7YFhs,-73.992589,40.744599,10001
3,ChIJX7ruPlRYwokRtPrnnFkQClQ,-73.987207,40.759620,10036
4,ChIJERgMok1ZwokRvqpg7_ou5d4,-73.947072,40.783807,10128
...,...,...,...,...
411,ChIJuV2ik6lZwokROkSPeQIlV3Y,-73.958663,40.772844,10075
412,ChIJpaSkN4pYwokRfvh_So8V-K4,-73.980556,40.778333,10023
413,ChIJr4XGWcZZwokRmpKM5lcKfKs,-73.993631,40.726978,10012
414,ChIJCyYZYoZYwokR0JPksGwM_ag,-73.978402,40.786443,10024


In [17]:
#Define coordinates of where we want to center our map
boulder_coords = [40.754932, -73.984016]

#Create the map
my_map = folium.Map(location = boulder_coords, zoom_start = 13)

#Display the map
my_map

In [18]:
#Define the coordinates we want our markers to be at
for i in range(len(shop_df)):
    folium.Marker([shop_df.iloc[i][2],shop_df.iloc[i][1]], popup = f"Store {i}").add_to(my_map)

#Display the map
my_map

In [19]:
# display indices of rows
ids = []
for i in range(len(shop_df)):
    ids.append(i)
shop_df['id']=ids
shop_df

Unnamed: 0,id,longitude,latitude,zip
0,0,-73.994933,40.740343,10011
1,1,-73.977570,40.749724,10016
2,2,-73.992589,40.744599,10001
3,3,-73.987207,40.759620,10036
4,4,-73.947072,40.783807,10128
...,...,...,...,...
411,396,-73.958663,40.772844,10075
412,397,-73.980556,40.778333,10023
413,398,-73.993631,40.726978,10012
414,399,-73.978402,40.786443,10024


# Cluster the results

In [20]:
# imports
from sklearn.cluster import KMeans, DBSCAN

### k-Means

In [21]:
# define NumPy-Array
X = np.zeros((len(shop_df),2))

X[:,0] = shop_df['latitude']
X[:,1] = shop_df['longitude']
X

array([[ 40.7403433, -73.994933 ],
       [ 40.7497237, -73.9775697],
       [ 40.7445994, -73.9925893],
       [ 40.7596202, -73.9872069],
       [ 40.7838073, -73.9470725],
       [ 40.7498313, -73.9887048],
       [ 40.7508858, -73.9879741],
       [ 40.7488206, -73.9945268],
       [ 40.7511748, -73.9716877],
       [ 40.7578341, -73.9858859],
       [ 40.7406538, -73.9849078],
       [ 40.7596445, -73.9862168],
       [ 40.7552241, -73.9793828],
       [ 40.8048198, -73.9547643],
       [ 40.7224256, -73.9978386],
       [ 40.7201718, -74.0014426],
       [ 40.7427917, -74.0063546],
       [ 40.745034 , -73.992338 ],
       [ 40.768116 , -73.9614492],
       [ 40.750831 , -73.9890961],
       [ 40.7493107, -73.9892985],
       [ 40.763806 , -73.991307 ],
       [ 40.7490417, -73.9843972],
       [ 40.7598896, -73.9851142],
       [ 40.7381688, -73.9898619],
       [ 40.7642155, -73.9784575],
       [ 40.7281384, -73.9947995],
       [ 40.7685811, -73.9831811],
       [ 40.7527592,

In [22]:
# apply kmeans
kmeans = KMeans(n_clusters=20, random_state=42)

In [23]:
# fit kmeans
kmeans.fit(X)
kmeans.get_params()

{'algorithm': 'auto',
 'copy_x': True,
 'init': 'k-means++',
 'max_iter': 300,
 'n_clusters': 20,
 'n_init': 10,
 'n_jobs': 'deprecated',
 'precompute_distances': 'deprecated',
 'random_state': 42,
 'tol': 0.0001,
 'verbose': 0}

In [24]:
# show cluster assignment of data items
X_clustered = pd.DataFrame(X)
X_clustered['label'] = kmeans.labels_
X_clustered

Unnamed: 0,0,1,label
0,40.740343,-73.994933,1
1,40.749724,-73.977570,9
2,40.744599,-73.992589,1
3,40.759620,-73.987207,18
4,40.783807,-73.947072,10
...,...,...,...
396,40.772844,-73.958663,2
397,40.778333,-73.980556,16
398,40.726978,-73.993631,19
399,40.786443,-73.978402,16


In [25]:
# compute cluster centers
cluster_centers = kmeans.cluster_centers_
cluster_centers

array([[ 40.86263716, -73.92372226],
       [ 40.74130622, -73.98956621],
       [ 40.76803387, -73.96081835],
       [ 40.76382071, -73.98283927],
       [ 40.71859133, -73.99470904],
       [ 40.75235425, -73.99542369],
       [ 40.80661725, -73.95470375],
       [ 40.75737817, -73.96955826],
       [ 40.79683988, -73.96952875],
       [ 40.75468597, -73.97764748],
       [ 40.77987374, -73.95277851],
       [ 40.70984155, -74.00836924],
       [ 40.75063505, -73.98637928],
       [ 40.82309972, -73.94661757],
       [ 40.74131487, -74.00095497],
       [ 40.79346966, -73.94128043],
       [ 40.78090353, -73.98034787],
       [ 40.72793761, -73.98575476],
       [ 40.76067951, -73.99065587],
       [ 40.72818018, -73.99772343]])

In [26]:
# show clusters in Map
# define new map
cluster_map = folium.Map(location = boulder_coords, zoom_start = 13)

#Add 20 cluster centroids to the map
for i in range(20):
    folium.Marker([cluster_centers[i,0],cluster_centers[i,1]], popup = f"cluster {i}").add_to(cluster_map)

In [27]:
#Display the map
cluster_map

In [28]:
# extract number of items in each cluster and collect them in list
members = X_clustered.groupby('label').count()
num = members[0]
num

label
0      5
1     46
2     19
3     34
4     20
5     29
6      2
7     26
8     13
9     23
10    26
11    13
12    39
13     6
14    17
15     9
16    12
17    17
18    33
19    12
Name: 0, dtype: int64

## Drop clusters that have less than 10 members

In [29]:
# define labels of cluster that have less than 10 members 
small_hubs = num < 10
small_hubs_list = small_hubs[small_hubs].index.tolist()
small_hubs_list

[0, 6, 13, 15]

In [30]:
#drop hubs with less than 10 shops
X_clustered = X_clustered[~X_clustered.label.isin(small_hubs_list)]
X_clustered=X_clustered.reset_index(drop=True)

In [31]:
# extract number of items in each cluster and collect them in list
members = X_clustered.groupby('label').count()
num = members[0]
num

label
1     46
2     19
3     34
4     20
5     29
7     26
8     13
9     23
10    26
11    13
12    39
14    17
16    12
17    17
18    33
19    12
Name: 0, dtype: int64

In [32]:
new_cluster_centers=np.delete(cluster_centers,small_hubs_list, axis = 0)

In [33]:
new_cluster_centers

array([[ 40.74130622, -73.98956621],
       [ 40.76803387, -73.96081835],
       [ 40.76382071, -73.98283927],
       [ 40.71859133, -73.99470904],
       [ 40.75235425, -73.99542369],
       [ 40.75737817, -73.96955826],
       [ 40.79683988, -73.96952875],
       [ 40.75468597, -73.97764748],
       [ 40.77987374, -73.95277851],
       [ 40.70984155, -74.00836924],
       [ 40.75063505, -73.98637928],
       [ 40.74131487, -74.00095497],
       [ 40.78090353, -73.98034787],
       [ 40.72793761, -73.98575476],
       [ 40.76067951, -73.99065587],
       [ 40.72818018, -73.99772343]])

## Visualise the size of clusters

In [34]:
# compute how big cluster circle should be
occurences = folium.map.FeatureGroup()
for i in range(20):
    occurences.add_child(folium.vector_layers.CircleMarker(
    [cluster_centers[i,0],cluster_centers[i,1]],
    radius= num[i]/len(X_clustered)*400,
    color='blue',
    fill=True,
    fill_color='blue',
    fill_opacity=1,
    tooltip = str(i)))
cluster_map.add_child(occurences)

KeyError: 0

## Visualise shops belonging to certain hub

Define a function that shows all data points on a map and assigns a colour according to cluster

In [35]:
def draw_colorful_points(X_clustered):
    # set color scheme for the clusters
    
    #define number of colors (number of unique labels in clustered data)
    no_clusters=len(np.unique(X_clustered['label']))
    x = np.arange(no_clusters)
    ys = [i + x + (i*x)**2 for i in range(no_clusters)]
    colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
    rainbow = [colors.rgb2hex(i) for i in colors_array]

    #define a new map
    cluster_map_colored = folium.Map(location = boulder_coords, zoom_start = 13)

    # add markers to the map
    markers_colors = []
    for i in range(no_clusters):
        for j in range(len(X_clustered)):
            if (i==X_clustered['label'][j]):
                folium.vector_layers.CircleMarker(
                    [X_clustered[0][j],X_clustered[1][j]],
                    radius=5,
                    tooltip = str(i),
                    color=rainbow[i],
                    fill=True,
                    fill_color=rainbow[i],
                    fill_opacity=0.9).add_to(cluster_map_colored)

    display(cluster_map_colored)

In [36]:
draw_colorful_points(X_clustered)

### DBScan

In [37]:
# define epsilon and min stores
epsilon=0.0026
min_samples= 10


In [38]:
# fit dbscan algorithm and define labels
db = DBSCAN(eps=epsilon, min_samples=min_samples).fit(X)
labels = db.labels_

In [39]:
# check # clusters and # noise points
no_clusters = len(np.unique(labels))
no_noise = np.sum(np.array(labels) == -1, axis=0)

print('Estimated no. of clusters: %d' % no_clusters)
print('Estimated no. of noise points: %d' % no_noise)

Estimated no. of clusters: 5
Estimated no. of noise points: 286


In [40]:
X_clustered_db = pd.DataFrame(X)
X_clustered_db['label'] = labels
X_clustered_db

Unnamed: 0,0,1,label
0,40.740343,-73.994933,-1
1,40.749724,-73.977570,-1
2,40.744599,-73.992589,1
3,40.759620,-73.987207,0
4,40.783807,-73.947072,-1
...,...,...,...
396,40.772844,-73.958663,-1
397,40.778333,-73.980556,-1
398,40.726978,-73.993631,-1
399,40.786443,-73.978402,-1


In [41]:
draw_colorful_points(X_clustered_db)

In [42]:
points_of_cluster = X_clustered_db[X_clustered_db['label']==0]
centroid_of_cluster = np.mean(points_of_cluster, axis=0) 
centroid_of_cluster['label']

0.0

In [43]:
# print centroids of dbscan clusters
cluster_centers_db = pd.DataFrame()

for i in range(no_clusters-1):
    points_of_cluster = X_clustered_db[X_clustered_db['label']==i]
    centroid_of_cluster = np.mean(points_of_cluster, axis=0) 
    cluster_centers_db.loc[i,0]=centroid_of_cluster[0]
    cluster_centers_db.loc[i,1]=centroid_of_cluster[1]
    cluster_centers_db.loc[i,2]=centroid_of_cluster['label']
cluster_centers_db

Unnamed: 0,0,1,2
0,40.7609,-73.985037,0.0
1,40.747688,-73.988351,1.0
2,40.757476,-73.970984,2.0
3,40.764096,-73.979173,3.0


In [44]:
# extract number of items in each cluster and collect them in list
members = X_clustered_db.groupby('label').count()
num = members[0]
num

label
-1    286
 0     27
 1     70
 2     11
 3      7
Name: 0, dtype: int64

It is problematic to find the apropriate epsilon for dbscan. If epsilon is too small, then the clusters are small and we get many noisy points. But then if we increase the epsilon, the number of clusters decreases, which is not good as we want to have more hubs with less shops. To see the problem, run first the code with epsilon set to 0.002 and then with 0.003. The best trade-off seems to be 0.0026. With these settings we have 4 hubs and 279 noisy points.

## Save coordinates of clusters generated with k-means

In [45]:
new_cluster_centers

array([[ 40.74130622, -73.98956621],
       [ 40.76803387, -73.96081835],
       [ 40.76382071, -73.98283927],
       [ 40.71859133, -73.99470904],
       [ 40.75235425, -73.99542369],
       [ 40.75737817, -73.96955826],
       [ 40.79683988, -73.96952875],
       [ 40.75468597, -73.97764748],
       [ 40.77987374, -73.95277851],
       [ 40.70984155, -74.00836924],
       [ 40.75063505, -73.98637928],
       [ 40.74131487, -74.00095497],
       [ 40.78090353, -73.98034787],
       [ 40.72793761, -73.98575476],
       [ 40.76067951, -73.99065587],
       [ 40.72818018, -73.99772343]])

In [49]:
df=pd.DataFrame(new_cluster_centers, columns=['latitude', 'longitude'])

In [50]:
df

Unnamed: 0,latitude,longitude
0,40.741306,-73.989566
1,40.768034,-73.960818
2,40.763821,-73.982839
3,40.718591,-73.994709
4,40.752354,-73.995424
5,40.757378,-73.969558
6,40.79684,-73.969529
7,40.754686,-73.977647
8,40.779874,-73.952779
9,40.709842,-74.008369


In [51]:
df.to_csv('coordinates_stores.csv')