## Imports

In [84]:
import pandas as pd
import numpy as np
import requests
import json
import time
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import matplotlib.cm as cm
import matplotlib.colors as colors

# All Stores via Text Search

In [18]:
shops = []
# possible keywords (based on supported types: https://developers.google.com/maps/documentation/places/web-service/supported_types)
keyw = ['store', 'shop', 'supermarket', 'market', 'restaurant', 'diner', 'fashion', 'boutique', 'drugstore', 'bakery', 'book_store', 'coffee%20shop', 'cafe', 'clothing%20store', 'market', 'market%20place', 'food', 'food%20farm', 'grocery_or_supermarket','bicycle_store', 'convenience_store','electronics_store','florist','hardware_store','home_goods_store','pet_store','shoe_store']

# loop through all possible keywords
for i in keyw:
    params = {}
    #limitation through keywords in the query (... manhattan) and radius (10200 m from the center of Manhattan)
    # define query and get data from Google API
    url = f"https://maps.googleapis.com/maps/api/place/textsearch/json?query={i}%20manhattan&location=40.7830603,-73.9712488&radius=10200&region=US&key=AIzaSyBEindiZbV6koN4ycLksmKt4wBkmxwUoAU"

    # save query results in json file
    res = requests.get(url, params = params)
    results = json.loads(res.content)

    # extend array
    shops.extend(results['results'])

    # go through all individual results in while-loop
    while "next_page_token" in results:
         params['pagetoken'] = results['next_page_token'],
         res = requests.get(url, params = params)
         results = json.loads(res.content)
         shops.extend(results['results'])

    #payload={}
    #headers = {}

    #display results directly
    #response = requests.request("GET", url, headers=headers, data=payload)

#output
#print(response.text)

In [205]:
shops

[{'business_status': 'CLOSED_TEMPORARILY',
  'formatted_address': '334 Lexington Ave, New York, NY 10016',
  'geometry': {'location': {'lat': 40.7497237, 'lng': -73.9775697},
   'viewport': {'northeast': {'lat': 40.75102222989272,
     'lng': -73.97625762010729},
    'southwest': {'lat': 40.74832257010728, 'lng': -73.97895727989273}}},
  'icon': 'https://maps.gstatic.com/mapfiles/place_api/icons/v1/png_71/shopping-71.png',
  'icon_background_color': '#4B96F3',
  'icon_mask_base_uri': 'https://maps.gstatic.com/mapfiles/place_api/icons/v2/shopping_pinlet',
  'name': 'Small Shop and Stop',
  'permanently_closed': True,
  'place_id': 'ChIJf3hmUQFZwokRI2WfbWaIJ0A',
  'plus_code': {'compound_code': 'P2XC+VX New York',
   'global_code': '87G8P2XC+VX'},
  'rating': 0,
  'reference': 'ChIJf3hmUQFZwokRI2WfbWaIJ0A',
  'types': ['clothing_store', 'point_of_interest', 'store', 'establishment'],
  'user_ratings_total': 0},
 {'business_status': 'OPERATIONAL',
  'formatted_address': '629 6th Ave, New 

In [303]:
len(shops)

484

12 Feb, 22:50 484 results

## create a dataframe

In [206]:
# create empty arrays for longitude and latitude
shop_lat= []
shop_lon = []
shop_ids = []
shop_zip =[]

# go through all results and save them to lon and lat arrays

for i in shops:
    shop = i
    
    if shop['place_id'] not in shop_ids:
        shop_ids.append(shop['place_id'])
        try:
            shop_lat.append(shop['geometry']['location']['lat'])
        except:
            shop_lat.append('none')
        try:
            shop_lon.append(shop['geometry']['location']['lng'])
        except:
            shop_lon.append('none')
        try:
            shop_zip.append(shop['formatted_address'])
        except:
            shop_zip.append('none')
        

In [264]:
# define dataframe
df_dict= {'id':shop_ids,'longitude':shop_lon,'latitude':shop_lat, 'zip':shop_zip}

shop_df=pd.DataFrame(df_dict)

shop_df

Unnamed: 0,id,longitude,latitude,zip
0,ChIJf3hmUQFZwokRI2WfbWaIJ0A,-73.97757,40.749724,"334 Lexington Ave, New York, NY 10016"
1,ChIJlzpEM6NZwokRHEmZ4wh60aI,-73.994933,40.740343,"629 6th Ave, New York, NY 10011"
2,ChIJufARCUFZwokRVCTcUT7YFhs,-73.992589,40.744599,"115 W 25th St, New York, NY 10001"
3,ChIJX7ruPlRYwokRtPrnnFkQClQ,-73.987207,40.75962,"235 W 46th St, New York, NY 10036"
4,ChIJERgMok1ZwokRvqpg7_ou5d4,-73.947072,40.783807,"1848 2nd Ave, New York, NY 10128"
5,ChIJsdA-sa5ZwokRPNShF3C2zas,-73.988705,40.749831,"112 W 34th St, New York, NY 10120"
6,ChIJpaBx5KtZwokRhEtgddx5-XI,-73.987974,40.750886,"1333 Broadway Herald Sq, NY 10018"
7,ChIJT_1h1gJZwokRRrVRnAEWMVc,-73.971688,40.751175,"248 E 44th St, New York, NY 10017"
8,ChIJmc6MJ7BZwokRNVnUt90Aw78,-73.994527,40.748821,"245 W 29th St 8th floor, New York, NY 10001"
9,ChIJt1Be91RYwokRq2XhokPaPqI,-73.985886,40.757834,"1515 Broadway, New York, NY 10036"


In [265]:
# extract zip
for i in range(len(shop_df)):
    shop_df['zip'][i]=shop_df['zip'][i][-5:]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shop_df['zip'][i]=shop_df['zip'][i][-5:]


## Remove outliers

Remove outliers based on the zip code, shops that have their zip code out of range [10001 - 10282] will be removed

In [266]:
# create a list with indices to remove
rows_to_drop=[]

In [267]:
# change zip column type str to int - when not possible add these indexes to rows_to_drop 
for x in range(len(shop_df)):
    try:
        shop_df['zip'][x] = int(shop_df['zip'][x])
    except:
        print("not possible to change zip to int in line "+str(x))
        rows_to_drop.append(x)

not possible to change zip to int in line 355
not possible to change zip to int in line 425
not possible to change zip to int in line 426


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shop_df['zip'][x] = int(shop_df['zip'][x])


In [269]:
# drop string zips
shop_df.drop(rows_to_drop, axis = 0, inplace = True)

In [270]:
# initialise one more time a list with indices to remove
rows_to_drop=[]

In [271]:
# define indexes of shops that are not within the range 10001 - 10282
lower = shop_df['zip'] < 10001
lower_list = lower[lower].index.tolist()
for i in lower_list:
    rows_to_drop.append(i)

higher = shop_df['zip'] > 10282
higher_list = higher[higher].index.tolist()
for i in higher_list:
    rows_to_drop.append(i)    

In [277]:
# print the number of outliers
len(rows_to_drop)

21

In [278]:
# print indexes of outliers
rows_to_drop

[346,
 353,
 354,
 416,
 417,
 419,
 420,
 348,
 349,
 350,
 351,
 352,
 356,
 391,
 414,
 415,
 418,
 421,
 422,
 423,
 424]

In [273]:
# drop shops not in Manhattan
shop_df.drop(rows_to_drop, axis = 0, inplace = True)

In [274]:
shop_df

Unnamed: 0,id,longitude,latitude,zip
0,ChIJf3hmUQFZwokRI2WfbWaIJ0A,-73.97757,40.749724,10016
1,ChIJlzpEM6NZwokRHEmZ4wh60aI,-73.994933,40.740343,10011
2,ChIJufARCUFZwokRVCTcUT7YFhs,-73.992589,40.744599,10001
3,ChIJX7ruPlRYwokRtPrnnFkQClQ,-73.987207,40.75962,10036
4,ChIJERgMok1ZwokRvqpg7_ou5d4,-73.947072,40.783807,10128
5,ChIJsdA-sa5ZwokRPNShF3C2zas,-73.988705,40.749831,10120
6,ChIJpaBx5KtZwokRhEtgddx5-XI,-73.987974,40.750886,10018
7,ChIJT_1h1gJZwokRRrVRnAEWMVc,-73.971688,40.751175,10017
8,ChIJmc6MJ7BZwokRNVnUt90Aw78,-73.994527,40.748821,10001
9,ChIJt1Be91RYwokRq2XhokPaPqI,-73.985886,40.757834,10036


In [280]:
import folium
from folium.plugins import MarkerCluster

In [281]:
#Define coordinates of where we want to center our map
boulder_coords = [40.754932, -73.984016]

#Create the map
my_map = folium.Map(location = boulder_coords, zoom_start = 13)

#Display the map
my_map

In [282]:
#Define the coordinates we want our markers to be at
for i in range(len(shop_df)):
    folium.Marker([shop_df.iloc[i][2],shop_df.iloc[i][1]], popup = f"Store {i}").add_to(my_map)

#Display the map
my_map

In [283]:
# display indices of rows
ids = []
for i in range(len(shop_df)):
    ids.append(i)
shop_df['id']=ids
shop_df

Unnamed: 0,id,longitude,latitude,zip
0,0,-73.97757,40.749724,10016
1,1,-73.994933,40.740343,10011
2,2,-73.992589,40.744599,10001
3,3,-73.987207,40.75962,10036
4,4,-73.947072,40.783807,10128
5,5,-73.988705,40.749831,10120
6,6,-73.987974,40.750886,10018
7,7,-73.971688,40.751175,10017
8,8,-73.994527,40.748821,10001
9,9,-73.985886,40.757834,10036


# Cluster the results

In [284]:
# imports
from sklearn.cluster import KMeans, DBSCAN

### k-Means

In [285]:
# define NumPy-Array
X = np.zeros((len(shop_df),2))

X[:,0] = shop_df['latitude']
X[:,1] = shop_df['longitude']
X

array([[ 40.7497237, -73.9775697],
       [ 40.7403433, -73.994933 ],
       [ 40.7445994, -73.9925893],
       [ 40.7596202, -73.9872069],
       [ 40.7838073, -73.9470725],
       [ 40.7498313, -73.9887048],
       [ 40.7508858, -73.9879741],
       [ 40.7511748, -73.9716877],
       [ 40.7488206, -73.9945268],
       [ 40.7578341, -73.9858859],
       [ 40.7596445, -73.9862168],
       [ 40.7552241, -73.9793828],
       [ 40.8048198, -73.9547643],
       [ 40.7406538, -73.9849078],
       [ 40.7201718, -74.0014426],
       [ 40.7427917, -74.0063546],
       [ 40.7414158, -73.9780066],
       [ 40.745034 , -73.992338 ],
       [ 40.750831 , -73.9890961],
       [ 40.768116 , -73.9614492],
       [ 40.7493107, -73.9892985],
       [ 40.763806 , -73.991307 ],
       [ 40.7490417, -73.9843972],
       [ 40.7598896, -73.9851142],
       [ 40.7642155, -73.9784575],
       [ 40.7685811, -73.9831811],
       [ 40.7527592, -73.9979487],
       [ 40.7601775, -73.9843631],
       [ 40.7329667,

In [286]:
# apply kmeans
kmeans = KMeans(n_clusters=20, random_state=42)

In [287]:
# fit kmeans
kmeans.fit(X)
kmeans.get_params()

{'algorithm': 'auto',
 'copy_x': True,
 'init': 'k-means++',
 'max_iter': 300,
 'n_clusters': 20,
 'n_init': 10,
 'n_jobs': 'deprecated',
 'precompute_distances': 'deprecated',
 'random_state': 42,
 'tol': 0.0001,
 'verbose': 0}

In [288]:
# show cluster assignment of data items
X_clustered = pd.DataFrame(X)
X_clustered['label'] = kmeans.labels_
X_clustered

Unnamed: 0,0,1,label
0,40.749724,-73.97757,16
1,40.740343,-73.994933,13
2,40.744599,-73.992589,13
3,40.75962,-73.987207,18
4,40.783807,-73.947072,0
5,40.749831,-73.988705,8
6,40.750886,-73.987974,8
7,40.751175,-73.971688,7
8,40.748821,-73.994527,1
9,40.757834,-73.985886,18


In [289]:
# compute cluster centers
cluster_centers = kmeans.cluster_centers_
cluster_centers

array([[ 40.78035042, -73.95288937],
       [ 40.75221334, -73.995751  ],
       [ 40.86263716, -73.92372226],
       [ 40.76459641, -73.98279066],
       [ 40.71858239, -73.99475921],
       [ 40.79734217, -73.96857293],
       [ 40.72867699, -73.98694589],
       [ 40.75740863, -73.96946796],
       [ 40.75103113, -73.98680356],
       [ 40.76847701, -73.96015703],
       [ 40.81959351, -73.94826876],
       [ 40.79346966, -73.94128043],
       [ 40.78101241, -73.98058194],
       [ 40.74153147, -73.99048782],
       [ 40.72804765, -73.9988535 ],
       [ 40.70984155, -74.00836924],
       [ 40.74392951, -73.97758917],
       [ 40.74142913, -74.00204665],
       [ 40.76051061, -73.98963948],
       [ 40.75698687, -73.97813598]])

In [290]:
# show clusters in Map
# define new map
cluster_map = folium.Map(location = boulder_coords, zoom_start = 13)

#Add 20 cluster centroids to the map
for i in range(20):
    folium.Marker([cluster_centers[i,0],cluster_centers[i,1]], popup = f"cluster {i}").add_to(cluster_map)

In [291]:
#Display the map
cluster_map

In [292]:
# extract number of items in each cluster and collect them in list
members = X_clustered.groupby('label').count()
num = members[0]
num

label
0     24
1     25
2      5
3     28
4     19
5     15
6     17
7     27
8     38
9     21
10     8
11     9
12    14
13    42
14    17
15    13
16     9
17    17
18    35
19    20
Name: 0, dtype: int64

In [293]:
# compute how big cluster circle should be
occurences = folium.map.FeatureGroup()
for i in range(20):
    occurences.add_child(folium.vector_layers.CircleMarker(
    [cluster_centers[i,0],cluster_centers[i,1]],
    radius= num[i]/len(X_clustered)*200,
    color='blue',
    fill=True,
    fill_color='blue',
    fill_opacity=1,
    tooltip = str(i)))
cluster_map.add_child(occurences)

## Show which shops belong to which hub

define a function that show all data points on map and assign a colour according to cluster

In [294]:
def draw_colorful_points(X_clustered):
    # set color scheme for the clusters
    
    #define number of colors (number of unique labels in clustered data)
    no_clusters=len(np.unique(X_clustered['label']))
    x = np.arange(no_clusters)
    ys = [i + x + (i*x)**2 for i in range(no_clusters)]
    colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
    rainbow = [colors.rgb2hex(i) for i in colors_array]

    #define a new map
    cluster_map_colored = folium.Map(location = boulder_coords, zoom_start = 13)

    # add markers to the map
    markers_colors = []
    for i in range(no_clusters):
        for j in range(len(X_clustered)):
            if (i==X_clustered['label'][j]):
                folium.vector_layers.CircleMarker(
                    [X_clustered[0][j],X_clustered[1][j]],
                    radius=5,
                    tooltip = str(i),
                    color=rainbow[i],
                    fill=True,
                    fill_color=rainbow[i],
                    fill_opacity=0.9).add_to(cluster_map_colored)

    display(cluster_map_colored)

In [295]:
draw_colorful_points(X_clustered)

### DBScan

In [296]:
# define epsilon and min stores
epsilon=0.002
min_samples= 5


In [297]:
# fit dbscan algorithm and define labels
db = DBSCAN(eps=epsilon, min_samples=min_samples).fit(X)
labels = db.labels_

In [298]:
# check # clusters and # noise points
no_clusters = len(np.unique(labels))
no_noise = np.sum(np.array(labels) == -1, axis=0)

print('Estimated no. of clusters: %d' % no_clusters)
print('Estimated no. of noise points: %d' % no_noise)

Estimated no. of clusters: 11
Estimated no. of noise points: 239


In [299]:
X_clustered_db = pd.DataFrame(X)
X_clustered_db['label'] = labels
X_clustered_db

Unnamed: 0,0,1,label
0,40.749724,-73.97757,-1
1,40.740343,-73.994933,-1
2,40.744599,-73.992589,0
3,40.75962,-73.987207,0
4,40.783807,-73.947072,-1
5,40.749831,-73.988705,0
6,40.750886,-73.987974,0
7,40.751175,-73.971688,-1
8,40.748821,-73.994527,-1
9,40.757834,-73.985886,0


In [300]:
draw_colorful_points(X_clustered_db)

In [301]:
points_of_cluster = X_clustered_db[X_clustered_db['label']==0]
centroid_of_cluster = np.mean(points_of_cluster, axis=0) 
centroid_of_cluster['label']

0.0

In [302]:
# print centroids of dbscan clusters
cluster_centers_db = pd.DataFrame()

for i in range(no_clusters-1):
    points_of_cluster = X_clustered_db[X_clustered_db['label']==i]
    centroid_of_cluster = np.mean(points_of_cluster, axis=0) 
    cluster_centers_db.loc[i,0]=centroid_of_cluster[0]
    cluster_centers_db.loc[i,1]=centroid_of_cluster[1]
    cluster_centers_db.loc[i,2]=centroid_of_cluster['label']
cluster_centers_db

Unnamed: 0,0,1,2
0,40.751764,-73.987823,0.0
1,40.763897,-73.979648,1.0
2,40.752754,-73.998601,2.0
3,40.722345,-73.996417,3.0
4,40.737011,-73.989372,4.0
5,40.756679,-73.97023,5.0
6,40.760085,-73.977128,6.0
7,40.757204,-73.989981,7.0
8,40.75573,-73.978293,8.0
9,40.778661,-73.980311,9.0
