# The optimal location for a new shopping center - Code

## Data collection and preparation

In [1]:
# list of postal codes for Milano
CAPs = [20121, 20122, 20123, 20124, 20125, 20126, 20127, 20128, 20129, 20131, 
        20132, 20133, 20134, 20135, 20136, 20137, 20138, 20139, 20141, 20142,
        20143, 20144, 20145, 20146, 20147, 20148, 20149, 20151, 20152, 20153,
        20154, 20155, 20156, 20157, 20158, 20159, 20161, 20162]

In [2]:
# construct a list that stores the data to construct the dataframe
# getting the latitude and longitude information with the Nominatim API
from geopy.geocoders import Nominatim
rows = []
geolocator = Nominatim(user_agent="coursera_project")
for c in CAPs:
    try:
        loc = geolocator.geocode("{0} Italy".format(c))
        rows.append([c, loc.address.split(",")[0], loc.latitude, loc.longitude])
    except:
        pass

In [41]:
# create the dataframe to store the information about the areas
import pandas as pd
milanAreas = pd.DataFrame(rows, columns=["CAP", "name", "latitude", "longitude"])
print(milanAreas.shape)
milanAreas.head()

(34, 4)


Unnamed: 0,CAP,name,latitude,longitude
0,20121,Municipio 1,45.46874,9.182879
1,20122,Municipio 1,45.461931,9.19636
2,20123,Municipio 1,45.463221,9.177478
3,20124,Municipio 2,45.48478,9.202348
4,20125,Greco,45.499673,9.204903


In [53]:
address = 'Piazza del Duomo, Duomo, Municipio 1, Milano, Lombardia, Italia'
loc = geolocator.geocode(address)
center_lat, center_long = loc.latitude, loc.longitude
print('The geograpical coordinate of the center of Milano are {}, {}.'.format(center_lat, center_long))

The geograpical coordinate of the center of Milano are 45.46420795, 9.190010308888969.


In [54]:
import folium
map_milano = folium.Map(location=[center_lat, center_long], zoom_start=12)
# add markers to map
for lat, lng, name, cap in zip(milanAreas['latitude'], milanAreas['longitude'], 
                                           milanAreas['name'], milanAreas['CAP']):
    label = '{}, {}'.format(cap, name)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng], radius=8, popup=label, color='red', fill=True,
                        fill_color='#FF5733', fill_opacity=0.5, parse_html=False).add_to(map_milano)  
    
map_milano

In [45]:
CLIENT_ID = '0WSSLBASAZEOMXIEUC5W4UDTLYADATYKYH1J0GBY4KE0LXVN' # your Foursquare ID
CLIENT_SECRET = 'XIFFL2HAJOUBHMQ2ABZDOUOLQE4MUIFZHEIJ05DS254KEYTD' # your Foursquare Secret
VERSION = '20200330' # Foursquare API version
LIMIT = 100

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 0WSSLBASAZEOMXIEUC5W4UDTLYADATYKYH1J0GBY4KE0LXVN
CLIENT_SECRET:XIFFL2HAJOUBHMQ2ABZDOUOLQE4MUIFZHEIJ05DS254KEYTD


In [20]:
import requests

def getNearbyVenues(names, latitudes, longitudes, radius=1000, categoryID=None):
    
    venues_list=[]
    
    for n, name, lat, lng in zip(range(len(names)), names, latitudes, longitudes):
            
        if categoryID is None:
            # create the API request URL
            url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(
                CLIENT_ID, CLIENT_SECRET, lat, lng, VERSION, radius, LIMIT)
        else:
            url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}&categoryId={}'.format(
                CLIENT_ID, CLIENT_SECRET, lat, lng, VERSION, radius, LIMIT, categoryID)   
            
        # make the GET request
        results = requests.get(url).json()
        results = results["response"]['groups'][0]['items']
        if len(results) == 0:
            print("Warning! found no venue for neighboorhood nr. {1}: {0}".format(name, n))
        
        # return only relevant information for each nearby venue
        venues_list.append([ (name, lat, lng, v['venue']['name'], v['venue']['location']['lat'], 
                              v['venue']['location']['lng'], v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['CAP', 
                  'CAP Latitude', 
                  'CAP Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    print("\n Finished... a total number of {0} venues has been found!".format(len(nearby_venues)))
    return(nearby_venues)


In [21]:
# get all the venues
allVenues = getNearbyVenues(milanAreas["CAP"], milanAreas["latitude"], milanAreas["longitude"])


 Finished... a total number of 2572 venues has been found!


In [22]:
print(allVenues.shape)
allVenues.head()

(2572, 7)


Unnamed: 0,CAP,CAP Latitude,CAP Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,20121,45.46874,9.182879,Piazza Castello,45.468965,9.181312,Plaza
1,20121,45.46874,9.182879,Castello Sforzesco,45.469545,9.180424,Castle
2,20121,45.46874,9.182879,Fontana del Castello Sforzesco,45.469237,9.180917,Fountain
3,20121,45.46874,9.182879,Giovanni Cova & C.,45.468816,9.184121,Bakery
4,20121,45.46874,9.182879,Antonia,45.46889,9.184799,Accessories Store


In [25]:
# category ID of the category Shop & Service
categoryID = "4d4b7105d754a06378d81259"
# get all the Shop & Service venues
shopVenue = getNearbyVenues(milanAreas["CAP"], milanAreas["latitude"], milanAreas["longitude"], categoryID=categoryID)


 Finished... a total number of 1188 venues has been found!


In [26]:
print(shopVenue.shape)
shopVenue.head()

(1188, 7)


Unnamed: 0,CAP,CAP Latitude,CAP Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,20121,45.46874,9.182879,Antonia,45.46889,9.184799,Accessories Store
1,20121,45.46874,9.182879,Gay Odin,45.466187,9.180801,Chocolate Shop
2,20121,45.46874,9.182879,New Old Camera,45.467359,9.184171,Camera Store
3,20121,45.46874,9.182879,Fabriano Boutique,45.470025,9.184987,Stationery Store
4,20121,45.46874,9.182879,Flying Tiger,45.465945,9.18485,Gift Shop


## First step: clustering of the areas of Milano

In [69]:
# count the total number of shops per each area defined and labelled by its CAP
shopVenue_count = shopVenue[["CAP","Venue"]].groupby('CAP').count().reset_index()
shopVenue_count.columns = ["CAP", "Shop Total Nr"]
shopVenue_count.set_index("CAP", inplace=True)

# count the number of shop types present in each area and labelled by its CAP
shopVenue_countunique = shopVenue[["CAP","Venue Category"]].groupby('CAP')["Venue Category"].nunique().reset_index()
shopVenue_countunique.columns = ["CAP", "Shop Type Nr"]
shopVenue_countunique.set_index("CAP", inplace=True)

# join the two columns in a single dataframe
shopVenue_count = shopVenue_count.join(shopVenue_countunique, on="CAP", lsuffix='', rsuffix='')
shopVenue_count.reset_index()

print(shopVenue_count.shape)
shopVenue_count.head()

(34, 2)


Unnamed: 0_level_0,Shop Total Nr,Shop Type Nr
CAP,Unnamed: 1_level_1,Unnamed: 2_level_1
20121,100,34
20122,100,27
20123,81,29
20124,86,36
20125,23,16


In [70]:
# now determine the total number of venues in each area, by counting the elements in the allVenues dataframe
allVenue_count = allVenues[["CAP","Venue"]].groupby('CAP').count()
allVenue_count.columns = ["Venue Total Nr"]

print(allVenue_count.shape)
allVenue_count.head()

(34, 1)


Unnamed: 0_level_0,Venue Total Nr
CAP,Unnamed: 1_level_1
20121,100
20122,100
20123,100
20124,82
20125,82


In [71]:
# now compute the distance between each neighborhood center and the city center using the geidesic distance
# implemented in the geopy library

import geopy.distance

# coordinates of the city center
center = (center_lat, center_long)

# loop over the neighborhoods and compute the distance
rows = []
for cap, lat, long in zip(milanAreas["CAP"], milanAreas["latitude"], milanAreas["longitude"]):
    neighborhood = (lat, long)
    rows.append([cap, geopy.distance.distance(center, neighborhood).km])
    
# create the dataframe to store the information about the areas
distances = pd.DataFrame(rows, columns=["CAP", "distance"]).set_index("CAP")

print(distances.shape)
distances.head()

(34, 1)


Unnamed: 0_level_0,distance
CAP,Unnamed: 1_level_1
20121,0.751457
20122,0.557364
20123,0.986282
20124,2.48161
20125,4.110022


In [87]:
# now join all the tables in a single dataframe
df1 = shopVenue_count.join(allVenue_count, on="CAP", lsuffix='', rsuffix='').join(
    distances, on="CAP", lsuffix='', rsuffix='')
df1.reset_index(inplace=True)

print(df1.shape)
df1

(34, 5)


Unnamed: 0,CAP,Shop Total Nr,Shop Type Nr,Venue Total Nr,distance
0,20121,100,34,100,0.751457
1,20122,100,27,100,0.557364
2,20123,81,29,100,0.986282
3,20124,86,36,82,2.48161
4,20125,23,16,82,4.110022
5,20126,20,15,94,5.870387
6,20127,13,10,64,4.966618
7,20128,18,13,55,6.282979
8,20129,47,27,100,2.020685
9,20131,64,35,78,3.338398


In [88]:
# now cluster data

# set number of clusters
kclusters = 5

# import k-means from clustering stage
from sklearn.cluster import KMeans

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df1)

# check cluster labels generated for each row in the dataframe
kmeans.labels_


array([2, 2, 2, 2, 3, 3, 4, 4, 0, 2, 4, 3, 4, 0, 0, 0, 1, 3, 1, 0, 0, 4,
       4, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 4], dtype=int32)

In [89]:
# add the cluster column to the dataframe
df1["cluster"] = kmeans.labels_

# add the latitude and longitude columns to the dataframe
milanAreas
df1 = df1.join(milanAreas.set_index("CAP"), on="CAP", lsuffix='', rsuffix='')
df1.head()

Unnamed: 0,CAP,Shop Total Nr,Shop Type Nr,Venue Total Nr,distance,cluster,name,latitude,longitude
0,20121,100,34,100,0.751457,2,Municipio 1,45.46874,9.182879
1,20122,100,27,100,0.557364,2,Municipio 1,45.461931,9.19636
2,20123,81,29,100,0.986282,2,Municipio 1,45.463221,9.177478
3,20124,86,36,82,2.48161,2,Municipio 2,45.48478,9.202348
4,20125,23,16,82,4.110022,3,Greco,45.499673,9.204903


In [94]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np

# define a function to plot map with colored clusters
def plot_map_clusters(df):
    
    # create map
    map_milano = folium.Map(location=[center_lat, center_long], zoom_start=11)
    
    # set color scheme for the clusters
    x = np.arange(kclusters)
    ys = [i + x + (i*x)**2 for i in range(kclusters)]
    colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
    rainbow = [colors.rgb2hex(i) for i in colors_array]

    # add markers to the map
    markers_colors = []
    for lat, lon, cap, name, cluster in zip(df['latitude'], df['longitude'], df['CAP'], df["name"], df['cluster']):
        label = folium.Popup("{0}, {1} - Cluster {2}".format(name, cap, cluster), parse_html=True)
        folium.CircleMarker([lat, lon], radius=8, popup=label, color=rainbow[cluster-1], fill=True,
                        fill_color=rainbow[cluster-1], fill_opacity=0.7, parse_html=False).add_to(map_milano)  
       
    return map_milano

In [95]:
plot_map_clusters(df1)

In [96]:
df1.loc[df1['cluster'] == 0, :] # red 2

Unnamed: 0,CAP,Shop Total Nr,Shop Type Nr,Venue Total Nr,distance,cluster,name,latitude,longitude
8,20129,47,27,100,2.020685,0,Municipio 3,45.471425,9.213726
13,20135,51,28,100,1.968077,0,Porta Romana,45.454655,9.211198
14,20136,43,23,100,1.651284,0,Porta Ticinese,45.451155,9.179927
15,20137,34,20,100,2.769024,0,Calvairate,45.455834,9.223354
19,20143,41,25,100,3.131315,0,San Cristoforo,45.44669,9.158657
20,20144,39,24,100,2.219633,0,Municipio 6,45.457383,9.163339
23,20149,38,26,100,3.36155,0,Tre Torri-Fiera,45.479528,9.152944
24,20151,37,23,90,5.447322,0,Boldinasco,45.495575,9.136475
26,20154,37,24,100,2.431102,0,Municipio 8,45.48301,9.174123
27,20155,30,19,100,3.721048,0,Bullona,45.491527,9.162498


In [97]:
df1.loc[df1['cluster'] == 1, :] # violet 5

Unnamed: 0,CAP,Shop Total Nr,Shop Type Nr,Venue Total Nr,distance,cluster,name,latitude,longitude
16,20138,8,6,21,5.042223,1,Morsenchio,45.444595,9.248136
18,20142,6,4,23,4.497375,1,Quartiere Sant'Ambrogio,45.427615,9.16547
25,20152,5,3,19,6.608144,1,Sella Nuova,45.452183,9.10727
28,20156,11,9,40,5.338463,1,Villapizzone,45.503402,9.150537
29,20157,8,3,16,6.704582,1,Quarto Oggiaro,45.512253,9.138147
32,20161,14,8,32,6.210545,1,Affori,45.518615,9.171888


In [98]:
df1.loc[df1['cluster'] == 2, :] # cyan 1

Unnamed: 0,CAP,Shop Total Nr,Shop Type Nr,Venue Total Nr,distance,cluster,name,latitude,longitude
0,20121,100,34,100,0.751457,2,Municipio 1,45.46874,9.182879
1,20122,100,27,100,0.557364,2,Municipio 1,45.461931,9.19636
2,20123,81,29,100,0.986282,2,Municipio 1,45.463221,9.177478
3,20124,86,36,82,2.48161,2,Municipio 2,45.48478,9.202348
9,20131,64,35,78,3.338398,2,Città Studi,45.48376,9.222421


In [99]:
df1.loc[df1['cluster'] == 3, :] # green 3

Unnamed: 0,CAP,Shop Total Nr,Shop Type Nr,Venue Total Nr,distance,cluster,name,latitude,longitude
4,20125,23,16,82,4.110022,3,Greco,45.499673,9.204903
5,20126,20,15,94,5.870387,3,Segnano,45.513329,9.217614
11,20133,29,19,100,2.91031,3,Municipio 4,45.467552,9.226919
17,20139,31,22,80,3.490976,3,Municipio 4,45.439911,9.218294


In [100]:
df1.loc[df1['cluster'] == 4, :] # orange 4

Unnamed: 0,CAP,Shop Total Nr,Shop Type Nr,Venue Total Nr,distance,cluster,name,latitude,longitude
6,20127,13,10,64,4.966618,4,Gorla,45.50251,9.222735
7,20128,18,13,55,6.282979,4,Precotto,45.51493,9.225498
10,20132,20,15,54,5.525985,4,Cimiano,45.500142,9.23886
12,20134,15,13,74,4.441639,4,Lambrate,45.476655,9.243984
21,20146,35,18,65,4.102013,4,Lorenteggio,45.45511,9.139183
22,20147,18,14,55,4.696594,4,Arzaga,45.45677,9.130899
33,20162,9,7,58,5.465445,4,Pratocentenaro,45.513087,9.197671


The analysis show that the areas of Milano can be classified in this way:

1. central areas, with a **large number of venues**, a **large number of shops** of diverse type (cluster 2, color cyan)
2. non central areas, with a **large number of venues**, and a **relatively high number of shops** still of diverse type (cluster 0, color red)
3. non central areas, with a **moderate number of shops** and a **large number of venues** (cluster 3, color green)
4. non central areas, with a **moderate number of shops** and a **moderate number of venues** (cluster 4, color orange)
5. non central areas, with a **small number of shops and venues** (cluster 5, violet)

Evidently, the interesting areas for opening a new shopping center would be the 3rd and 4th group, with a particular preference for the 3rd group. Areas of the 1st and 2nd group are already too crowded in terms of shops, while areas of the 5th group are too peripheral, as shown by the small number of venues.