<h1>Battle of Neighborhoods - IBM Applied Data Science Capstone</h1>

<h2>Opening a new restaurant in Milan, Italy</h2>

<h3>Importing Libraries</h3>

In [1]:
import json
import os
import time

import folium
import geocoder
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

print("Libraries imported.")

Libraries imported.


<h3>Web Scraping from Wikipedia</h3>

In [2]:
# Download the Wikipedia category page that lists the districts of Milan.
# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# the notebook on an HTTP error instead of parsing an error page as if it were data.
response = requests.get("https://en.wikipedia.org/wiki/Category:Districts_of_Milan", timeout=30)
response.raise_for_status()
data = response.text

In [3]:
# Parse the downloaded HTML with the "html.parser" backend so it can be searched by tag and class.
soup = BeautifulSoup(data, "html.parser")

In [4]:
# Collect the text of every <li> entry inside the first "mw-category" container
# of the parsed page; each entry is one raw district title.
category_containers = soup.find_all("div", class_="mw-category")
if not category_containers:
    # Fail with a readable message rather than a bare IndexError if the page layout changed.
    raise ValueError('No <div class="mw-category"> found in the downloaded page.')

rawList = [item.text for item in category_containers[0].find_all("li")]

In [5]:
# Disambiguation markers that appear in the raw titles; each one is removed
# (in this order) so that only the plain district name remains.
title_markers = (
    " (district of Milan)",
    ", Milan",
    " (Milan)",
    " (district)",
)

neighborhoodsList = []
for title in rawList:
    for marker in title_markers:
        title = title.replace(marker, "")
    neighborhoodsList.append(title)

In [6]:
# Display the cleaned district names.
neighborhoodsList

['Affori',
 'Assiano',
 'Baggio',
 'Barona',
 'Bicocca',
 'Bovisa',
 'Bovisasca',
 'Brera',
 'Bruzzano',
 'Calvairate',
 'Centro Direzionale di Milano',
 'Chiaravalle',
 'Chinatown',
 'Cimiano',
 'Città Studi',
 'Comasina',
 'Conca del Naviglio',
 'Crescenzago',
 'Dergano',
 'Figino',
 'Forlanini',
 'Gallaratese',
 'Garegnano',
 'Ghisolfa',
 'Giambellino-Lorenteggio',
 'Gorla',
 'Gratosoglio',
 'Greco',
 'Lambrate',
 'Lampugnano',
 'Milano Santa Giulia',
 'Monluè',
 'Morivione',
 'Muggiano',
 'Niguarda',
 'Nosedo',
 'Ortica',
 'Ponte Lambro',
 'Porta Garibaldi',
 'Porta Genova',
 'Porta Lodovica',
 'Porta Magenta',
 'Porta Monforte',
 'Porta Nuova',
 'Porta Romana',
 'Porta Sempione',
 'Porta Tenaglia',
 'Porta Ticinese',
 'Porta Venezia',
 'Porta Vigentina',
 'Porta Vittoria',
 'Porta Volta',
 'Portello',
 'Prato Centenaro',
 'Precotto',
 'QT8',
 'Quadrilatero della moda',
 'Quartiere Feltre',
 'Quartiere Musocco',
 'Quarto Cagnino',
 'Quarto Oggiaro',
 'Quinto Romano',
 'Quintosole',

In [7]:
# One row per district, with the name stored in a single "Neighborhood" column.
nbh_df = pd.DataFrame(neighborhoodsList, columns=["Neighborhood"])

# Preview the first rows.
nbh_df.head()

Unnamed: 0,Neighborhood
0,Affori
1,Assiano
2,Baggio
3,Barona
4,Bicocca


In [8]:
# (rows, columns) of the district table.
nbh_df.shape

(77, 1)

<h3>Getting geographical coordinates from Geocoder</h3>

In [9]:
def get_latlng(neighborhood, max_attempts=5, retry_delay=1.0):
    """Geocode a Milan district name with the ArcGIS geocoder.

    The lookup is retried while the geocoder returns no coordinates, but only
    up to ``max_attempts`` times, so a name that cannot be resolved no longer
    blocks the notebook forever.

    Parameters
    ----------
    neighborhood : str
        District name, inserted into the query '<name>, Milan, MI, Italy'.
    max_attempts : int, optional
        Maximum number of geocoding requests to send (default 5).
    retry_delay : float, optional
        Seconds to wait between two attempts (default 1.0).

    Returns
    -------
    The ``latlng`` value reported by the geocoder result
    (the recorded run shows ``[latitude, longitude]`` pairs).

    Raises
    ------
    RuntimeError
        If no attempt produced coordinates.
    """
    # The query names both the city and "MI" because, per the original author's
    # note, "Milan" alone is also the name of the metropolitan area.
    query = '{}, Milan, MI, Italy'.format(neighborhood)

    for attempt in range(1, max_attempts + 1):
        g = geocoder.arcgis(query, match_out_of_range=False)
        lat_lng_coords = g.latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
        if attempt < max_attempts:
            # Short pause before asking again, instead of hammering the service.
            time.sleep(retry_delay)

    raise RuntimeError(
        'Could not geocode {!r} after {} attempt(s).'.format(query, max_attempts)
    )

In [10]:
# Geocode every district, keeping the results in the same order as nbh_df.
coords = list(map(get_latlng, nbh_df["Neighborhood"].tolist()))

In [11]:
# Display the geocoding results (one entry per district).
coords

[[45.51410000000004, 9.173530000000028],
 [45.450604739468496, 9.061638983583682],
 [45.46324000000004, 9.092700000000036],
 [45.433710000000076, 9.15160000000003],
 [45.52149000000003, 9.213260000000048],
 [45.503130000000056, 9.161220000000071],
 [45.515550000000076, 9.150940000000048],
 [45.471490000000074, 9.187730000000045],
 [45.52825000000007, 9.180710000000033],
 [45.456180000000074, 9.224880000000041],
 [45.50198373621629, 9.264641249871804],
 [45.415680000000066, 9.239790000000028],
 [45.500860000000046, 9.265130000000056],
 [45.503460000000075, 9.248800000000074],
 [45.47708000000006, 9.226600000000076],
 [45.52631000000008, 9.158870000000036],
 [45.458008899194525, 9.17764027089855],
 [45.51054000000005, 9.24386000000004],
 [45.504110000000026, 9.176470000000052],
 [45.49234000000007, 9.078520000000026],
 [45.45975000000004, 9.24690000000004],
 [45.496710000000064, 9.114840000000072],
 [45.50469000000004, 9.136970000000076],
 [45.49631000000005, 9.169400000000053],
 [45.444

In [12]:
# Turn the list of coordinate pairs into a two-column table.
coordinate_columns = ['Latitude', 'Longitude']
df_coords = pd.DataFrame(data=coords, columns=coordinate_columns)

In [13]:
# Copy both coordinate columns onto the district table (rows are matched by index).
for coordinate in ('Latitude', 'Longitude'):
    nbh_df[coordinate] = df_coords[coordinate]

The coordinates returned for Chinatown were wrong. Because this district is central to the project, I replaced them manually with the correct values.

In [14]:
# Manually correct the coordinates of Chinatown.
# The row is selected by district name rather than by a hard-coded row label,
# so the fix still hits the right district if the scraped list changes.
chinatown_rows = nbh_df['Neighborhood'] == 'Chinatown'
assert chinatown_rows.sum() == 1, "expected exactly one 'Chinatown' row"
nbh_df.loc[chinatown_rows, ['Latitude', 'Longitude']] = [45.48133, 9.17523]

In [15]:
# Display the district table with its coordinates.
nbh_df

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Affori,45.514100,9.173530
1,Assiano,45.450605,9.061639
2,Baggio,45.463240,9.092700
3,Barona,45.433710,9.151600
4,Bicocca,45.521490,9.213260
...,...,...,...
72,Turro,45.494520,9.221710
73,Vaiano Valle,45.428930,9.216200
74,Vialba,45.514910,9.128150
75,Vigentino,45.433720,9.201040


<h3>Creating a map of Milan with district positions</h3>

In [16]:
# Look up the coordinates of Milan itself; they are used to centre the maps.
address = 'Milan, Italy'

geolocator = Nominatim(user_agent="final-project")
location = geolocator.geocode(address)
if location is None:
    # Without this check an unresolved address surfaces as an opaque AttributeError below.
    raise ValueError('Nominatim returned no result for {!r}.'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Milan, Italy {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Milan, Italy 45.4668, 9.1905.


In [17]:
# Base map centred on the coordinates of Milan.
map_kl = folium.Map(location=[latitude, longitude], zoom_start=11)

# Appearance shared by every district marker.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
)

# One circle marker per district; clicking it shows the district name.
district_points = zip(nbh_df['Latitude'], nbh_df['Longitude'], nbh_df['Neighborhood'])
for lat, lng, neighborhood in district_points:
    popup = folium.Popup('{}'.format(neighborhood), parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=popup, **marker_style)
    marker.add_to(map_kl)

map_kl

In [18]:
# Write the district map to an HTML file in the working directory.
map_kl.save('map_kl.html')

<h3>Using Foursquare API to explore districts</h3>

In [19]:
# define Foursquare Credentials and Version
CLIENT_ID = 'MVOOP2Y4XU1LNTWB1CRXZWE5XTO3ANELZ2MX2HG3DGON5E4K' # your Foursquare ID
CLIENT_SECRET = 'VRR4O5NLL0FBFIFHVKBZPHYSMCTQ3X12EIKUZ54OBVERVS5X' # your Foursquare Secret
VERSION = '20200801' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: MVOOP2Y4XU1LNTWB1CRXZWE5XTO3ANELZ2MX2HG3DGON5E4K
CLIENT_SECRET:VRR4O5NLL0FBFIFHVKBZPHYSMCTQ3X12EIKUZ54OBVERVS5X


In [20]:
radius = 1500  # search radius around each district centre, in meters
LIMIT = 300    # venues requested per district
# NOTE(review): the recorded run never shows more than 100 venues for a district,
# so the API appears to cap the limit below 300 - confirm against the Foursquare docs.

EXPLORE_URL = "https://api.foursquare.com/v2/venues/explore"

# One tuple per venue:
# (district, district lat, district lng, venue name, venue lat, venue lng, venue category)
venues = []
for lat, lng, neighborhood in zip(nbh_df['Latitude'], nbh_df['Longitude'], nbh_df['Neighborhood']):

    # Let requests build and escape the query string instead of formatting it by hand.
    response = requests.get(
        EXPLORE_URL,
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        },
        timeout=30,  # do not hang forever on a stalled connection
    )
    # Stop on an HTTP error rather than failing later with a confusing KeyError.
    response.raise_for_status()
    results = response.json()["response"]['groups'][0]['items']

    # Keep only the fields needed for the analysis.
    for item in results:
        venue = item['venue']
        categories = venue['categories']
        if not categories:
            # The analysis is based on the venue category, so a venue without one is
            # skipped (indexing categories[0] would otherwise raise IndexError).
            continue
        venues.append((
            neighborhood,
            lat,
            lng,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            categories[0]['name']))

In [21]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(5628, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Affori,45.5141,9.17353,Esselunga,45.51238,9.173461,Supermarket
1,Affori,45.5141,9.17353,Al Paradiso Della Pizza,45.511351,9.175416,Pizza Place
2,Affori,45.5141,9.17353,Parco di Villa Litta,45.516414,9.167165,Park
3,Affori,45.5141,9.17353,Osteria del biliardo,45.514741,9.169345,Pool Hall
4,Affori,45.5141,9.17353,Biologic Bar & Restaurant,45.512302,9.178709,Hotel Bar


In [22]:
# Number of non-null values per column for each district, i.e. how many venues were collected there.
venues_df.groupby(["Neighborhood"]).count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Affori,67,67,67,67,67,67
Assiano,10,10,10,10,10,10
Baggio,23,23,23,23,23,23
Barona,50,50,50,50,50,50
Bicocca,100,100,100,100,100,100
...,...,...,...,...,...,...
Turro,100,100,100,100,100,100
Vaiano Valle,56,56,56,56,56,56
Vialba,38,38,38,38,38,38
Vigentino,98,98,98,98,98,98


<h5>How many categories?</h5>

In [23]:
# Count the distinct venue categories found across all districts.
category_count = len(venues_df['VenueCategory'].unique())
print('There are {} categories.'.format(category_count))

There are 286 categories.


In [24]:
# Preview the first 50 distinct venue categories.
venues_df['VenueCategory'].unique()[:50]

array(['Supermarket', 'Pizza Place', 'Park', 'Pool Hall', 'Hotel Bar',
       'Kebab Restaurant', 'Italian Restaurant', 'Health Food Store',
       'Brewery', 'Hotel', 'Ballroom', 'Hobby Shop', 'Soccer Field',
       'Theater', 'Paper / Office Supplies Store', 'Steakhouse',
       'Art Gallery', 'Food', 'Ice Cream Shop', 'Café', 'Bar', 'Plaza',
       'Fried Chicken Joint', 'Peruvian Restaurant', 'Music Store',
       'Vegetarian / Vegan Restaurant', 'Bakery', 'Pub', 'Food Court',
       'Bookstore', 'Gym / Fitness Center', 'Diner', 'Cocktail Bar',
       'General Entertainment', 'Discount Store', 'Tram Station',
       'Rock Club', 'Restaurant', 'Hostel', 'Piadineria',
       'Football Stadium', 'Bus Stop', 'Market', 'Beer Bar',
       'Japanese Restaurant', 'Mediterranean Restaurant', 'Neighborhood',
       'Juice Bar', 'Airport', 'Athletics & Sports'], dtype=object)

<h3>Analyzing each neighborhood and its venues</h3>

In [25]:
# one hot encoding
kl_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
kl_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [kl_onehot.columns[-1]] + list(kl_onehot.columns[:-1])
kl_onehot = kl_onehot[fixed_columns]

print(kl_onehot.shape)
kl_onehot.head()

(5628, 287)


Unnamed: 0,Neighborhoods,Abruzzo Restaurant,Accessories Store,Adult Education Center,African Restaurant,Agriturismo,Airport,Airport Service,Airport Terminal,American Restaurant,...,Video Game Store,Video Store,Vietnamese Restaurant,Volleyball Court,Warehouse Store,Water Park,Wine Bar,Wine Shop,Winery,Women's Store
0,Affori,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Affori,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Affori,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Affori,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Affori,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Average the indicator columns per district: each value becomes the share of
# that district's venues belonging to the category.
per_neighborhood = kl_onehot.groupby(["Neighborhoods"])
kl_grouped = per_neighborhood.mean()
kl_grouped = kl_grouped.reset_index()

print(kl_grouped.shape)
kl_grouped

(77, 287)


Unnamed: 0,Neighborhoods,Abruzzo Restaurant,Accessories Store,Adult Education Center,African Restaurant,Agriturismo,Airport,Airport Service,Airport Terminal,American Restaurant,...,Video Game Store,Video Store,Vietnamese Restaurant,Volleyball Court,Warehouse Store,Water Park,Wine Bar,Wine Shop,Winery,Women's Store
0,Affori,0.000000,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
1,Assiano,0.000000,0.0,0.0,0.00,0.00,0.1,0.0,0.0,0.00,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
2,Baggio,0.000000,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.043478,0.000000,0.043478,0.000000,0.000000,0.0,0.0
3,Barona,0.000000,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.020000,0.000000,0.0,0.0
4,Bicocca,0.000000,0.0,0.0,0.00,0.01,0.0,0.0,0.0,0.01,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Turro,0.000000,0.0,0.0,0.01,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.010000,0.0,0.0
73,Vaiano Valle,0.017857,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
74,Vialba,0.000000,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.000000,0.026316,0.000000,0.000000,0.026316,0.0,0.0
75,Vigentino,0.010204,0.0,0.0,0.00,0.00,0.0,0.0,0.0,0.00,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.010204,0.000000,0.0,0.0


In [27]:
# Number of districts in which at least one Chinese restaurant was found.
has_chinese_restaurant = kl_grouped["Chinese Restaurant"] > 0
int(has_chinese_restaurant.sum())

35

<h4>Creating a dataframe for Chinese Restaurant data only</h4>

In [28]:
# Keep only the district name and its share of Chinese restaurants.
chre_columns = ["Neighborhoods", "Chinese Restaurant"]
kl_chre = kl_grouped[chre_columns]

In [29]:
# Preview the Chinese-restaurant share of the first districts.
kl_chre.head()

Unnamed: 0,Neighborhoods,Chinese Restaurant
0,Affori,0.0
1,Assiano,0.0
2,Baggio,0.0
3,Barona,0.0
4,Bicocca,0.02


<h3>Clustering neighborhoods</h3>

In [30]:
# set number of clusters
kclusters = 3

kl_clustering = kl_chre.drop(["Neighborhoods"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(kl_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 2, 0, 2, 0, 0, 0], dtype=int32)

In [31]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
kl_merged = kl_chre.copy()

# add clustering labels
kl_merged["Cluster Labels"] = kmeans.labels_

In [32]:
# Use the singular column name so it matches the district table (nbh_df).
kl_merged = kl_merged.rename(columns={"Neighborhoods": "Neighborhood"})
kl_merged.head()

Unnamed: 0,Neighborhood,Chinese Restaurant,Cluster Labels
0,Affori,0.0,0
1,Assiano,0.0,0
2,Baggio,0.0,0
3,Barona,0.0,0
4,Bicocca,0.02,2


In [33]:
# Attach the coordinates of each district by looking its name up in nbh_df.
coordinates_by_name = nbh_df.set_index("Neighborhood")
kl_merged = kl_merged.join(coordinates_by_name, on="Neighborhood")

print(kl_merged.shape)
kl_merged.head()

(77, 5)


Unnamed: 0,Neighborhood,Chinese Restaurant,Cluster Labels,Latitude,Longitude
0,Affori,0.0,0,45.5141,9.17353
1,Assiano,0.0,0,45.450605,9.061639
2,Baggio,0.0,0,45.46324,9.0927
3,Barona,0.0,0,45.43371,9.1516
4,Bicocca,0.02,2,45.52149,9.21326


In [34]:
print(kl_merged.shape)
# Order the rows by cluster so that the members of each cluster are listed together.
kl_merged = kl_merged.sort_values(by=["Cluster Labels"])
kl_merged

(77, 5)


Unnamed: 0,Neighborhood,Chinese Restaurant,Cluster Labels,Latitude,Longitude
0,Affori,0.000000,0,45.514100,9.173530
55,QT8,0.000000,0,45.486030,9.138410
29,Lampugnano,0.000000,0,45.491630,9.121960
30,Milano Santa Giulia,0.000000,0,45.467960,9.181780
31,Monluè,0.000000,0,45.452340,9.253770
...,...,...,...,...,...
49,Porta Vigentina,0.020000,2,45.453737,9.196119
52,Portello,0.010000,2,45.490200,9.145650
53,Prato Centenaro,0.010000,2,45.506710,9.199210
75,Vigentino,0.020408,2,45.433720,9.201040


In [35]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
rainbow = ['#8000ff', '#008700', '#ff0000']

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(kl_merged['Latitude'], kl_merged['Longitude'], kl_merged['Neighborhood'], kl_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [36]:
# Write the cluster map to an HTML file in the working directory.
map_clusters.save('map_clusters.html')

<h3>Exploring clusters</h3>

<h4>Cluster 0</h4>

In [37]:
# Districts assigned to cluster 0.
in_cluster_0 = kl_merged['Cluster Labels'].eq(0)
kl_merged[in_cluster_0]

Unnamed: 0,Neighborhood,Chinese Restaurant,Cluster Labels,Latitude,Longitude
0,Affori,0.0,0,45.5141,9.17353
55,QT8,0.0,0,45.48603,9.13841
29,Lampugnano,0.0,0,45.49163,9.12196
30,Milano Santa Giulia,0.0,0,45.46796,9.18178
31,Monluè,0.0,0,45.45234,9.25377
33,Muggiano,0.0,0,45.44832,9.06401
34,Niguarda,0.0,0,45.5184,9.19201
35,Nosedo,0.0,0,45.43381,9.22137
37,Ponte Lambro,0.0,0,45.4424,9.2642
39,Porta Genova,0.0,0,45.4579,9.17457


<h4>Cluster 1</h4>

In [38]:
# Districts assigned to cluster 1.
in_cluster_1 = kl_merged['Cluster Labels'].eq(1)
kl_merged[in_cluster_1]

Unnamed: 0,Neighborhood,Chinese Restaurant,Cluster Labels,Latitude,Longitude
51,Porta Volta,0.03,1,45.48151,9.17754
60,Quarto Oggiaro,0.029412,1,45.51674,9.1409
23,Ghisolfa,0.03,1,45.49631,9.1694
72,Turro,0.04,1,45.49452,9.22171
14,Città Studi,0.06,1,45.47708,9.2266
67,San Siro,0.027397,1,45.48074,9.12823
15,Comasina,0.025,1,45.52631,9.15887
32,Morivione,0.03,1,45.44099,9.18781
27,Greco,0.04,1,45.49702,9.21212
43,Porta Nuova,0.03,1,45.47971,9.19247


<h4>Cluster 2</h4>

In [39]:
# Districts assigned to cluster 2.
in_cluster_2 = kl_merged['Cluster Labels'].eq(2)
kl_merged[in_cluster_2]

Unnamed: 0,Neighborhood,Chinese Restaurant,Cluster Labels,Latitude,Longitude
66,San Cristoforo sul Naviglio,0.012048,2,45.44763,9.15458
73,Vaiano Valle,0.017857,2,45.42893,9.2162
69,Stazione di Milano Centrale,0.01,2,45.48416,9.20113
68,Segnano,0.01,2,45.50691,9.20605
38,Porta Garibaldi,0.02,2,45.48065,9.18731
54,Precotto,0.02,2,45.51541,9.22553
4,Bicocca,0.02,2,45.52149,9.21326
6,Bovisasca,0.020833,2,45.51555,9.15094
13,Cimiano,0.016667,2,45.50346,9.2488
16,Conca del Naviglio,0.01,2,45.458009,9.17764


<h4>Observations:</h4>
<p>As the cluster map shows, the neighborhoods fall into three clusters. Cluster 1 (purple) marks neighborhoods with a high concentration of Chinese restaurants. Cluster 2 (green) marks neighborhoods with a moderate number of Chinese restaurants. Cluster 0 (red) marks neighborhoods with few or no Chinese restaurants. Cluster 1 neighborhoods are concentrated in the city centre and in the most frequented areas of Milan, so a new Chinese restaurant opened there would need to be well known in order to compete with the other restaurants in the area. In cluster 2 neighborhoods a new restaurant could compete with the existing ones through good offers and new, high-quality products; the best area is north of the city centre, near Porta Garibaldi. In cluster 0 neighborhoods it would be easy to compete, because there are few or no Chinese restaurants in the area.</p>