# Final Project

### Import Packages

In [451]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import json # library to handle JSON files
import requests # library to handle requests
import urllib.request

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
%matplotlib inline

# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library

import bs4 as bs

print('Libraries imported!')

Libraries imported!


### Map of Milan

The first thing we have to do is to find the coordinates of Milan and plot the corresponding map.

In [452]:
address = 'Milan, MI'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Milan are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Milan are 45.4667971, 9.1904984.


The second thing is to load the geojson file and convert the types of coordinates in a format that can be processed by folium.

In [453]:
import geopandas as gpd
from shapely import wkt
# load the geojson file and convert the coordinates
df = gpd.read_file('nilzone.geojson')
df = df.to_crs(epsg='4326')
df.sort_values('NIL', inplace=True)
df.reset_index(drop=True, inplace=True)

# find the centroid for every neighborhood if we have to plot a marker in a center of neighborhood
df['centroid'] = [(wkt.loads(str(i))).centroid.coords[0] for i in df['geometry']]
df[['lon', 'lat']] = df['centroid'].apply(pd.Series)
js = df.to_json()
df.head()

Unnamed: 0,FID_1,FID_1_1,ID_NIL,NIL,AreaHA,AreaMQ,geometry,centroid,lon,lat
0,22,22,17,ADRIANO,243.15601,2431560.0,"POLYGON ((9.255545564692024 45.51924552866825,...","(9.248710978396767, 45.513415736474585)",9.248711,45.513416
1,19,19,80,AFFORI,207.08936,2070894.0,"POLYGON ((9.182386745553018 45.50665005254634,...","(9.171641173149094, 45.513255439210084)",9.171641,45.513255
2,10,10,55,BAGGIO,347.86,3478600.0,"POLYGON ((9.08381626110204 45.45078121544247, ...","(9.087525533347817, 45.458918009581666)",9.087526,45.458918
3,78,78,52,BANDE NERE,266.38809,2663881.0,"POLYGON ((9.129407793494002 45.45997323953145,...","(9.139429116752884, 45.45924451611272)",9.139429,45.459245
4,40,40,46,BARONA,200.63246,2006325.0,"POLYGON ((9.162613460140667 45.42719827409652,...","(9.156539935775527, 45.431683886004514)",9.15654,45.431684


In [454]:
# Draw the map of Milan with his neighborhood.
map_milan = folium.Map(location=[latitude, longitude], zoom_start=12)

folium.Choropleth(
    geo_data=js,
    fill_color='yellow',
    fill_opacity=0.4,
    line_weight=0.5,
    highlight=True).add_to(map_milan)

map_milan

### Family Dataset

After viewing the map of Milan, I've loaded the family dataset because I want to know what are the neighborhoods with the largest number of people that lives in. So, for better explanation, I've translate the name of the columns and the values in the columns.

In [455]:
# Importing the family dataset
df_family = pd.read_csv('ds139_popolazione_famiglie_tipologia_quartiere_2007-2015.csv', sep=';')

In [456]:
# Translating the name of the columns
df_family.columns = ['Neighborhood',
                     'Class_age_householder',
                     'Gender_householder',
                     'Number_components',
                     'Family_type',
                     'Citizenship',
                     'families_2007',
                     'families_2008',
                     'families_2009',
                     'families_2010',
                     'families_2011',
                     'families_2012',
                     'families_2013',
                     'families_2014',
                     'families_2015']

In [457]:
df_family.head().T

Unnamed: 0,0,1,2,3,4
Neighborhood,Adriano,Adriano,Adriano,Adriano,Adriano
Class_age_householder,18-34,18-34,18-34,18-34,18-34
Gender_householder,Femmine,Femmine,Femmine,Femmine,Femmine
Number_components,1,1,1,1,1
Family_type,Single,Single,Single,Single,Single
Citizenship,Altri paesi,Cina,Ecuador,Egitto,Filippine
families_2007,33,9,3,0,7
families_2008,32,7,2,0,7
families_2009,38,10,2,0,6
families_2010,38,10,4,0,6


In [458]:
lst = list(df_family['Gender_householder'].unique())+\
      list(df_family['Citizenship'].unique())+\
      list(df_family['Family_type'].unique())

lst_translated = ['Female', 'Male', 'Other_countries', 'China', 'Ecuador', 'Egypt', 'Philippines', 'Italy', 'Peru',
                  'Sri Lanka', 'Single', 'Other_types', 'Cohabiting_couple_without_children', 'Married_couple_without_children',
                  'Single_parent_with_at_least_one_minor_child', 'Cohabiting_couple_with_at_least_one_minor_child',
                  'Married_couple_with_at_least_one_minor_child', 'Married_couple_with_children_all_adult',
                  'Single_parent_with_children_all_adult', 'Cohabiting_couple_with_children_all_adult']

assert len(lst) == len(lst_translated), 'The len of the two list are different'

dict_for_translate = {k:v for k,v in zip(lst,lst_translated)}
print(dict_for_translate)

{'Femmine': 'Female', 'Maschi': 'Male', 'Altri paesi': 'Other_countries', 'Cina': 'China', 'Ecuador': 'Ecuador', 'Egitto': 'Egypt', 'Filippine': 'Philippines', 'Italia': 'Italy', 'Perù': 'Peru', 'Sri Lanka': 'Sri Lanka', 'Single': 'Single', 'Altre tipologie': 'Other_types', 'Coppia convivente senza figli': 'Cohabiting_couple_without_children', 'Coppia sposata senza figli': 'Married_couple_without_children', 'Monogenitore con almeno un figlio minorenne': 'Single_parent_with_at_least_one_minor_child', 'Coppia convivente con almeno un figlio minorenne': 'Cohabiting_couple_with_at_least_one_minor_child', 'Coppia sposata con almeno un figlio minorenne': 'Married_couple_with_at_least_one_minor_child', 'Coppia sposata con figli tutti maggiorenni': 'Married_couple_with_children_all_adult', 'Monogenitore con figli tutti maggiorenni': 'Single_parent_with_children_all_adult', 'Coppia convivente con figli tutti maggiorenni': 'Cohabiting_couple_with_children_all_adult'}


In [459]:
columns_to_translate = ['Gender_householder', 'Citizenship', 'Family_type']

for i in columns_to_translate:
    df_family[i] = df_family[i].map(dict_for_translate)

In [460]:
df_family.head()

Unnamed: 0,Neighborhood,Class_age_householder,Gender_householder,Number_components,Family_type,Citizenship,families_2007,families_2008,families_2009,families_2010,families_2011,families_2012,families_2013,families_2014,families_2015
0,Adriano,18-34,Female,1,Single,Other_countries,33,32,38,38,43,56,41,36,40
1,Adriano,18-34,Female,1,Single,China,9,7,10,10,6,8,13,11,14
2,Adriano,18-34,Female,1,Single,Ecuador,3,2,2,4,4,3,2,3,3
3,Adriano,18-34,Female,1,Single,Egypt,0,0,0,0,0,1,1,1,2
4,Adriano,18-34,Female,1,Single,Philippines,7,7,6,6,12,17,17,11,12


The family dataset has some columns that are irrelevant for this project, family columns from 2007 to 2014, so I decided to drop these columns. Moreover, I decide to create a new features that is the product of Number of components of a family type and the Number of that family type.

In [461]:
columns_to_drop = df_family.columns
df_family.drop(labels=columns_to_drop[6:-1], inplace=True, axis=1)

In [462]:
df_family.head()

Unnamed: 0,Neighborhood,Class_age_householder,Gender_householder,Number_components,Family_type,Citizenship,families_2015
0,Adriano,18-34,Female,1,Single,Other_countries,40
1,Adriano,18-34,Female,1,Single,China,14
2,Adriano,18-34,Female,1,Single,Ecuador,3
3,Adriano,18-34,Female,1,Single,Egypt,2
4,Adriano,18-34,Female,1,Single,Philippines,12


In [463]:
# Create a new feature
df_family['Count_comp_family_type'] = df_family['Number_components'] * df_family['families_2015']

In [464]:
df_pivot_citizenship = df_family.pivot_table(index='Neighborhood', 
                                              columns='Citizenship', 
                                              values='Number_components', 
                                              aggfunc=np.sum,
                                              fill_value=0.0,
                                              margins=True)

In [465]:
df_pivot_citizenship = df_pivot_citizenship.div(df_pivot_citizenship.iloc[:,-1], axis=0 ).round(2)
df_pivot_citizenship.reset_index(inplace=True)
df_pivot_citizenship.drop(88, axis=0, inplace=True)
df_pivot_citizenship.columns.name = None
df_pivot_citizenship.head()

Unnamed: 0,Neighborhood,China,Ecuador,Egypt,Italy,Other_countries,Peru,Philippines,Sri Lanka,All
0,Adriano,0.12,0.09,0.07,0.27,0.13,0.1,0.13,0.08,1.0
1,Affori,0.14,0.09,0.08,0.23,0.16,0.1,0.12,0.08,1.0
2,Baggio,0.07,0.1,0.08,0.26,0.15,0.13,0.14,0.07,1.0
3,Bande Nere,0.09,0.09,0.08,0.24,0.17,0.12,0.14,0.09,1.0
4,Barona,0.04,0.08,0.1,0.36,0.16,0.09,0.1,0.07,1.0


### Map of people that live in Milan divided per Neighborhood

So, for the first visualization I decided to visualize the different number of people that live in every neighboorhood. For the second visualization I want to visualize the density (people in squared kilometers) for every neighborhood.

In [466]:
# Grouped by neighborhood
grouped = df_family.groupby(['Neighborhood'])['Count_comp_family_type'].sum().reset_index()
grouped['ID_neighborhood'] = df['ID_NIL']

# Create the density Feature
grouped['Area_neighborhood_KMQ'] = df['AreaMQ']/1e+6
grouped['Density_pop_kmq'] = round(grouped['Count_comp_family_type']/grouped['Area_neighborhood_KMQ'], 2)
grouped.head(10)
grouped.to_csv('prova.csv')

In [467]:
grouped = grouped.join(df_pivot_citizenship.set_index('Neighborhood'), on='Neighborhood')
grouped.drop('All', axis=1, inplace=True)

In [468]:
grouped.head()

Unnamed: 0,Neighborhood,Count_comp_family_type,ID_neighborhood,Area_neighborhood_KMQ,Density_pop_kmq,China,Ecuador,Egypt,Italy,Other_countries,Peru,Philippines,Sri Lanka
0,Adriano,15220,17,2.43156,6259.36,0.12,0.09,0.07,0.27,0.13,0.1,0.13,0.08
1,Affori,24174,80,2.070894,11673.22,0.14,0.09,0.08,0.23,0.16,0.1,0.12,0.08
2,Baggio,28704,55,3.4786,8251.6,0.07,0.1,0.08,0.26,0.15,0.13,0.14,0.07
3,Bande Nere,42710,52,2.663881,16033.0,0.09,0.09,0.08,0.24,0.17,0.12,0.14,0.09
4,Barona,15815,46,2.006325,7882.57,0.04,0.08,0.1,0.36,0.16,0.09,0.1,0.07


In [469]:
map_milan = folium.Map(location=[latitude, longitude], zoom_start=12)

threshold_scale = np.linspace(grouped['Count_comp_family_type'].min(),
                              grouped['Count_comp_family_type'].max(),
                              10, dtype=int)
threshold_scale = threshold_scale.tolist() # change the numpy array to a list
threshold_scale[-1] = threshold_scale[-1] + 1

folium.Choropleth(
    geo_data=js,
    data=grouped,
    columns=['ID_neighborhood', 'Count_comp_family_type'],
    key_on='feature.properties.ID_NIL',
    threshold_scale=threshold_scale,
    fill_color='YlOrRd', 
    fill_opacity=0.8,
    line_weight=0.5,
    legend_name='Number of people that lives in the Neighborhood',
    reset=True,
    highlight=True).add_to(map_milan)

map_milan

In [470]:
map_milan = folium.Map(location=[latitude, longitude], zoom_start=12)

threshold_scale = np.linspace(grouped['Density_pop_kmq'].min(),
                              grouped['Density_pop_kmq'].max(),
                              10, dtype=int)
threshold_scale = threshold_scale.tolist() # change the numpy array to a list
threshold_scale[-1] = threshold_scale[-1] + 1

folium.Choropleth(
    geo_data=js,
    data=grouped,
    columns=['ID_neighborhood', 'Density_pop_kmq'],
    key_on='feature.properties.ID_NIL',
    threshold_scale=threshold_scale,
    fill_color='YlOrRd', 
    fill_opacity=0.8,
    line_weight=0.5,
    legend_name='Population Density for every Neighborhood',
    reset=True,
    highlight=True).add_to(map_milan)

map_milan

### Foursquare API

This section is dedicated 

In [471]:
CLIENT_ID = 'ZF0VQDSRV1CDBHSSSVQREUHPHD4ONSJTTFDLWWNGSGTXOALM' # your Foursquare ID
CLIENT_SECRET = 'SHT3VBELFATUUNJKXLC4LYVSWOASCK1S2WG2WUHNYW1QQ120' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ZF0VQDSRV1CDBHSSSVQREUHPHD4ONSJTTFDLWWNGSGTXOALM
CLIENT_SECRET:SHT3VBELFATUUNJKXLC4LYVSWOASCK1S2WG2WUHNYW1QQ120


In [472]:
def getNearbyVenues(names, ids, latitudes, longitudes, radius=2000, LIMIT=400):
    
    venues_list=[]
    count=0
    for name, idx, lat, lng in zip(names, ids, latitudes, longitudes):
        count+=1
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name,
            idx,
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood',
                  'ID_neighborhood',
                  'Postcode Latitude', 
                  'Postcode Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    print('Completed. Processed {} values'.format(count))
    return(nearby_venues)

In [473]:
milan_venues = getNearbyVenues(names=df['NIL'],
                               ids=df['ID_NIL'],
                               latitudes=df['lat'],
                               longitudes=df['lon'])

Completed. Processed 88 values


In [474]:
duplicated = milan_venues[milan_venues[['Venue', 'Venue Latitude', 'Venue Longitude']].duplicated(keep=False)]
duplicated.sort_values('Venue', inplace=True)
duplicated.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Neighborhood,ID_neighborhood,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
2771,GUASTALLA,4,45.462549,9.202238,& Other Stories,45.464919,9.193145,Women's Store
1722,DUOMO,1,45.463037,9.187295,& Other Stories,45.464919,9.193145,Women's Store
785,BRERA,2,45.473582,9.188504,10 Corso Como,45.481852,9.187831,Boutique
1959,FARINI,78,45.493293,9.174953,10 Corso Como,45.481852,9.187831,Boutique
4647,PARCO SEMPIONE,8,45.473461,9.176599,10 Corso Como,45.481852,9.187831,Boutique


In [475]:
from shapely.geometry import Point

lst = []
for index, row in duplicated.iterrows():
    point = Point(row['Venue Longitude'],row['Venue Latitude'])
    polygon = df['geometry'][df['ID_NIL']==row['ID_neighborhood']]
    if polygon.contains(point).all()==True:
        continue
    else:
        lst.append(row.name)
        #print('KO, the row to delete is {}'.format(row.name))

In [476]:
milan_venues.shape

(7329, 8)

In [477]:
milan_venues.drop(lst, inplace=True)
milan_venues.reset_index(inplace=True, drop=True)

## Find the frequency of each Venues Category in every Neighborhood

In [479]:
milan_onehot = pd.get_dummies(milan_venues['Venue Category'])
milan_onehot['ID_neighborhood'] = milan_venues['ID_neighborhood']

fixed_columns = [milan_onehot.columns[-1]] + list(milan_onehot.columns[:-1])
milan_onehot = milan_onehot[fixed_columns]

milan_onehot.shape

(1936, 260)

In [480]:
milan_grouped = milan_onehot.groupby('ID_neighborhood').mean().reset_index()
milan_grouped.head()

Unnamed: 0,ID_neighborhood,Accessories Store,Adult Education Center,African Restaurant,Agriturismo,Airport,Airport Lounge,Airport Terminal,American Restaurant,Amphitheater,...,Tuscan Restaurant,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Volleyball Court,Water Park,Wine Bar,Wine Shop,Winery,Women's Store
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.013333,0.0,0.0,0.013333
1,2,0.021739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.0


In [481]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [482]:
num_top_venues = 50

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['ID_neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['ID_neighborhood'] = milan_grouped['ID_neighborhood']

for ind in np.arange(milan_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(milan_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,ID_neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,...,41th Most Common Venue,42th Most Common Venue,43th Most Common Venue,44th Most Common Venue,45th Most Common Venue,46th Most Common Venue,47th Most Common Venue,48th Most Common Venue,49th Most Common Venue,50th Most Common Venue
0,1,Boutique,Italian Restaurant,Hotel,Pizza Place,Dessert Shop,Art Museum,Ice Cream Shop,Plaza,Bookstore,...,Toy / Game Store,Art Gallery,Fast Food Restaurant,Farmers Market,Farm,Government Building,Event Space,Garden Center,Greek Restaurant,Film Studio
1,2,Hotel,Ice Cream Shop,Japanese Restaurant,Italian Restaurant,Cocktail Bar,Wine Bar,Bakery,Theater,Art Museum,...,Film Studio,Farmers Market,Furniture / Home Store,Farm,Event Space,Electronics Store,Diner,Airport Terminal,Dim Sum Restaurant,American Restaurant
2,3,Planetarium,Park,Art Gallery,Women's Store,Farmers Market,Fast Food Restaurant,Film Studio,Flea Market,Flower Shop,...,Hobby Shop,History Museum,Historic Site,Health Food Store,Hardware Store,Harbor / Marina,Gym Pool,Gym / Fitness Center,Gym,Greek Restaurant
3,4,Italian Restaurant,Park,Bakery,Hotel,Cocktail Bar,Monument / Landmark,Music School,Men's Store,Museum,...,Hockey Arena,Hobby Shop,History Museum,Historic Site,Health Food Store,Hardware Store,Harbor / Marina,Gym Pool,Gym / Fitness Center,Gym
4,5,Wine Bar,Bar,Cocktail Bar,Japanese Restaurant,Café,Restaurant,Women's Store,Flea Market,Farmers Market,...,Hostel,Hockey Arena,Hobby Shop,History Museum,Historic Site,Health Food Store,Hardware Store,Harbor / Marina,Gym Pool,Gym / Fitness Center


## Cluster the Neighborhood and plot the results

In [483]:
columns_to_add = ['Count_comp_family_type',
                  'ID_neighborhood', 
                  'Area_neighborhood_KMQ',
                  'Density_pop_kmq', 
                  'China', 
                  'Ecuador',
                  'Egypt', 
                  'Italy', 
                  'Other_countries',
                  'Peru',
                  'Philippines']

milan_grouped_clustering = grouped[columns_to_add].join(milan_grouped.set_index('ID_neighborhood'), on='ID_neighborhood')

In [484]:
from sklearn.preprocessing import Normalizer

#scale the features
scaler = Normalizer()
milan_grouped_standardized = milan_grouped_clustering.copy()

milan_grouped_standardized[['Count_comp_family_type', 
                          'Area_neighborhood_KMQ', 
                          'Density_pop_kmq']] = scaler.fit_transform(milan_grouped_clustering[['Count_comp_family_type', 
                                                                                               'Area_neighborhood_KMQ', 
                                                                                               'Density_pop_kmq']])

milan_grouped_standardized.head()

Unnamed: 0,Count_comp_family_type,ID_neighborhood,Area_neighborhood_KMQ,Density_pop_kmq,China,Ecuador,Egypt,Italy,Other_countries,Peru,...,Tuscan Restaurant,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Volleyball Court,Water Park,Wine Bar,Wine Shop,Winery,Women's Store
0,0.924843,17,0.000148,0.38035,0.12,0.09,0.07,0.27,0.13,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.900508,80,7.7e-05,0.43484,0.14,0.09,0.08,0.23,0.16,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.961076,55,0.000116,0.276283,0.07,0.1,0.08,0.26,0.15,0.13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.936208,52,5.8e-05,0.351445,0.09,0.09,0.08,0.24,0.17,0.12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,0.0
4,0.894991,46,0.000114,0.446085,0.04,0.08,0.1,0.36,0.16,0.09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [486]:
# set number of clusters
kclusters = 5

milan_for_cluster = milan_grouped_standardized.drop('ID_neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(milan_for_cluster)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 0, 1, 4, 2, 1, 0, 2, 2, 2, 2, 0, 2,
       3, 2, 0, 0, 2, 2, 0, 1, 0, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0, 0, 1, 2,
       0, 2, 0, 2, 4, 0, 0, 0, 0, 0, 4, 2, 1, 2, 2, 2, 2, 0, 2, 2, 0, 2,
       1, 0, 2, 0, 1, 2, 2, 2, 0, 1, 2, 2, 2, 1, 1, 2, 2, 0, 2, 0, 2, 2],
      dtype=int32)

In [487]:
milan_grouped_clustering['Cluster Labels'] = kmeans.labels_
milan_grouped_clustering.head()

Unnamed: 0,Count_comp_family_type,ID_neighborhood,Area_neighborhood_KMQ,Density_pop_kmq,China,Ecuador,Egypt,Italy,Other_countries,Peru,...,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Volleyball Court,Water Park,Wine Bar,Wine Shop,Winery,Women's Store,Cluster Labels
0,15220,17,2.43156,6259.36,0.12,0.09,0.07,0.27,0.13,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,24174,80,2.070894,11673.22,0.14,0.09,0.08,0.23,0.16,0.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,28704,55,3.4786,8251.6,0.07,0.1,0.08,0.26,0.15,0.13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,42710,52,2.663881,16033.0,0.09,0.09,0.08,0.24,0.17,0.12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,0.0,0
4,15815,46,2.006325,7882.57,0.04,0.08,0.1,0.36,0.16,0.09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [488]:
grouped = milan_grouped_clustering[['ID_neighborhood','Cluster Labels']].join(grouped.set_index('ID_neighborhood'), on='ID_neighborhood')
grouped.head()

Unnamed: 0,ID_neighborhood,Cluster Labels,Neighborhood,Count_comp_family_type,Area_neighborhood_KMQ,Density_pop_kmq,China,Ecuador,Egypt,Italy,Other_countries,Peru,Philippines,Sri Lanka
0,17,0,Adriano,15220,2.43156,6259.36,0.12,0.09,0.07,0.27,0.13,0.1,0.13,0.08
1,80,0,Affori,24174,2.070894,11673.22,0.14,0.09,0.08,0.23,0.16,0.1,0.12,0.08
2,55,0,Baggio,28704,3.4786,8251.6,0.07,0.1,0.08,0.26,0.15,0.13,0.14,0.07
3,52,0,Bande Nere,42710,2.663881,16033.0,0.09,0.09,0.08,0.24,0.17,0.12,0.14,0.09
4,46,0,Barona,15815,2.006325,7882.57,0.04,0.08,0.1,0.36,0.16,0.09,0.1,0.07


In [489]:
milan_final_df = grouped.join(neighborhoods_venues_sorted.set_index('ID_neighborhood'), on='ID_neighborhood')
milan_final_df = milan_final_df.join(df[['lat', 'lon', 'ID_NIL']].set_index('ID_NIL'), on='ID_neighborhood')
milan_final_df.head()

Unnamed: 0,ID_neighborhood,Cluster Labels,Neighborhood,Count_comp_family_type,Area_neighborhood_KMQ,Density_pop_kmq,China,Ecuador,Egypt,Italy,...,43th Most Common Venue,44th Most Common Venue,45th Most Common Venue,46th Most Common Venue,47th Most Common Venue,48th Most Common Venue,49th Most Common Venue,50th Most Common Venue,lat,lon
0,17,0,Adriano,15220,2.43156,6259.36,0.12,0.09,0.07,0.27,...,Hardware Store,Cultural Center,Health Food Store,Creperie,Historic Site,Cosmetics Shop,History Museum,Hobby Shop,45.513416,9.248711
1,80,0,Affori,24174,2.070894,11673.22,0.14,0.09,0.08,0.23,...,Flea Market,Hardware Store,Greek Restaurant,Garden,Government Building,Gourmet Shop,Golf Course,Go Kart Track,45.513255,9.171641
2,55,0,Baggio,28704,3.4786,8251.6,0.07,0.1,0.08,0.26,...,Hockey Arena,Hobby Shop,History Museum,Historic Site,Health Food Store,Hardware Store,Harbor / Marina,Gym Pool,45.458918,9.087526
3,52,0,Bande Nere,42710,2.663881,16033.0,0.09,0.09,0.08,0.24,...,Hockey Arena,Hobby Shop,History Museum,Historic Site,Hardware Store,Harbor / Marina,Gym Pool,Gym / Fitness Center,45.459245,9.139429
4,46,0,Barona,15815,2.006325,7882.57,0.04,0.08,0.1,0.36,...,Frozen Yogurt Shop,Food Truck,Go Kart Track,Hobby Shop,History Museum,Historic Site,Health Food Store,Hardware Store,45.431684,9.15654


In [490]:
milan_final_df['Cluster Labels'] = milan_final_df['Cluster Labels'].astype('category')
milan_final_df['Cluster Labels'].dtype

CategoricalDtype(categories=[0, 1, 2, 3, 4], ordered=False)

In [491]:
map_milan = folium.Map(location=[latitude, longitude], zoom_start=12)

x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]


folium.Choropleth(
    geo_data=js,
    fill_color='yellow',
    fill_opacity=0.4,
    line_weight=0.5).add_to(map_milan)

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(milan_final_df['lat'], milan_final_df['lon'], milan_final_df['Neighborhood'], milan_final_df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=7,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.9).add_to(map_milan)
       
map_milan