# Clustering of results

Note: this clustering can be applied to any of the resulting datasets; this notebook applies it to the Petite Couronne + Paris dataset.

In [None]:
# Name of the GeoPackage file holding the results to cluster (read from the data directory below).
gdf_name = "results_pcparis.gpkg" 

In [None]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))
os.environ['USE_PYGEOS'] = '0'
import pysal.lib
import helpers as hs
from importlib import reload
import folium
import pandas as pd
import geopandas as gpd
import r5py
import shapely
import time
import datetime

from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from yellowbrick.cluster import KElbowVisualizer

In [None]:
# Move into the sibling data directory (IPython magic), then read the "cool" layer of the GeoPackage.
# NOTE(review): the `%cd` path is relative, so re-running this cell without restarting the
# kernel changes directory a second time — restart the kernel before re-executing.
%cd ../data
gdf = gpd.read_file(gdf_name, layer="cool")

Inspect the dataset:

In [None]:
# List the columns available in the loaded dataset.
gdf.columns

Exclude forests from the dataset:

In [None]:
# Identifiers of the grid cells treated as forest; they are dropped by index label
# before clustering (see the `drop(woods)` calls below).
# NOTE(review): the identifiers look like 200 m grid-cell codes in EPSG:3035 — confirm against the data source.
woods = ["CRS3035RES200mN2891600E3754800","CRS3035RES200mN2891600E3754400","CRS3035RES200mN2891800E3754000","CRS3035RES200mN2892000E3753800",
        "CRS3035RES200mN2891800E3753200","CRS3035RES200mN2891400E3753200","CRS3035RES200mN2891600E3752000","CRS3035RES200mN2891400E3752400",
        "CRS3035RES200mN2891200E3751800","CRS3035RES200mN2891000E3752200","CRS3035RES200mN2890600E3752000","CRS3035RES200mN2890000E3751400",
        "CRS3035RES200mN2890200E3752600","CRS3035RES200mN2889400E3752600","CRS3035RES200mN2890600E3753600","CRS3035RES200mN2890000E3753200",
        "CRS3035RES200mN2889800E3753600","CRS3035RES200mN2890400E3754000","CRS3035RES200mN2890200E3754000","CRS3035RES200mN2893000E3764400",
        "CRS3035RES200mN2887000E3754400","CRS3035RES200mN2887000E3754600","CRS3035RES200mN2887200E3754800","CRS3035RES200mN2887400E3754800",
        "CRS3035RES200mN2885600E3762800","CRS3035RES200mN2885600E3764600","CRS3035RES200mN2886200E3765200","CRS3035RES200mN2886400E3765400",
        "CRS3035RES200mN2885600E3765400","CRS3035RES200mN2886000E3765800","CRS3035RES200mN2886400E3766200","CRS3035RES200mN2886600E3766400",
        "CRS3035RES200mN2886400E3766800","CRS3035RES200mN2886600E3766800","CRS3035RES200mN2886600E3767000","CRS3035RES200mN2886600E3767200",
        "CRS3035RES200mN2886400E3767600","CRS3035RES200mN2886400E3768000","CRS3035RES200mN2886600E3768200","CRS3035RES200mN2886000E3767000",
        "CRS3035RES200mN2886000E3767600","CRS3035RES200mN2885600E3768000","CRS3035RES200mN2885000E3767000","CRS3035RES200mN2885000E3767400",
        "CRS3035RES200mN2885000E3768200","CRS3035RES200mN2884600E3768200","CRS3035RES200mN2884400E3768000"]

In [None]:
# Index the cells by their 'id' so that the forest cells can later be dropped by identifier.
# Guarded so that re-running this cell does not raise a KeyError once 'id' is already the index.
if gdf.index.name != 'id':
    gdf = gdf.set_index('id')

In [None]:
# Amenity categories whose normalized accessibility columns are used as clustering features.
interestVar = [
    'restaurant',
    'culture and art',
    'education',
    'food_shops',
    'fashion_beauty',
    'supply_shops',
]
# Each feature column is the category name followed by the "_access_normalized" suffix.
clusterVar = [f"{category}_access_normalized" for category in interestVar]

# Feature matrix and full dataset, both without the forest cells.
X = gdf[clusterVar].drop(woods)
inmur = gdf.drop(woods)

In [None]:
# Summary statistics of the clustering features.
X.describe()

In [None]:
# Elbow plot for k = 2..11 using the visualizer's default metric.
elbow_estimator = MiniBatchKMeans(random_state=1)
visualizer = KElbowVisualizer(elbow_estimator, k=(2, 12))
visualizer.fit(X.values)
visualizer.show()

We get an elbow at around 5.

In [None]:
# Same sweep over k, scored with the silhouette metric.
silhouette_estimator = MiniBatchKMeans(random_state=1)
visualizer = KElbowVisualizer(
    silhouette_estimator,
    k=(2, 12),
    metric="silhouette",
)
visualizer.fit(X.values)
visualizer.show()

In [None]:
# Fit 5 clusters on the accessibility features and store the assignment on each cell.
km = MiniBatchKMeans(n_clusters=5, random_state=1)
cluster_ids = km.fit_predict(X.values)
inmur["label"] = cluster_ids

In [None]:
# Map the cells coloured by their cluster label (helper from the local `helpers` module).
hs.folium_color_map(inmur,'label', discrete = True)

In [None]:
# Re-import the helpers module so that edits made to it are picked up without restarting the kernel.
reload(hs)

In [None]:
# Refit with only 2 clusters; this overwrites the previous 'label' column.
km = MiniBatchKMeans(n_clusters=2, random_state=1)
two_cluster_ids = km.fit_predict(X.values)
inmur["label"] = two_cluster_ids

In [None]:
# Map the 2-cluster assignment.
hs.folium_color_map(inmur,'label', discrete = True)

## Clustering on just one variable 

In [None]:
# Replace the feature matrix by the single "CS_aggregated" column, again without the forest cells.
X = gdf[["CS_aggregated"]].drop(woods)

In [None]:
# Silhouette-scored sweep over k for the single-variable clustering.
single_var_estimator = MiniBatchKMeans(random_state=1)
visualizer = KElbowVisualizer(
    single_var_estimator,
    k=(2, 12),
    metric="silhouette",
)
visualizer.fit(X.values)
visualizer.show()

In [None]:
# Elbow plot (visualizer's default metric) for the single-variable clustering.
single_var_estimator = MiniBatchKMeans(random_state=1)
visualizer = KElbowVisualizer(single_var_estimator, k=(2, 12))
visualizer.fit(X.values)
visualizer.show()

In [None]:
# 5 clusters on the single variable; overwrites the 'label' column.
km = MiniBatchKMeans(n_clusters=5, random_state=1)
single_var_ids = km.fit_predict(X.values)
inmur["label"] = single_var_ids

In [None]:
# Map the 5-cluster assignment obtained from the single variable.
hs.folium_color_map(inmur,'label', discrete = True)

In [None]:
# 2 clusters on the single variable; overwrites the 'label' column once more.
km = MiniBatchKMeans(n_clusters=2, random_state=1)
single_var_two_ids = km.fit_predict(X.values)
inmur["label"] = single_var_two_ids

In [None]:
# Map the 2-cluster assignment obtained from the single variable.
hs.folium_color_map(inmur,'label', discrete = True)

## Clustering excluding access to housing:

Interestingly, education comes out as the most important variable — this is unexpected and still needs to be checked.

In [None]:
# Derive relative socio-economic indicators from the raw counts.

# Age-group shares, in % of the individuals ('Ind') of each cell.
inmur['Ind_mineur'] = 100 * (inmur['Ind_0_3'] + inmur['Ind_4_5'] + inmur['Ind_6_10'] + inmur['Ind_11_17']) / inmur['Ind']
inmur['Ind_jeune'] = 100 * inmur['Ind_18_24'] / inmur['Ind']
inmur['Ind_actif'] = 100 * (inmur['Ind_25_39'] + inmur['Ind_40_54'] + inmur['Ind_55_64']) / inmur['Ind']
# 65 and over only: the 55-64 group is already counted in 'Ind_actif', and the label of
# 'Ind_old' in `dico_var` is "65 ans ou plus". Including it here double-counted that group.
inmur['Ind_old'] = 100 * (inmur['Ind_65_79'] + inmur['Ind_80p']) / inmur['Ind']

# Household shares, in % of the households ('Men') of each cell.
inmur['Men_pauv_rela'] = 100 * inmur['Men_pauv'] / inmur['Men']
inmur['Men_1ind_rela'] = 100 * inmur['Men_1ind'] / inmur['Men']
inmur['Men_5ind_rela'] = 100 * inmur['Men_5ind'] / inmur['Men']
inmur['Men_prop_rela'] = 100 * inmur['Men_prop'] / inmur['Men']
inmur['Men_fmp_rela'] = 100 * inmur['Men_fmp'] / inmur['Men']

# Mean of 'Ind_snv' per individual (a sum per cell divided by the number of individuals).
inmur['mean_Ind_snv'] = inmur['Ind_snv'] / inmur['Ind']

In [None]:
# Socio-economic variables kept alongside the cluster label for the boxplots.
varSocioEco = [
    'Men_pauv_rela',
    'Men_1ind_rela',
    'Men_5ind_rela',
    'Men_prop_rela',
    'Men_fmp_rela',
    'Ind',
    'mean_Ind_snv',
    'Ind_mineur',
    'Ind_jeune',
    'Ind_actif',
    'Ind_old',
]

In [None]:
# Maps column names to human-readable display labels (in French);
# passed as `dico_var` to `hs.boxplots` below.
dico_var = {'Ind' : "Nombre d’individus",
'Men' : "Nombre de ménages",
'Men_pauv' : "Nombre de ménages pauvres",
'Men_1ind' : "Nombre de ménages d’un seul individu",
'Men_5ind' : "Nombre de ménages de 5 individus ou plus",
'Men_prop' : "Nombre de ménages propriétaires",
'Men_fmp' : "Nombre de ménages monoparentaux",
'Men_pauv_rela' : "% de ménages pauvres",
'Men_1ind_rela' : "% de ménages d’un seul individu",
'Men_5ind_rela' : "% de ménages de 5 individus ou plus",
'Men_prop_rela' : "% de ménages propriétaires",
'Men_fmp_rela' : "% de ménages monoparentaux",
'Ind_snv' : "Somme des niveaux de vie winsorisés des individus",
'mean_Ind_snv' : "Moyenne des niveaux de vie winsorisés des individus",
'Ind_0_3' : "Nombre d’individus de 0 à 3 ans",
'Ind_4_5' : "Nombre d’individus de 4 à 5 ans",
'Ind_6_10' : "Nombre d’individus de 6 à 10 ans",
'Ind_11_17' : "Nombre d’individus de 11 à 17 ans",
'Ind_18_24' : "Nombre d’individus de 18 à 24 ans",
'Ind_25_39' : "Nombre d’individus de 25 à 39 ans",
'Ind_40_54' : "Nombre d’individus de 40 à 54 ans",
'Ind_55_64' : "Nombre d’individus de 55 à 64 ans",
'Ind_65_79' : "Nombre d’individus de 65 à 79 ans",
'Ind_80p' : "Nombre d’individus de 80 ans ou plus", 
'Ind_mineur' : "% d’individus de 0 à 17 ans",
'Ind_jeune': "% d’individus de 18 à 24 ans",
'Ind_actif' : "% d’individus de 25 à 64 ans",
'Ind_old' :"% d’individus de 65 ans ou plus",
'Log_access_normalized': '2SFCA Logements',
'Log_soc_access_normalized': '2SFCA Logements sociaux',
'restaurant_access_normalized': '2SFCA restaurants',
'culture and art_access_normalized': '2SFCA culture and art',
'education_access_normalized': '2SFCA education',
'food_shops_access_normalized': '2SFCA food shops',
'fashion_beauty_access_normalized': '2SFCA fashion beauty',
'supply_shops_access_normalized': '2SFCA supply shops'
}

In [None]:
# Keep the cluster label together with the socio-economic and accessibility variables.
base_bis = inmur[['label'] + varSocioEco + clusterVar]

# `nb_cluster` was used without ever being defined (NameError on a fresh kernel).
# Every cell that writes 'label' also rebinds `km`, so the fitted model gives the
# number of clusters that matches the current labels.
nb_cluster = km.n_clusters

# NOTE(review): 'label' holds the result of whichever clustering cell was run last —
# confirm that this is the clustering intended for this section.
# Boxplots of the accessibility variables per cluster.
hs.boxplots(base_bis, clusterVar, nb_cluster, dico_var=dico_var, ttest=False)

In [None]:
# Reduced set of socio-economic variables for a more compact set of boxplots.
varSocioEcoShort = [
    'Men_pauv_rela', 'mean_Ind_snv',
    'Ind_mineur', 'Ind_jeune', 'Ind_actif', 'Ind_old',
]

In [None]:
# Boxplots of the short list of socio-economic variables per cluster.
# The cluster count is read from the fitted model (`km` is rebound by every cell that
# writes 'label') rather than from a separately maintained `nb_cluster` variable.
hs.boxplots(base_bis, varSocioEcoShort, km.n_clusters, dico_var=dico_var, ttest=False)