K-Means Clustering of the census

In [53]:
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os

In [54]:
# Relative path to the preprocessed CSV file that this notebook analyses
DATA_PATH = "../data/1832_v4_preprocessed.csv"
# Load the whole file into a DataFrame; column dtypes are inferred by pandas
df = pd.read_csv(DATA_PATH)
# Show the inferred dtype of every column as a sanity check on the load
df.dtypes

nom_rue                  object
no_maison                 int64
chef_annee_naissance    float64
chef_origine             object
chef_annee_arrivee      float64
chef_vocation            object
page                      int64
division                  int64
chef_vocation_class      object
chef_origine_class       object
division_class           object
division_name            object
dtype: object

In [55]:
# Cast the two grouped label columns (origin class and vocation class) to
# pandas' categorical dtype. The raw chef_origine / chef_vocation text
# columns are left untouched.
df = df.astype(
    {
        "chef_origine_class": "category",
        "chef_vocation_class": "category",
    }
)

# Display the dtypes again to confirm the conversion
df.dtypes

nom_rue                   object
no_maison                  int64
chef_annee_naissance     float64
chef_origine              object
chef_annee_arrivee       float64
chef_vocation             object
page                       int64
division                   int64
chef_vocation_class     category
chef_origine_class      category
division_class            object
division_name             object
dtype: object

In [56]:
# Baseline row count of the full frame, taken before any rows with missing
# values are dropped, so the effect of the NaN filtering can be measured.
len(df)

3803

In [57]:
# Restrict the frame to the two categorical features plus the division
# column, discarding every row in which any of the three is missing.
df_cluster = df[["chef_origine_class", "chef_vocation_class", "division"]].dropna()

# Number of rows that survive the NaN filter
len(df_cluster)

2470

In [58]:
# One-hot encode the two categorical features only. The columns are named
# explicitly so that nothing else stored on df_cluster can end up in the
# feature matrix: neither the division column nor the cluster-assignment
# columns that later cells add to df_cluster (relevant when this cell is
# re-run after them).
# division is deliberately left out: it is the value the clusters are meant
# to be compared against, not an input feature.
df_cluster_dummies = pd.get_dummies(
    df_cluster[["chef_origine_class", "chef_vocation_class"]]
)

df_cluster_dummies.head()

Unnamed: 0,chef_origine_class_aigle,chef_origine_class_angleterre,chef_origine_class_aubonne,chef_origine_class_avenches,chef_origine_class_cossonay,chef_origine_class_echallens,chef_origine_class_france,chef_origine_class_fribourg,chef_origine_class_geneve,chef_origine_class_grandson,...,chef_origine_class_vaud,chef_origine_class_vevey,chef_origine_class_yverdon,chef_vocation_class_administration,chef_vocation_class_agricole,chef_vocation_class_artisanat,chef_vocation_class_commerce,chef_vocation_class_construction,chef_vocation_class_rente,chef_vocation_class_service
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [59]:
# Number of clusters to fit.
# NOTE(review): the rationale for choosing 18 is not shown in this notebook;
# confirm it against the intended grouping.
N_CLUSTERS = 18

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# "auto" (a single initialisation for k-means++) in version 1.4, so relying
# on the default gives different clusterings depending on the installed
# version. random_state fixes the seed of the centroid initialisation.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=0)
kmeans.fit(df_cluster_dummies)

KMeans(n_clusters=18, random_state=0)

In [60]:
# Assign every row of the one-hot feature matrix to its closest fitted
# K-Means centroid; the result is one integer cluster id per row.
cluster_labels = kmeans.predict(df_cluster_dummies)
# Echo the label array as the cell output
cluster_labels

array([1, 4, 4, ..., 6, 0, 0])

In [62]:
# Store the K-Means assignment next to the original categorical values.
# The labels are taken from the fitted model rather than from the shared
# `cluster_labels` variable: that name is reassigned to the agglomerative
# labels further down, so re-running this cell out of order would otherwise
# silently write the wrong labels into the "cluster" column.
df_cluster["cluster"] = kmeans.predict(df_cluster_dummies)

# .loc slices by index *label* (inclusive on both ends), so this shows the
# surviving rows whose original index lies in 0..50, not the first 50 rows.
df_cluster.loc[0:50]

Unnamed: 0,chef_origine_class,chef_vocation_class,division,cluster
1,rolle,artisanat,1,1
2,moudon,service,1,4
6,nyon,service,1,4
8,lausanne,rente,1,5
9,lausanne,rente,1,5
11,la_vallee,commerce,1,9
12,vaud,artisanat,1,1
14,cossonay,rente,1,16
15,cossonay,rente,1,16
16,lausanne,rente,1,5


In [63]:
# Second approach: hierarchical (agglomerative) clustering on the same
# one-hot feature matrix, with the same number of clusters as K-Means.
N_CLUSTERS = 18
agg = AgglomerativeClustering(n_clusters=N_CLUSTERS)

# fit_predict fits the model and returns its labels_ array in a single call
cluster_labels = agg.fit_predict(df_cluster_dummies)

# Keep the agglomerative assignment in its own column so it can be compared
# side by side with the K-Means "cluster" column
df_cluster["cluster_agg"] = cluster_labels

# Label-based slice: rows whose index label lies between 0 and 50 inclusive
df_cluster.loc[0:50]

Unnamed: 0,chef_origine_class,chef_vocation_class,division,cluster,cluster_agg
1,rolle,artisanat,1,1,7
2,moudon,service,1,4,2
6,nyon,service,1,4,2
8,lausanne,rente,1,5,6
9,lausanne,rente,1,5,6
11,la_vallee,commerce,1,9,3
12,vaud,artisanat,1,1,4
14,cossonay,rente,1,16,0
15,cossonay,rente,1,16,0
16,lausanne,rente,1,5,6
