In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import silhouette_score

# Install a blanket "ignore" filter: with no category/module given, this
# suppresses every warning raised after this cell runs.
# NOTE(review): this presumably also hides sklearn deprecation/convergence
# warnings that could matter for the clustering below — consider narrowing it.
warnings.filterwarnings("ignore")

In [14]:
# Load the feature table, using the CSV's "Unnamed: 0" column as the row index
# (the rendered rows show labels such as 10026 and 10027).
df_main = (
    pd.read_csv(
        "zip_code_age_lg_wealth_osm_percentage.csv",
        index_col="Unnamed: 0",
    )
    .drop(index=10020)  # this row is missing all the data
)
print(f'shape: {df_main.shape}')
df_main.head(2)

shape: (62, 27)


Unnamed: 0,female[0-9],male[0-9],female[10-19],male[10-19],female[20-29],male[20-29],female[30-39],male[30-39],female[40-49],male[40-49],...,"50,000-75,000","75,000-100,000","100,000-200,000","200,000-...",stadium_count,shoes_count,restaurant_count,gym_count,clothes_count,clinic_count
10026,0.059201,0.056966,0.058436,0.059289,0.094345,0.08111,0.092021,0.078905,0.083287,0.079199,...,0.136957,0.078261,0.1,0.053261,0.0,0.0,18.0,0.0,0.0,1.0
10027,0.050966,0.050011,0.077194,0.067228,0.13114,0.10853,0.077478,0.069054,0.066776,0.061601,...,0.126863,0.06434,0.086514,0.046529,0.0,2.0,25.0,4.0,5.0,1.0


In [13]:
# Report, for each OSM amenity count column, the fraction of rows whose count
# is exactly zero.
n_rows = len(df_main)
for osm in ('stadium', 'shoes', 'restaurant', 'gym', 'clothes', 'clinic'):
    col_name = f'{osm}_count'
    is_zero = df_main[col_name] == 0
    sparsity = is_zero.sum() / n_rows
    print(f'{osm} sparsity: {sparsity}')

stadium sparsity: 0.9193548387096774
shoes sparsity: 0.43548387096774194
restaurant sparsity: 0.04838709677419355
gym sparsity: 0.5
clothes sparsity: 0.3870967741935484
clinic sparsity: 0.43548387096774194


In [16]:
# Keep every column except 'stadium_count' (presumably dropped because it is
# zero for roughly 92% of the rows — confirm against the sparsity report).
keep = df_main.columns != 'stadium_count'
mask = df_main.columns[keep]  # Index of the retained column labels
df_main = df_main.loc[:, mask]
df_main.shape

(62, 26)

## Design a Model

In [22]:
# Candidate cluster counts for the k-means step of the pipeline.
params = {"kmeans__n_clusters": [2, 3, 4, 5, 10]}

# Fixed seed and explicit n_init so that Restart & Run All reproduces the same
# clustering (KMeans initialisation is random otherwise).
pipe = make_pipeline(StandardScaler(), KMeans(n_init=10, random_state=0))


def silhouette_scorer(estimator, X, y=None):
    """Score a fitted scaler + KMeans pipeline by its silhouette coefficient.

    Parameters
    ----------
    estimator : fitted pipeline whose last step is the clusterer.
    X : samples to score.
    y : ignored; present because GridSearchCV calls scorers as
        ``scorer(estimator, X, y)``.

    Returns
    -------
    float
        Mean silhouette coefficient of the predicted labels, measured in the
        transformed feature space the clusterer actually operates on
        (higher is better).
    """
    # Apply every step except the final clusterer (here: the scaler).
    X_transformed = estimator[:-1].transform(X)
    return silhouette_score(X_transformed, estimator.predict(X))


# Without an explicit ``scoring``, GridSearchCV falls back to ``KMeans.score``
# (negative inertia), which tends to improve as clusters are added, so the
# search is biased toward the largest candidate. Silhouette does not reward
# extra clusters by construction. Clustering is unsupervised, so instead of
# small held-out folds we use a single "split" that fits and scores on the
# full data set.
all_rows = np.arange(len(df_main))
kmeans = GridSearchCV(
    pipe,
    param_grid=params,
    scoring=silhouette_scorer,
    cv=[(all_rows, all_rows)],
)
kmeans.fit(df_main)
kmeans.best_params_

{'kmeans__n_clusters': 10}