## KMEANS ##

In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from pathlib import Path

# load data
# Resolve the project root without depending on one machine's layout: take the
# nearest directory, starting at the current working directory and walking up,
# that contains data/databasecsv.csv. If none does, fall back to the original
# absolute location so existing setups keep working unchanged.
_start_dir = Path.cwd()
ROOT = next(
    (
        candidate
        for candidate in (_start_dir, *_start_dir.parents)
        if (candidate / "data" / "databasecsv.csv").is_file()
    ),
    Path("/Users/maximeducotterd/Desktop/DSAP_intercantonal_dynamics"),
)
DATA_PATH = ROOT / "data" / "databasecsv.csv"

# The file is semicolon-delimited. Header names are stripped of surrounding
# whitespace because the code below selects columns by exact name.
df = pd.read_csv(DATA_PATH, sep=";")
df.columns = df.columns.str.strip()
print(f"Loaded {len(df)} rows")

# Columns fed to KMeans. The names suggest pre-computed z-scores; nothing is
# re-scaled here (NOTE(review): confirm the columns are already standardised).
feature_cols = [
    "Z_score_rent",
    "avg_income_zscore",
    "z-score_unemployment",
    "Z-score-ownrrate",
    "Z-score-debt",
    "shockexposure_zscore",
]

# Keep only the rows where every clustering feature is present; missing values
# in any other column are tolerated. The copy keeps later column assignments on
# df_clean from touching df.
has_all_features = df[feature_cols].notna().all(axis=1)
df_clean = df.loc[has_all_features].copy()

# 2-D array (rows x features), columns in the order given by feature_cols.
X = df_clean[feature_cols].to_numpy()

# Fit KMeans on the feature matrix. random_state and n_init are both set
# explicitly so the fit does not depend on library defaults or on a random seed.
k = 3
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)

# Attach each row's cluster id. labels_ follows the row order of X, and X was
# built from df_clean, so the positional assignment lines up row for row.
df_clean["cluster"] = kmeans.labels_

# Per-cluster mean of every clustering feature, rounded to 3 decimals for display.
feature_means = df_clean.groupby("cluster")[feature_cols].mean()
cluster_profiles = feature_means.round(3)

# inertia_ is the fitted model's within-cluster sum of squared distances.
print("total inertia:", kmeans.inertia_)
print("\ncluster profiles:")
print(cluster_profiles)

# First rows: identifiers, the assigned cluster, then the clustering inputs.
preview_cols = ["canton", "year", "cluster", *feature_cols]
print("\npreview:")
print(df_clean[preview_cols].head())

# Main cluster per canton: the cluster label that occurs most often among that
# canton's rows in df_clean.
canton_main_cluster = (
    df_clean.groupby("canton")["cluster"]
    # value_counts() is ordered by descending count, so idxmax() returns the
    # most frequent label for the canton.
    .agg(lambda s: s.value_counts().idxmax())
    .reset_index()
    # Sort by cluster, then canton. Sorting on "cluster" alone uses a non-stable
    # algorithm by default, which leaves the order of cantons inside a cluster
    # arbitrary; the secondary key makes the listing deterministic.
    .sort_values(["cluster", "canton"])
)

print("\nmain cluster per canton:")
print(canton_main_cluster)

# Only takes effect if this code is imported as a module with `from ... import *`;
# it then limits the exported names to df_clean.
__all__ = ["df_clean"]


Loaded 285 rows
total inertia: 974.5589466570448

cluster profiles:
         Z_score_rent  avg_income_zscore  z-score_unemployment  \
cluster                                                          
0              -0.651             -0.632                 0.163   
1               1.089              1.152                -0.783   
2               0.129              0.017                 0.555   

         Z-score-ownrrate  Z-score-debt  shockexposure_zscore  
cluster                                                        
0                  -0.321        -0.491                 0.313  
1                  -0.158        -0.083                 0.179  
2                   0.861         1.133                -0.866  

preview:
  canton    year  cluster  Z_score_rent  avg_income_zscore  \
0     AG  2014.0        0          0.11              -0.44   
1     AG  2015.0        0         -0.08              -0.43   
2     AG  2016.0        0          0.02              -0.39   
3     AG  2017.0       