## KMEANS ##

In [10]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from pathlib import Path

# Column names of the features used for clustering. This list is the default
# `feature_cols` of prepare_matrix() and is also used below to build the
# per-cluster profile table and the preview of clustered rows.
# The spellings must match the CSV headers exactly (load_data() only strips
# surrounding whitespace), otherwise prepare_matrix() raises KeyError.
# NOTE(review): the names suggest the values are already z-scored -- confirm
# against the dataset; nothing in this cell standardises them.
FEATURE_COLS = [
    "Z_score_rent",
    "avg_income_zscore",
    "z-score_unemployment",
    "Z-score-ownrrate",
    "Z-score-debt",
    "shockexposure_zscore"
]

def load_data(path="/Users/maximeducotterd/Desktop/DSAP_intercantonal_dynamics/data/databasecsv.csv"):
    """Read the semicolon-separated dataset at *path* into a DataFrame.

    Column headers are stripped of leading/trailing whitespace before the
    frame is returned.

    Raises:
        FileNotFoundError: if *path* does not exist.
    """
    csv_file = Path(path)
    if csv_file.exists():
        table = pd.read_csv(csv_file, sep=";")
        # Headers may carry stray spaces around the names; normalise them so
        # later exact-name lookups succeed.
        trimmed_headers = table.columns.str.strip()
        table.columns = trimmed_headers
        return table

    raise FileNotFoundError(f"Dataset not found at {csv_file.resolve()}")

def prepare_matrix(df, feature_cols=FEATURE_COLS):
    """Select the feature columns of *df* and return them as a numeric matrix.

    Rows with a missing value in any feature column are dropped first; the
    remaining feature values are converted to numeric.

    Args:
        df: input DataFrame containing at least the feature columns.
        feature_cols: iterable of column names to use as features.

    Returns:
        A tuple ``(df_clean, X)`` where ``df_clean`` is a copy of the retained
        rows (feature columns holding the converted numeric values) and ``X``
        is the matching NumPy array, one row per row of ``df_clean``.

    Raises:
        KeyError: if any feature column is absent from *df*.
        ValueError: if a retained feature value cannot be converted to numeric.
    """
    # Accept any iterable (e.g. a tuple): DataFrame indexing with a tuple
    # would be interpreted as a single key rather than a list of columns.
    cols = list(feature_cols)

    missing = [c for c in cols if c not in df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    df_clean = df.dropna(subset=cols).copy()
    numeric = df_clean[cols].apply(pd.to_numeric, errors="coerce")
    # NaNs were already dropped above, so any NaN here comes from a value that
    # could not be parsed as a number.
    if numeric.isna().any().any():
        raise ValueError("NaN detected after conversion to numeric.")

    # Store the converted values back so the returned frame and the matrix
    # agree; otherwise object-dtype columns would survive in df_clean and
    # break numeric aggregations done on it later.
    df_clean[cols] = numeric
    return df_clean, numeric.to_numpy()

def run_kmeans(X, k=3, random_state=0):
    """Fit a KMeans model with *k* clusters on *X* and return the fitted model.

    ``random_state`` is forwarded to KMeans; ``n_init`` is fixed at 10.
    """
    estimator = KMeans(n_clusters=k, random_state=random_state, n_init=10)
    estimator.fit(X)
    return estimator

def assign_clusters(df_clean, model):
    """Return a copy of *df_clean* with a ``cluster`` column from ``model.labels_``.

    The input frame is left untouched; labels are attached positionally, so
    *model* must have been fitted on the rows of *df_clean* in the same order.
    """
    return df_clean.assign(cluster=model.labels_)

# --- Pipeline: load the CSV, build the feature matrix, fit KMeans (k=3),
# --- then attach the resulting label to every retained row.
df = load_data()
df_clean, X = prepare_matrix(df)
model = run_kmeans(X, k=3)
df_clustered = assign_clusters(df_clean, model)

# Inertia of the fitted model, as reported by the estimator.
print("\n=== TOTAL INERTIA ===")
print(model.inertia_)

# Mean of each feature within each cluster, rounded to 3 decimals.
print("\n=== CLUSTER PROFILES ===")
profiles = df_clustered.groupby("cluster")[FEATURE_COLS].mean().round(3)
print(profiles)

# Preview of the labelled rows; requires "canton" and "year" columns in the
# dataset in addition to the feature columns.
print("\n=== FIRST ROWS WITH CLUSTERS ===")
cols_show = ["canton", "year", "cluster"] + FEATURE_COLS
print(df_clustered[cols_show].head())

# For each canton, keep the cluster label that occurs most often across its
# rows. On ties, the label that value_counts() lists first wins.
print("\n=== MAIN CLUSTER PER CANTON ===")
canton_main = (
    df_clustered.groupby("canton")["cluster"]
    .agg(lambda s: s.value_counts().idxmax())
    .reset_index()
    .sort_values("cluster")
)
print(canton_main)




=== TOTAL INERTIA ===
974.5589466570448

=== CLUSTER PROFILES ===
         Z_score_rent  avg_income_zscore  z-score_unemployment  \
cluster                                                          
0              -0.651             -0.632                 0.163   
1               1.089              1.152                -0.783   
2               0.129              0.017                 0.555   

         Z-score-ownrrate  Z-score-debt  shockexposure_zscore  
cluster                                                        
0                  -0.321        -0.491                 0.313  
1                  -0.158        -0.083                 0.179  
2                   0.861         1.133                -0.866  

=== FIRST ROWS WITH CLUSTERS ===
  canton    year  cluster  Z_score_rent  avg_income_zscore  \
0     AG  2014.0        0          0.11              -0.44   
1     AG  2015.0        0         -0.08              -0.43   
2     AG  2016.0        0          0.02              -0.39   
