# Exploratory Analysis (GHQ-28 & KMeans)
This notebook uses the Elbow method to inspect candidate cluster counts and then fits KMeans with `k=3` on GHQ-28-style questionnaire data.


In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from pathlib import Path

# Names of the 28 questionnaire item columns: ghq1 .. ghq28.
ghq_cols = [f'ghq{item}' for item in range(1, 29)]

# Use the real responses when the CSV is present; otherwise build a seeded
# synthetic table with the same column names so the notebook still runs.
data_path = Path('../../data/ghq28_responses.csv')
if not data_path.exists():
    np.random.seed(42)  # fixed seed keeps the synthetic data reproducible
    sample_count = 200
    # One independent draw per item column, each value in {0, 1, 2, 3}.
    df = pd.DataFrame({
        col: np.random.randint(0, 4, size=sample_count)
        for col in ghq_cols
    })
else:
    df = pd.read_csv(data_path)

# Feature matrix for clustering: one row per respondent, one column per item.
item_scores = df[ghq_cols]
X = item_scores.to_numpy()

# Elbow method: fit KMeans once per candidate k and record the
# within-cluster sum of squares (the fitted model's inertia_) for each.
# KMeans cannot fit more clusters than there are samples, so the sweep is
# capped at the number of rows in X; for 10 or more rows this is still 1..10.
wss = []
K = range(1, min(10, len(X)) + 1)
for k in K:
    # Fixed random_state so the curve is reproducible between runs.
    km = KMeans(n_clusters=k, n_init=10, random_state=123).fit(X)
    wss.append(km.inertia_)

# Plot inertia against k; the "elbow" is where adding clusters stops
# giving a substantial reduction.
plt.figure()
plt.plot(list(K), wss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Within-cluster sum of squares')
plt.show()

# Final model: three clusters, with more restarts (n_init=50) than the
# elbow sweep and the same fixed seed for reproducibility.
km3 = KMeans(n_clusters=3, n_init=50, random_state=123)
km3.fit(X)
labels = km3.labels_

# Store cluster ids shifted by one (labels + 1) alongside each respondent,
# then add the per-row sum of the 28 item scores.
df['cluster'] = labels + 1
df['ghq_total'] = df[ghq_cols].sum(axis='columns')

# Mean total score per cluster (left as the last expression so the
# notebook displays it).
by_cluster = df.groupby('cluster')
by_cluster['ghq_total'].mean()
