In [None]:

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

# Load the heart-disease table. When the file carries the raw "num" column
# instead of a ready-made "target", binarise it (num > 0 -> 1) and drop "num".
df = pd.read_csv("../data/heart_disease.csv")
needs_target = "target" not in df.columns and "num" in df.columns
if needs_target:
    df = df.assign(target=(df["num"] > 0).astype(int)).drop(columns=["num"])

# Feature matrix: everything except the label, split by dtype.
X = df.drop(columns=["target"])
num = X.select_dtypes(include="number").columns.tolist()
cat = X.select_dtypes(exclude="number").columns.tolist()

# Numeric columns: median imputation followed by standardisation.
numeric_branch = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
# Non-numeric columns: mode imputation followed by one-hot encoding.
categorical_branch = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

pre = ColumnTransformer([
    ("num", numeric_branch, num),
    ("cat", categorical_branch, cat),
])
Xpre = pre.fit_transform(X)


# Two-cluster KMeans on the preprocessed features; the seed is fixed so the
# assignment is reproducible across runs.
km = KMeans(n_clusters=2, random_state=42, n_init=10)
km.fit(Xpre)
print(
    "Cluster counts:",
    {label: int((km.labels_ == label).sum()) for label in range(km.n_clusters)},
)


In [None]:


# Truncated Ward-linkage dendrogram over the leading rows of `Xpre` (the
# preprocessed feature matrix built in the previous cell).
#
# The cell is best-effort: it never raises. A missing optional dependency and
# a failure while computing or drawing are reported separately so the printed
# message says which of the two actually happened.
try:
    from scipy.cluster.hierarchy import dendrogram, linkage
    import numpy as np
    import matplotlib.pyplot as plt
except ImportError as e:
    print("SciPy not available; skipping dendrogram:", e)
else:
    fig = None
    try:
        # Cap the input at 120 rows. This takes the first rows in file order,
        # not a random sample.
        sample_n = min(120, Xpre.shape[0])
        # Xpre may expose .toarray() (sparse output); linkage is given a
        # dense array in either case.
        Xs = Xpre[:sample_n].toarray() if hasattr(Xpre, "toarray") else Xpre[:sample_n]
        Z = linkage(Xs, method="ward")
        fig = plt.figure(figsize=(10, 5))
        # Show only the top 5 levels of the tree to keep the plot readable.
        dendrogram(Z, truncate_mode="level", p=5)
        plt.title("Hierarchical Clustering — Truncated Dendrogram")
        plt.xlabel("Sample index (cluster size in parentheses)")
        plt.ylabel("Ward linkage distance")
        plt.show()
    except Exception as e:
        # Do not leave a half-built figure open when something went wrong.
        if fig is not None:
            plt.close(fig)
        print("Dendrogram failed:", type(e).__name__, e)
