In [None]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the heart-disease table, then split off the "num" column:
# `y` holds that column, `X` holds every other column.
df = pd.read_csv("../data/heart_disease.csv")
y = df["num"]
X = df.drop(columns=["num"])

In [None]:
# Elbow method: collect the KMeans inertia (within-cluster sum of squares)
# for k = 2..10 so the next cell can plot it.
#
# The features are standardised first because KMeans is distance-based and
# the pipeline that is eventually saved applies StandardScaler before KMeans;
# fitting on raw columns here would evaluate a different model than the one
# that gets persisted.
X_elbow = StandardScaler().fit_transform(X)

inertia = []
K = range(2, 11)

for k in K:
    # n_init is pinned explicitly: its default changed between scikit-learn
    # releases, which would otherwise make the curve version-dependent.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_elbow)
    inertia.append(kmeans.inertia_)

In [None]:
# Elbow curve: inertia plotted against the number of clusters that was tried.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(K, inertia, marker="o")
ax.set_title("Elbow Method for Optimal k")
ax.set_xlabel("Number of Clusters (k)")
ax.set_ylabel("Inertia")
ax.grid()
plt.show()

In [None]:
# Standardise the features once so that every diagnostic and model in this
# cell uses the same preprocessing as the saved pipeline (StandardScaler ->
# KMeans). Distance-based clustering and PCA on raw columns would be driven by
# whichever columns happen to have the largest numeric range.
X_scaled = StandardScaler().fit_transform(X)

# Silhouette score for each candidate number of clusters.
for k in [2, 3, 4]:
    # n_init is pinned because its default differs across scikit-learn versions.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    print(f"Silhouette Score for k={k}: {score:.3f}")

# Final 2-cluster KMeans assignment, stored on df for plotting.
# NOTE: `kmeans`, `hier` and `pca` are fitted on the standardised features, so
# any later use of them must be given standardised input as well.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
df["Cluster_KMeans"] = kmeans.fit_predict(X_scaled)

# Agglomerative (hierarchical) clustering with the same number of clusters.
hier = AgglomerativeClustering(n_clusters=2)
df["Cluster_Hierarchical"] = hier.fit_predict(X_scaled)

# Two principal components, used by the scatter plots that follow.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

In [None]:
# Scatter of the two PCA components, coloured by the KMeans cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=df["Cluster_KMeans"],
    palette="Set1",
    ax=ax,
)
ax.set_title("KMeans Clusters (PCA reduced to 2D)")
plt.show()

In [None]:
# Scatter of the two PCA components, coloured by the hierarchical cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=df["Cluster_Hierarchical"],
    palette="Set2",
    ax=ax,
)
ax.set_title("Hierarchical Clusters (PCA reduced to 2D)")
plt.show()

In [None]:
# Deployable model: standardise the features, then assign each row to one of
# two KMeans clusters. Step names ("scaler", "kmeans") are part of the
# pipeline's interface and are kept as-is.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    # n_init is pinned explicitly: its default changed between scikit-learn
    # releases, so relying on it would make the saved model depend on the
    # installed version.
    ("kmeans", KMeans(n_clusters=2, n_init=10, random_state=42)),
])
pipeline.fit(X)

In [None]:
# Persist the fitted pipeline. The target directory is created first so that
# a fresh checkout without a models/ folder does not make joblib.dump fail.
model_path = "../models/kmeans_model.pkl"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, model_path)