In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score
import seaborn as sns

# 1. Load Data
# Read the dataset into a DataFrame; swap in your own file path or URL as needed.
df = pd.read_csv(filepath_or_buffer='DARWIN_standardized.csv')
# Quick sanity check on the load: prints the (rows, columns) tuple.
print("Data shape:", df.shape)

In [None]:
# 2. Preprocessing
# Split the loaded frame into a feature matrix `X` and (optional) labels `y`.
#
# Columns that must never be used as model features:
#   - 'class'   : the label column (may be absent in unlabeled data)
#   - 'ID'      : the identifier column (may be absent)
#   - 'cluster' : written onto `df` by the cluster-interpretation cell; excluding it
#                 keeps this cell idempotent when it is re-run after that cell.
NON_FEATURE_COLS = ['class', 'ID', 'cluster']

# errors='ignore' drops only the listed columns that actually exist, so a file
# without 'ID' (or without 'class') no longer raises KeyError.
X = df.drop(columns=NON_FEATURE_COLS, errors='ignore')

# Labels are optional; downstream cells check `y is not None` before using them.
y = df['class'] if 'class' in df.columns else None

In [None]:
# 3. PCA for Visualization
# Project the feature matrix onto its first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Colour the points by label only when labels exist and at least one is non-missing;
# otherwise draw a plain scatter with neither hue nor palette.
labels_unusable = y is None or y.isnull().all()
hue_kwargs = {} if labels_unusable else {'hue': y, 'palette': 'viridis'}

# Plot PCA
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], ax=ax, **hue_kwargs)
# The title reports the share of variance captured by each plotted component.
ax.set_title(f"PCA (Variance: PC1={pca.explained_variance_ratio_[0]:.1%}, PC2={pca.explained_variance_ratio_[1]:.1%})")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
plt.show()

In [None]:
# 4. K-Means Clustering
# Clusters the 2-D PCA projection (`X_pca`), not the original feature matrix.

# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto'
# in version 1.4 (and emits a FutureWarning in 1.2-1.3 when it is left unset), so
# relying on the default makes the clustering differ between library versions.
KMEANS_N_INIT = 10

# Candidate cluster counts for the elbow plot. Defined once so the fitting loop
# and the plot's x-axis cannot drift apart.
elbow_k_values = range(1, 6)

# Elbow Method: record the within-cluster sum of squares (inertia) for each k.
inertias = []
for k in elbow_k_values:
    kmeans = KMeans(n_clusters=k, n_init=KMEANS_N_INIT, random_state=42)
    kmeans.fit(X_pca)
    inertias.append(kmeans.inertia_)

plt.figure(figsize=(8, 4))
plt.plot(elbow_k_values, inertias, marker='o')
plt.title("Elbow Method for Optimal K")
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.show()

# Run K-Means (assuming K=2 for AD vs Control)
# This rebinds `kmeans` to the final model; later cells read its cluster_centers_.
kmeans = KMeans(n_clusters=2, n_init=KMEANS_N_INIT, random_state=42)
clusters = kmeans.fit_predict(X_pca)

# Evaluation
# The Adjusted Rand Index needs a ground-truth label for every sample, so it is
# reported only when `y` exists and contains no NaN values.
if y is not None and not y.isnull().any():
    print(f"Adjusted Rand Index: {adjusted_rand_score(y, clusters):.3f}")
# The silhouette score is label-free and is always reported.
print(f"Silhouette Score: {silhouette_score(X_pca, clusters):.3f}")

# Plot Clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=clusters, palette='viridis')
plt.title("K-Means Clusters")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.show()

In [None]:
# 5. Cluster Interpretation
# Map the K-Means centers from the 2-D PCA space back into the feature space that
# PCA was fit on. This is computed once and shared by both branches below.
centers_in_feature_space = pca.inverse_transform(kmeans.cluster_centers_)

# If a variable named `scaler` exists in the kernel, also undo its scaling.
# NOTE(review): none of the cells visible above define `scaler`, so on a fresh
# Restart & Run All the else-branch is taken; a stale `scaler` left over from an
# earlier session would be picked up here - confirm this is intended.
if 'scaler' in locals():  # Only if StandardScaler was used
    cluster_centers_original = scaler.inverse_transform(centers_in_feature_space)
else:
    cluster_centers_original = centers_in_feature_space

cluster_means = pd.DataFrame(cluster_centers_original, columns=X.columns)
# The slice below takes the first five columns in file order (no importance
# ranking is computed), so the heading says "First", not "Top".
print("\nCluster Centers (First 5 Features):")
display(cluster_means.iloc[:, :5])  # Show first 5 features for brevity

# Feature Distributions by Cluster
# Attach the cluster labels to `df` so seaborn can group by them. This mutates
# `df`; the column is overwritten (not duplicated) if the cell is re-run.
df['cluster'] = clusters
plt.figure(figsize=(15, 5))
for i, col in enumerate(X.columns[:3]):  # Plot first 3 features
    # One panel per feature in a fixed 1x3 grid; draw on the panel explicitly.
    ax = plt.subplot(1, 3, i + 1)
    sns.boxplot(x='cluster', y=col, data=df, ax=ax)
    ax.set_title(f"{col} by Cluster")
plt.tight_layout()
plt.show()