In [None]:
# 1. Setup & Data Loading

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score
import seaborn as sns

# Read the dataset from disk and report its dimensions as a quick sanity check
data_path = 'DARWIN_standardized.csv'  # Update path as needed
df = pd.read_csv(data_path)
print("Initial shape:", df.shape)

ModuleNotFoundError: No module named 'pandas'

In [None]:
# 2. Preprocessing

# Separate features (X) and labels (y).
# 'cluster' is dropped as well so that re-running this cell after the K-Means
# cell (which writes df['cluster']) does not leak cluster ids into the features.
# errors='ignore' keeps the cell usable on files lacking any of these columns.
X = df.drop(columns=['class', 'ID', 'cluster'], errors='ignore')  # All handwriting features

# Alzheimer's labels (if available); None when the file has no 'class' column,
# which is what the `if y is not None` checks in later cells rely on.
y = df['class'] if 'class' in df.columns else None

# Standardize each feature to zero mean / unit variance. PCA is sensitive to
# feature scale, and later cells need both `X_scaled` (the PCA input) and the
# fitted `scaler` (to map cluster centers back to the original units).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# 3. Dimensionality Reduction with PCA

# Project the scaled features onto the first two principal components
# so the samples can be drawn on a plane.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Scatter the samples in PC space; color by label only when labels are present
hue_kwargs = {'hue': y, 'palette': 'viridis'} if y is not None else {}

plt.figure(figsize=(10, 6))
ax = sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], **hue_kwargs)
ax.set_title("PCA of Handwriting Features")
# Axis labels report the share of total variance captured by each component
ax.set_xlabel(f"PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)")
ax.set_ylabel(f"PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)")
plt.show()

In [None]:
# 4. K-Means Clustering
# Determine optimal cluster count (Elbow Method)

# Candidate cluster counts, shared by the fitting loop and the plot's x-axis
k_values = range(1, 6)

inertias = []
for k in k_values:
    # n_init is pinned explicitly: its default differs between scikit-learn
    # releases (10 historically, 'auto' in newer ones), so leaving it unset
    # makes inertias and cluster assignments depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_pca)
    inertias.append(kmeans.inertia_)

plt.plot(k_values, inertias, marker='o')
plt.title("Elbow Method for Optimal K")
plt.xlabel("Number of clusters")
plt.ylabel("Inertia")
plt.show()

# Run K-Means with selected K (2 for AD vs Control)
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_pca)

# Add clusters back to dataframe (one cluster id per row, in row order)
df['cluster'] = clusters

In [None]:
# 5. Evaluation
# External check: agreement between cluster ids and ground-truth labels,
# only computable when labels exist
if y is not None:
    ari = adjusted_rand_score(y, clusters)
    print("Adjusted Rand Index:", ari)

# Internal check: silhouette needs only the points and their cluster ids, no labels
silhouette = silhouette_score(X_pca, clusters)
print("Silhouette Score:", silhouette)

# Draw the samples in PC space, colored by assigned cluster
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=clusters, palette='viridis')
ax.set_title("K-Means Clustering Results")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
plt.show()

In [None]:
# 6. Interpret Clusters

# Compare cluster means in original feature space.
# The centroids live in 2-D PC space, so undo the PCA projection first and the
# standardization second. Because only two components were kept, the result is
# an approximation of each cluster's center, not an exact per-feature mean.
centers_scaled = pca.inverse_transform(kmeans.cluster_centers_)
cluster_means = pd.DataFrame(
    scaler.inverse_transform(centers_scaled),
    columns=X.columns
)

print("Cluster Centers (Original Features):")
display(cluster_means)

# Plot key feature distributions by cluster.
# NOTE(review): these feature names are hard-coded; confirm they match the
# CSV's column headers. Names that are absent are reported and skipped so one
# missing column does not abort the whole cell.
key_features = ['air_time1', 'pressure1', 'mean_speed1']
available_features = [name for name in key_features if name in df.columns]
missing_features = [name for name in key_features if name not in df.columns]
if missing_features:
    print("Skipping features not found in the data:", missing_features)

if available_features:
    plt.figure(figsize=(15, 5))
    for i, feature in enumerate(available_features):
        # One panel per available feature, laid out in a single row
        plt.subplot(1, len(available_features), i + 1)
        sns.boxplot(x='cluster', y=feature, data=df)
        plt.title(f"{feature} by Cluster")
    plt.tight_layout()
    plt.show()

In [None]:
# 7. Save Results

# Write the dataframe (with the 'cluster' column added after K-Means) to CSV,
# omitting the pandas row index.
output_path = 'DARWIN_clustered.csv'
df.to_csv(output_path, index=False)
print("Results saved to DARWIN_clustered.csv")