In [None]:
# velocitymask_clustering_statsbased_1000samples.ipynb
# Jupyter notebook content for clustering 1000 velocity samples based on 100 statistical+temporal features

import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import umap.umap_ as umap
import seaborn as sns

In [None]:

# Paths to precomputed feature files (absolute Windows paths; adjust for your environment).
feature_file = r"Y:\Projects\DeepFlow\deepFlowDocker\scripts\Registration\output/velocity_features_patch_1000.npy"
id_file = r"Y:\Projects\DeepFlow\deepFlowDocker\scripts\Registration\output/velocity_ids_patch_1000.txt"

# Load the feature matrix and the sample IDs (one ID per line of the text file).
X = np.load(feature_file)
with open(id_file, "r") as f:
    # Skip blank lines (e.g. a trailing newline) so they do not turn into empty IDs.
    ids = [line.strip() for line in f if line.strip()]

# Fail early if IDs and feature rows are misaligned; otherwise the mismatch would
# only surface when the IDs are paired with the cluster labels for export.
if len(ids) != X.shape[0]:
    raise ValueError(
        f"ID count ({len(ids)}) does not match number of feature rows ({X.shape[0]})"
    )

print("Loaded feature matrix shape:", X.shape)

In [None]:

# Dimensionality reduction: keep the 10 leading principal components of X.
pca = PCA(random_state=42, n_components=10)
X_pca = pca.fit_transform(X)

# Report the share of variance captured by each of the first five components.
leading_ratios = pca.explained_variance_ratio_[:5]
print("Explained variance ratio (top 5):", leading_ratios)


In [None]:

# KMeans clustering (4 clusters) on the PCA coordinates.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (and emits a FutureWarning in 1.2/1.3), so leaving it
# unset makes the resulting labels depend on the installed version.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
labels = kmeans.fit_predict(X_pca)

# Silhouette score of the partition, evaluated on the same PCA coordinates.
score = silhouette_score(X_pca, labels)
print("Silhouette Score:", round(score, 4))



In [None]:
# Embed the PCA coordinates into 2-D with UMAP (seeded for repeatability).
reducer = umap.UMAP(random_state=42)
X_umap = reducer.fit_transform(X_pca)

# Scatter plot of the embedding, coloured by KMeans cluster label.
fig, ax = plt.subplots(figsize=(7, 6))
ax.scatter(X_umap[:, 0], X_umap[:, 1], c=labels, cmap='tab10', s=40)
ax.set_title("UMAP of 1000 Samples (Statistical + Temporal Features)")
ax.set_xlabel("UMAP-1")
ax.set_ylabel("UMAP-2")
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Build a feature table (columns are integer feature indices) and attach the
# KMeans label of each sample as a 'cluster' column. Nothing is written to disk here.
df = pd.DataFrame(X).assign(cluster=labels)

# Per-cluster means, shown for the first ten feature columns only.
cluster_means = df.groupby('cluster').mean()
print("\n=== Per-cluster feature means (first 10 features) ===")
print(cluster_means.iloc[:, :10].round(2))

# Reshape a handful of features to long format for a grouped box plot.
selected_features = [0, 1, 2, 90, 91, 92]  # e.g. mean/std/skew of frame 0 + derivatives
# Keep only the selected indices that actually exist as integer columns.
present_features = [
    col for col in df.columns
    if isinstance(col, int) and col in selected_features
]
df_melt = (
    df[present_features + ['cluster']]
    .rename(columns={i: f'F{i}' for i in selected_features})
    .melt(id_vars='cluster', var_name='feature', value_name='value')
)

# One box per (feature, cluster) pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_melt, x='feature', y='value', hue='cluster', ax=ax)
ax.set_title("Feature Distributions by Cluster")
fig.tight_layout()
plt.show()

In [None]:
# One-way ANOVA per selected feature, using the clusters as groups.
from scipy.stats import f_oneway

print("\n=== ANOVA Results ===")
# groupby iterates cluster labels in sorted order, so the groups are passed to
# f_oneway in ascending cluster order.
by_cluster = df.groupby('cluster')
for feat in selected_features:
    groups = [series.values for _, series in by_cluster[feat]]
    f_stat, p_val = f_oneway(*groups)
    print(f"Feature {feat:>3}: F = {f_stat:.3f}, p = {p_val:.4e}")

In [None]:
# Focused clustering on a small subset of feature columns.
# NOTE: the feature list is hard-coded here; it is not derived programmatically
# from the ANOVA output.
focused_features = [0, 1, 90, 91, 92]
X_focus = df[focused_features].values

# Project the subset onto two principal components before clustering.
pca_focus = PCA(n_components=2, random_state=42)
X_focus_pca = pca_focus.fit_transform(X_focus)

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it unset makes the labels version-dependent.
kmeans_focus = KMeans(n_clusters=3, n_init=10, random_state=42)
labels_focus = kmeans_focus.fit_predict(X_focus_pca)

# Silhouette score evaluated on the same 2-D PCA coordinates used for clustering.
score_focus = silhouette_score(X_focus_pca, labels_focus)
print("Focused Silhouette Score:", round(score_focus, 4))

# 2-D UMAP embedding of the PCA coordinates, for visualisation only.
X_focus_umap = umap.UMAP(random_state=42).fit_transform(X_focus_pca)

# Scatter plot of the embedding, coloured by the focused cluster labels.
plt.figure(figsize=(7, 6))
plt.scatter(X_focus_umap[:, 0], X_focus_umap[:, 1], c=labels_focus, cmap='tab10', s=40)
plt.title("UMAP (Focused on Derivative Features)")
plt.xlabel("UMAP-1")
plt.ylabel("UMAP-2")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
from sklearn.feature_selection import f_classif

# Compute ANOVA F-scores between every feature column and the cluster labels.
# NOTE(review): the labels come from clustering these same features, so the
# selection below is circular by construction; treat the resulting silhouette
# score as exploratory rather than as independent validation.
X_raw = df.drop(columns='cluster').values
f_scores, _ = f_classif(X_raw, df['cluster'].values)

# Select indices of the top 15 features.
# f_classif can yield NaN (e.g. for constant columns) and np.argsort places NaN
# last, so ranking the raw scores would pick NaN-scored features as the "top"
# ones. Rank NaN as -inf so such features are chosen only as a last resort.
ranking_scores = np.where(np.isnan(f_scores), -np.inf, f_scores)
top15_indices = np.argsort(ranking_scores)[-15:]

# Subset the data to the selected columns.
X_top = X_raw[:, top15_indices]

# Project onto two principal components, then cluster.
pca_top = PCA(n_components=2, random_state=42)
X_top_pca = pca_top.fit_transform(X_top)

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so leaving it unset makes the labels version-dependent.
kmeans_top = KMeans(n_clusters=3, n_init=10, random_state=42)
labels_top = kmeans_top.fit_predict(X_top_pca)

# Silhouette score evaluated on the same 2-D PCA coordinates used for clustering.
score_top = silhouette_score(X_top_pca, labels_top)
print("Top 15 feature Silhouette Score:", round(score_top, 4))

# UMAP projection of the PCA coordinates, for visualisation only.
X_top_umap = umap.UMAP(random_state=42).fit_transform(X_top_pca)

# Scatter plot of the embedding, coloured by the top-15 cluster labels.
plt.figure(figsize=(7, 6))
plt.scatter(X_top_umap[:, 0], X_top_umap[:, 1], c=labels_top, cmap='tab10', s=40)
plt.title("UMAP (Top 15 Features by ANOVA F-score)")
plt.xlabel("UMAP-1")
plt.ylabel("UMAP-2")
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:

# Save the sample-ID -> cluster assignment from the main (4-cluster) KMeans run.
output_csv = "../output/velocity_clustering_statsbased_results.csv"

# Use a dedicated name instead of rebinding `df`: the earlier analysis cells
# expect `df` to hold the feature columns, and overwriting it would make them
# fail when re-run after this cell.
results_df = pd.DataFrame({
    "sample_id": ids,
    "cluster": labels
})

# Create the output directory if it is missing so to_csv does not fail.
os.makedirs(os.path.dirname(output_csv), exist_ok=True)
results_df.to_csv(output_csv, index=False)
print(f"Saved clustering results to {output_csv}")
