In [None]:
import numpy as np
import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import umap
import hdbscan
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

In [None]:
# Example: simulate data (replace with your own).
# Builds a 200 x 5 table of uniform values in [0, 1) with the column names
# used by every later cell. A seeded generator is used so the simulated table,
# and every result derived from it, is identical on each Restart & Run All.
rng = np.random.default_rng(42)
data = pd.DataFrame(rng.random((200, 5)), columns=['AW', 'QW', 'NREM', 'IS', 'REM'])

In [None]:
# Perform UMAP on the measured features only.
# Selecting the columns explicitly keeps this cell idempotent: later cells add
# cluster-label columns to `data`, and re-running this cell must not embed them.
feature_cols = ['AW', 'QW', 'NREM', 'IS', 'REM']

# n_neighbors=15: range 2 to 100. Controls the balance between local and global structure.
# min_dist=0.1: controls the spread of points. Smaller values emphasize local structure.
# n_components=2: range 2 to 100. Reduces the data to 2D for visualization.
# random_state=42: fixes the layout so the figure is reproducible between runs
# (umap-learn gives up multi-threading when a seed is set).
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)

embedding = reducer.fit_transform(data[feature_cols])

# Convert to DataFrame so the plot can address the coordinates by name
embedding_df = pd.DataFrame(embedding, columns=['UMAP1', 'UMAP2'])

# Plot the 2-D projection
plt.figure(figsize=(4, 4))
sns.scatterplot(x='UMAP1', y='UMAP2', data=embedding_df, s=50, edgecolor='k')
plt.title('UMAP Projection of Cell Activity Data')
plt.xlabel('UMAP1')
plt.ylabel('UMAP2')
plt.show()

### HDBSCAN Clustering

In [None]:
# Perform UMAP on the measured features only.
# `data` can carry cluster-label columns once later cells have run, so the
# feature columns are selected explicitly to keep this cell re-runnable.
# The seed makes the embedding (and therefore the clusters) reproducible.
feature_cols = ['AW', 'QW', 'NREM', 'IS', 'REM']
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
embedding = reducer.fit_transform(data[feature_cols])

# Perform clustering with HDBSCAN on the 2-D embedding.
# min_cluster_size=10: smallest group of points that is reported as a cluster.
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, gen_min_span_tree=True)
labels = clusterer.fit_predict(embedding)

# Convert to DataFrame for plotting, one row per point with its cluster label
embedding_df = pd.DataFrame(embedding, columns=['UMAP1', 'UMAP2'])
embedding_df['Cluster'] = labels

# Plot the embedding coloured by cluster label
plt.figure(figsize=(4, 4))
sns.scatterplot(x='UMAP1', y='UMAP2', hue='Cluster', palette='tab20', data=embedding_df, s=50, edgecolor='k')
plt.title('UMAP Projection with HDBSCAN Clustering')
plt.legend(title='Cluster')
plt.show()

#### Evaluate HDBSCAN clustering

In [None]:
# Internal validity metrics for the current `labels` on `embedding`.
#
# Points labelled -1 are treated as noise (unassigned) rather than as a
# cluster of their own. Instead of refusing to score whenever a single noise
# point exists, the metrics are computed on the assigned points only and the
# number of excluded points is reported.
labels_arr = np.asarray(labels)
is_assigned = labels_arr != -1
assigned_labels = labels_arr[is_assigned]
assigned_points = np.asarray(embedding)[is_assigned]
n_noise = int((~is_assigned).sum())
n_found = len(set(assigned_labels))

if n_noise:
    print(f"Excluding {n_noise} noise point(s) (label -1) from the metrics.")

# All three scores need at least two clusters and fewer clusters than points.
if 1 < n_found < len(assigned_labels):
    score = silhouette_score(assigned_points, assigned_labels)
    print(f"Silhouette Score: {score:.2f}.  Notes: 1 → Well-clustered. 0 → Overlapping clusters. -1 → Misclassified samples.")

    db_score = davies_bouldin_score(assigned_points, assigned_labels)
    print(f"Davies-Bouldin Index: {db_score:.2f}.   Notes: Lower values indicate better clustering.")

    ch_score = calinski_harabasz_score(assigned_points, assigned_labels)
    print(f"Calinski-Harabasz Index: {ch_score:.2f}.    Notes: Higher values indicate better clustering.")
else:
    print("Silhouette Score cannot be calculated. Check for multiple clusters.")
    print("Davies-Bouldin Index cannot be calculated. Check for multiple clusters.")
    print("Calinski-Harabasz Index cannot be calculated. Check for multiple clusters.")

In [None]:
# Store the current `labels` (the HDBSCAN result when cells run in order)
# next to the original measurements.
# NOTE(review): this adds a column to `data` in place, so any later step that
# passes the whole frame to a model will see this label column as well.
data['ClusterHDBSCAN'] = labels

### K-means clustering

In [None]:
# Perform UMAP on the measured features only.
# By this point `data` also holds cluster-label columns written by earlier
# cells; embedding the whole frame would leak those labels into the input of
# this clustering, so the feature columns are selected explicitly.
feature_cols = ['AW', 'QW', 'NREM', 'IS', 'REM']
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2, random_state=42)
embedding = reducer.fit_transform(data[feature_cols])

# Perform clustering with K-Means on the 2-D embedding.
# Seeded so the centroid initialisation, and hence the labels, are reproducible.
kmeans = KMeans(n_clusters=5, random_state=42)
labels = kmeans.fit_predict(embedding)

# Visualize clusters: one row per point with its K-Means label
embedding_df = pd.DataFrame(embedding, columns=['UMAP1', 'UMAP2'])
embedding_df['Cluster'] = labels

plt.figure(figsize=(4, 4))
sns.scatterplot(x='UMAP1', y='UMAP2', hue='Cluster', palette='tab10', data=embedding_df, s=50, edgecolor='k')
plt.title('UMAP Projection with K-Means Clustering')
plt.legend(title='Cluster')
plt.show()

#### Evaluate K-means clustering

In [None]:
# Internal validity metrics for the current `labels` on `embedding`.
# The same precondition gates all three scores: more than one distinct label
# and no -1 label present. It is evaluated once and reused.
distinct_labels = set(labels)
can_score = len(distinct_labels) > 1 and -1 not in distinct_labels

if not can_score:
    print("Silhouette Score cannot be calculated. Check for multiple clusters.")
    print("Davies-Bouldin Index cannot be calculated. Check for multiple clusters.")
    print("Calinski-Harabasz Index cannot be calculated. Check for multiple clusters.")
else:
    # Silhouette: cohesion versus separation, bounded in [-1, 1]
    score = silhouette_score(embedding, labels)
    print(f"Silhouette Score: {score:.2f}.  Notes: 1 → Well-clustered. 0 → Overlapping clusters. -1 → Misclassified samples.")

    # Davies-Bouldin: lower is better
    db_score = davies_bouldin_score(embedding, labels)
    print(f"Davies-Bouldin Index: {db_score:.2f}.   Notes: Lower values indicate better clustering.")

    # Calinski-Harabasz: higher is better
    ch_score = calinski_harabasz_score(embedding, labels)
    print(f"Calinski-Harabasz Index: {ch_score:.2f}.    Notes: Higher values indicate better clustering.")

In [None]:
# Store the current `labels` (the K-Means result when cells run in order)
# next to the original measurements, then preview the first rows.
# NOTE(review): this adds a column to `data` in place, so any later step that
# passes the whole frame to a model will see this label column as well.
data['ClusterKmeans'] = labels
print(data.head())

### Spectral clustering in 3D

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Kept for older Matplotlib versions that need this import to register the '3d' projection.
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import SpectralClustering


# Columns that are clustered, and the three of them shown on the 3-D axes.
# `data` also holds cluster-label columns written by earlier cells; those must
# not be standardized and clustered together with the measurements.
feature_cols = ['AW', 'QW', 'NREM', 'IS', 'REM']
plot_cols = ['AW', 'NREM', 'REM']

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(data[feature_cols])

# Apply Spectral Clustering (seeded so the labels are reproducible)
n_clusters = 5
spectral = SpectralClustering(n_clusters=n_clusters, affinity='nearest_neighbors', random_state=42)
# The model has to be fitted before anything can be coloured by cluster.
spectral_labels = spectral.fit_predict(X_scaled)

# Plot the clustered data in 3D
fig = plt.figure(figsize=(4, 4))
ax = fig.add_subplot(111, projection='3d')

# Plot data points, coloured by their spectral cluster label
scatter = ax.scatter(data['AW'], data['NREM'], data['REM'],
                     c=spectral_labels, s=20, cmap='plasma', alpha=0.6, edgecolor='k')

# Plot cluster centers (approximation): per-cluster means of the plotted
# columns, taken in the same order as the axes so each 'x' sits inside its cloud.
centers = np.array([data.loc[spectral_labels == i, plot_cols].mean().values for i in range(n_clusters)])
ax.scatter(centers[:, 0], centers[:, 1], centers[:, 2],
           c='red', s=100, alpha=0.9, marker='x', label='Centroids')

# Add labels and title (axis names match the plotted columns)
ax.set_xlabel('AW')
ax.set_ylabel('NREM')
ax.set_zlabel('REM')
ax.set_title('Spectral Clustering in 3D')
ax.legend()
plt.tight_layout()

ax.view_init(30, 10)
ax.set_xlim(0, 1)  # Example limits for x-axis
ax.set_ylim(0, 1)  # Example limits for y-axis
ax.set_zlim(0, 1)  # Example limits for z-axis

# Show the plot
plt.show()

In [None]:
# Silhouette score for the spectral clustering.
# Fit on the standardized measurements only: `data` also contains cluster-label
# columns from earlier cells, which must not be used as clustering input.
feature_cols = ['AW', 'QW', 'NREM', 'IS', 'REM']
features_scaled = StandardScaler().fit_transform(data[feature_cols])
labels = spectral.fit_predict(features_scaled)

# Scored in the three plotted dimensions. The frame has no 'Wake' column; the
# wake measurement shown on the 3-D plot is 'AW'.
# NOTE(review): this evaluates the clusters in a 3-column subspace rather than
# the 5-feature space they were fitted in -- confirm that this is intended.
silhouette_avg = silhouette_score(data[['AW', 'NREM', 'REM']], labels)
print(f'Silhouette Score: {silhouette_avg:.2f}') #scores range from -1 to +1, with higher values indicating better-defined clusters.

In [None]:
# Store the spectral cluster assignment next to the original measurements.
# The input is rebuilt from the measured columns so that cluster-label columns
# already present in `data` are never part of what gets clustered.
# NOTE(review): this refits the model; with an unseeded estimator the labels
# stored here may differ from the ones scored in the previous cell.
feature_cols = ['AW', 'QW', 'NREM', 'IS', 'REM']
data['SpectralCluster'] = spectral.fit_predict(StandardScaler().fit_transform(data[feature_cols]))