# DSS data UMAP and K-means clustering

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from umap import UMAP


# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation / FutureWarning notices emitted by
# the imported libraries, so version-dependent behaviour changes go unnoticed.
warnings.filterwarnings("ignore")

In [None]:
# Loading the imputed DSS data from a CSV file (path is relative to the working directory).
# NOTE(review): later cells do arithmetic on the whole frame, so every column is
# presumably numeric -- confirm against the actual file.

dss_imputed = pd.read_csv('imputed_DSS_data.csv')


In [None]:
# Shifting the data so that every value is strictly positive (>= 1).
#
# The shift is ONE scalar derived from the global minimum of the whole frame.
# The previous `np.min(dss_imputed)` forwarded axis=None to DataFrame.min,
# which reduces per column on pandas < 2.0 but to a single scalar on
# pandas >= 2.0, so the result silently depended on the installed version.
# `.min().min()` yields the global minimum on every pandas version.

shift_value = np.abs(dss_imputed.min().min()) + 1
shifted_data = dss_imputed + shift_value

# Applying a log transformation to correct skewness.
# log1p computes log(1 + x); the inputs are already >= 1 after the shift.

log_data = np.log1p(shifted_data)

# Applying Min-Max scaling (each column rescaled to [0, 1]) so that every
# feature contributes equally to Euclidean distance calculations.

min_max_scaler = MinMaxScaler()
min_max_scaled_data = min_max_scaler.fit_transform(log_data)

# fit_transform returns a plain array, so restore the original row index and column names.
scaled_df = pd.DataFrame(min_max_scaled_data, index=dss_imputed.index, columns=dss_imputed.columns)
scaled_df

### K-means clustering - UMAP dim reduction

In [None]:
# Finding the optimal number of clusters k via silhouette scores


# UMAP dimensionality reduction: project the scaled data onto 2 components.
# The K-means runs below operate on this embedding, not on the scaled features.

umap_model = UMAP(
    n_components=2,
    n_neighbors=4,
    min_dist=0.2,
    metric="euclidean",
    random_state=42,
)
umap_embedding = umap_model.fit_transform(scaled_df)


# One average silhouette score per candidate k, in the order k = 2, 3, ..., 10
silhouette_scores = []

for k in range(2, 11):

    # fit K-means with k clusters and take the per-sample cluster labels
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(umap_embedding)

    # mean silhouette coefficient of this partition, measured on the embedding
    silhouette_avg = silhouette_score(umap_embedding, labels)
    silhouette_scores.append(silhouette_avg)

    print(f"Silhouette Score for k={k}: {silhouette_avg}")

In [None]:
# Number of clusters to use: k = 2 gives the best result

num_clusters = 2

# Fitting K-means on the UMAP embedding and keeping the per-sample cluster assignments

kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(umap_embedding)
cluster_labels = kmeans.labels_


In [None]:
# Building a copy of the scaled DataFrame with an extra 'Cluster' column that
# holds 1-based group names ('Group_1', 'Group_2', ...)

group_names = [f'Group_{label + 1}' for label in cluster_labels]
clusters = scaled_df.assign(Cluster=group_names)


In [None]:
# Scatter plot of the UMAP embedding, one series per K-means cluster

fig, ax = plt.subplots(figsize=(10, 6))

for cluster_label in range(num_clusters):
    group_name = f'Group {cluster_label + 1}'

    # rows of the embedding that were assigned to this cluster
    cluster_points = umap_embedding[cluster_labels == cluster_label]
    ax.scatter(cluster_points[:, 0], cluster_points[:, 1], label=group_name)

    # write the group name at the mean position of the cluster's points
    center_x, center_y = cluster_points.mean(axis=0)
    ax.text(center_x, center_y, group_name, fontsize=12, ha='center', va='center')

ax.set_title('Clustering of the DSS data')
ax.set_xlabel('UMAP Dimension 1')
ax.set_ylabel('UMAP Dimension 2')
ax.legend()
plt.show()
