In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, adjusted_rand_score
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [8]:
base_df = pd.read_csv('./microstates/results_separateData.csv')
base_df['activity'] = base_df['activity'].str.lower()
base_df

Unnamed: 0,name,activity,imagining,probability_A,probability_B,probability_C,probability_D,time_A,time_B,time_C,...,P_D_C,P_D_D,GEV_total,GEV_A,GEV_B,GEV_C,GEV_D,max_entropy,entropy,mc_entropy
0,P19,successful,guided,0.334737,0.250526,0.114947,0.299789,12.720,9.520,4.368,...,0.009480,0.928020,0.625059,0.208141,0.231092,0.087980,0.097846,2.0,1.908582,0.467345
1,P19,successful,self-guided,0.250867,0.314267,0.208333,0.226533,15.052,18.856,12.500,...,0.021778,0.912890,0.759168,0.162572,0.383593,0.130449,0.082553,2.0,1.982033,0.524097
2,P19,training,guided,0.296865,0.097189,0.238486,0.367459,10.984,3.596,8.824,...,0.018241,0.948220,0.683006,0.195897,0.078354,0.193590,0.215166,2.0,1.870929,0.370989
3,P19,training,self-guided,0.250200,0.221067,0.250867,0.277867,15.012,13.264,15.052,...,0.019194,0.929463,0.690942,0.177679,0.128834,0.264577,0.119851,2.0,1.995322,0.506433
4,P19,slow,guided,0.184766,0.358298,0.152085,0.304851,8.684,16.840,7.148,...,0.015634,0.927694,0.496745,0.117303,0.183319,0.078563,0.117561,2.0,1.916372,0.451174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,P20,start,self-guided,0.244000,0.257043,0.195043,0.303913,11.224,11.824,8.972,...,0.030901,0.903004,0.767544,0.176991,0.271176,0.062497,0.256879,2.0,1.982478,0.660350
356,P20,fitness,guided,0.330703,0.315459,0.169297,0.184541,12.236,11.672,6.264,...,0.019918,0.878149,0.766467,0.319659,0.311013,0.043526,0.092270,2.0,1.936716,0.623212
357,P20,fitness,self-guided,0.347267,0.280000,0.120200,0.252533,20.836,16.800,7.212,...,0.009770,0.894375,0.805037,0.392133,0.248939,0.028345,0.135620,2.0,1.912893,0.626463
358,P20,your,guided,0.267800,0.352000,0.148500,0.231700,10.712,14.080,5.940,...,0.013379,0.889944,0.770506,0.248660,0.333379,0.035835,0.152632,2.0,1.936666,0.644831


In [9]:
categorical_columns = base_df.select_dtypes(include=['object']).columns.tolist()
encoder = OneHotEncoder(sparse_output=False)

one_hot_encoded = encoder.fit_transform(base_df[categorical_columns])
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))
df_encoded = pd.concat([base_df, one_hot_df], axis=1)

df_encoded = df_encoded.drop(categorical_columns, axis=1)

In [18]:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     display(df_encoded)

In [11]:
import plotly.express as px

# Calculate the correlation matrix
correlation_matrix = df_encoded.corr()

# Create the heatmap
fig = px.imshow(correlation_matrix, 
                labels=dict(color="Correlation"),
                x=correlation_matrix.columns,
                y=correlation_matrix.columns,
                color_continuous_scale='RdBu_r',
                zmin=-1, zmax=1)

# Update layout for better readability
fig.update_layout(
    title='Correlation Matrix Heatmap',
    xaxis_title='Features',
    yaxis_title='Features',
    width=1000,
    height=800
)

# Show the plot
fig.show()

In [12]:
# Split the dataset based on the value of the 'imagining' column
df_guided = base_df[base_df['imagining'] == 'guided']
df_self_guided = base_df[base_df['imagining'] == 'self-guided']

df_guided.drop('imagining', axis=1, inplace=True)
df_self_guided.drop('imagining', axis=1, inplace=True)

df_guided.reset_index(drop=True, inplace=True)
df_self_guided.reset_index(drop=True, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## K-Means

In [13]:
import plotly.graph_objects as go

# Scale the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_encoded)

# Elbow Method
wcss = []
for i in range(1, 21):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(scaled_data)
    wcss.append(kmeans.inertia_)

# Create the plot
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(1, 21)), y=wcss, mode='lines+markers', marker=dict(color='red'), line=dict(dash='dash')))

# Update layout for better readability
fig.update_layout(
    title='Elbow Method',
    xaxis_title='Number of clusters',
    yaxis_title='Inertia',
    width=1000,
    height=600
)

# Show the plot
fig.show()

In [14]:
import plotly.graph_objects as go

# Silhouette Analysis
silhouette_scores = []
for n_clusters in range(2, 21):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(scaled_data)
    silhouette_avg = silhouette_score(scaled_data, cluster_labels)
    silhouette_scores.append(silhouette_avg)

# Create the plot
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(2, 21)), y=silhouette_scores, mode='lines+markers', marker=dict(color='red'), line=dict(dash='dash')))

# Update layout for better readability
fig.update_layout(
    title='Silhouette Analysis',
    xaxis_title='Number of clusters',
    yaxis_title='Silhouette Score',
    width=1000,
    height=600
)

# Show the plot
fig.show()


In [15]:
kmeans = KMeans(n_clusters = 8, init='k-means++')
kmeans.fit(scaled_data)
pred = kmeans.predict(scaled_data)

frame = pd.DataFrame(scaled_data)
frame['cluster'] = pred
frame['cluster'].value_counts()

cluster
1    68
5    60
4    56
2    47
0    46
3    39
6    32
7    12
Name: count, dtype: int64

In [16]:
import plotly.express as px

# Create a DataFrame for the scaled data with cluster labels
df_scaled = pd.DataFrame(scaled_data, columns=[f'feature_{i}' for i in range(scaled_data.shape[1])])
df_scaled['cluster'] = pred

# Create a scatter plot for the first two features
fig = px.scatter(df_scaled, x='feature_0', y='feature_1', color='cluster', title='Cluster Visualization')
fig.show()

In [22]:
# Add the cluster labels to the original dataframe
df_scaled['cluster'] = pred

df_scaled['cluster'].value_counts()

cluster
1    68
5    60
4    56
2    47
0    46
3    39
6    32
7    12
Name: count, dtype: int64

In [24]:
X, y = df_scaled.drop('cluster', axis=1), df_scaled['cluster']

In [25]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

In [28]:
import plotly.express as px

# Create a DataFrame for the PCA components with cluster labels
df_pca = pd.DataFrame(X_pca, columns=['PCA Component 1', 'PCA Component 2'])
df_pca['cluster'] = pred

# Create a scatter plot for the PCA components
fig = px.scatter(df_pca, x='PCA Component 1', y='PCA Component 2', color='cluster', title='Clusters visualized with PCA')
fig.show()

In [29]:
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X)

In [30]:
import plotly.express as px

# Create a DataFrame for the PCA components with cluster labels
df_tsne = pd.DataFrame(X_tsne, columns=['PCA Component 1', 'PCA Component 2'])
df_tsne['cluster'] = pred

# Create a scatter plot for the PCA components
fig = px.scatter(df_tsne, x='PCA Component 1', y='PCA Component 2', color='cluster', title='Clusters visualized with tSNE')
fig.show()

In [31]:
sil_score = silhouette_score(X, pred)
print(f"Silhouette Score: {sil_score}")

Silhouette Score: 0.06748468160048139


In [34]:
db_score = davies_bouldin_score(X, pred)
print(f"Davies-Bouldin Index: {db_score}")

Davies-Bouldin Index: 2.7432846030110665


In [35]:
ch_score = calinski_harabasz_score(X, pred)
print(f"Calinski-Harabasz Index: {ch_score}")

Calinski-Harabasz Index: 18.02670096586827


In [None]:
ari_score = adjusted_rand_score(y, pred)
print(f"Adjusted Rand Index: {ari_score}")