In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [5]:
base_df = pd.read_csv('./microstates/results_separateData.csv')

In [6]:
base_df['activity'] = base_df['activity'].str.lower()

In [7]:
base_df

Unnamed: 0,name,activity,imagining,probability_A,probability_B,probability_C,probability_D,time_A,time_B,time_C,...,P_D_C,P_D_D,GEV_total,GEV_A,GEV_B,GEV_C,GEV_D,max_entropy,entropy,mc_entropy
0,P19,successful,guided,0.334737,0.250526,0.114947,0.299789,12.720,9.520,4.368,...,0.009480,0.928020,0.625059,0.208141,0.231092,0.087980,0.097846,2.0,1.908582,0.467345
1,P19,successful,self-guided,0.250867,0.314267,0.208333,0.226533,15.052,18.856,12.500,...,0.021778,0.912890,0.759168,0.162572,0.383593,0.130449,0.082553,2.0,1.982033,0.524097
2,P19,training,guided,0.296865,0.097189,0.238486,0.367459,10.984,3.596,8.824,...,0.018241,0.948220,0.683006,0.195897,0.078354,0.193590,0.215166,2.0,1.870929,0.370989
3,P19,training,self-guided,0.250200,0.221067,0.250867,0.277867,15.012,13.264,15.052,...,0.019194,0.929463,0.690942,0.177679,0.128834,0.264577,0.119851,2.0,1.995322,0.506433
4,P19,slow,guided,0.184766,0.358298,0.152085,0.304851,8.684,16.840,7.148,...,0.015634,0.927694,0.496745,0.117303,0.183319,0.078563,0.117561,2.0,1.916372,0.451174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,P20,start,self-guided,0.244000,0.257043,0.195043,0.303913,11.224,11.824,8.972,...,0.030901,0.903004,0.767544,0.176991,0.271176,0.062497,0.256879,2.0,1.982478,0.660350
356,P20,fitness,guided,0.330703,0.315459,0.169297,0.184541,12.236,11.672,6.264,...,0.019918,0.878149,0.766467,0.319659,0.311013,0.043526,0.092270,2.0,1.936716,0.623212
357,P20,fitness,self-guided,0.347267,0.280000,0.120200,0.252533,20.836,16.800,7.212,...,0.009770,0.894375,0.805037,0.392133,0.248939,0.028345,0.135620,2.0,1.912893,0.626463
358,P20,your,guided,0.267800,0.352000,0.148500,0.231700,10.712,14.080,5.940,...,0.013379,0.889944,0.770506,0.248660,0.333379,0.035835,0.152632,2.0,1.936666,0.644831


In [8]:
categorical_columns = base_df.select_dtypes(include=['object']).columns.tolist()
encoder = OneHotEncoder(sparse_output=False)

one_hot_encoded = encoder.fit_transform(base_df[categorical_columns])
one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_columns))
df_encoded = pd.concat([base_df, one_hot_df], axis=1)

df_encoded = df_encoded.drop(categorical_columns, axis=1)

In [9]:
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#     display(df_encoded)

In [10]:
import plotly.express as px

# Calculate the correlation matrix
correlation_matrix = df_encoded.corr()

# Create the heatmap
fig = px.imshow(correlation_matrix, 
                labels=dict(color="Correlation"),
                x=correlation_matrix.columns,
                y=correlation_matrix.columns,
                color_continuous_scale='RdBu_r',
                zmin=-1, zmax=1)

# Update layout for better readability
fig.update_layout(
    title='Correlation Matrix Heatmap',
    xaxis_title='Features',
    yaxis_title='Features',
    width=1000,
    height=800
)

# Show the plot
fig.show()

## K-Means

In [11]:
import plotly.graph_objects as go

# Scale the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_encoded)

# Elbow Method
wcss = []
for i in range(1, 21):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(scaled_data)
    wcss.append(kmeans.inertia_)

# Create the plot
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(1, 21)), y=wcss, mode='lines+markers', marker=dict(color='red'), line=dict(dash='dash')))

# Update layout for better readability
fig.update_layout(
    title='Elbow Method',
    xaxis_title='Number of clusters',
    yaxis_title='Inertia',
    width=1000,
    height=600
)

# Show the plot
fig.show()

In [12]:
import plotly.graph_objects as go

# Silhouette Analysis
silhouette_scores = []
for n_clusters in range(2, 21):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = kmeans.fit_predict(scaled_data)
    silhouette_avg = silhouette_score(scaled_data, cluster_labels)
    silhouette_scores.append(silhouette_avg)

# Create the plot
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(2, 21)), y=silhouette_scores, mode='lines+markers', marker=dict(color='red'), line=dict(dash='dash')))

# Update layout for better readability
fig.update_layout(
    title='Silhouette Analysis',
    xaxis_title='Number of clusters',
    yaxis_title='Silhouette Score',
    width=1000,
    height=600
)

# Show the plot
fig.show()


In [13]:
kmeans = KMeans(n_clusters = 8, init='k-means++')
kmeans.fit(scaled_data)
pred = kmeans.predict(scaled_data)

frame = pd.DataFrame(scaled_data)
frame['cluster'] = pred
frame['cluster'].value_counts()

cluster
3    77
4    62
2    60
7    43
1    39
0    32
6    32
5    15
Name: count, dtype: int64

In [14]:
import plotly.express as px

# Create a DataFrame for the scaled data with cluster labels
df_scaled = pd.DataFrame(scaled_data, columns=[f'feature_{i}' for i in range(scaled_data.shape[1])])
df_scaled['cluster'] = pred

# Create a scatter plot for the first two features
fig = px.scatter(df_scaled, x='feature_0', y='feature_1', color='cluster', title='Cluster Visualization')
fig.show()

## Split dataset by imagining value

In [15]:
base_df

Unnamed: 0,name,activity,imagining,probability_A,probability_B,probability_C,probability_D,time_A,time_B,time_C,...,P_D_C,P_D_D,GEV_total,GEV_A,GEV_B,GEV_C,GEV_D,max_entropy,entropy,mc_entropy
0,P19,successful,guided,0.334737,0.250526,0.114947,0.299789,12.720,9.520,4.368,...,0.009480,0.928020,0.625059,0.208141,0.231092,0.087980,0.097846,2.0,1.908582,0.467345
1,P19,successful,self-guided,0.250867,0.314267,0.208333,0.226533,15.052,18.856,12.500,...,0.021778,0.912890,0.759168,0.162572,0.383593,0.130449,0.082553,2.0,1.982033,0.524097
2,P19,training,guided,0.296865,0.097189,0.238486,0.367459,10.984,3.596,8.824,...,0.018241,0.948220,0.683006,0.195897,0.078354,0.193590,0.215166,2.0,1.870929,0.370989
3,P19,training,self-guided,0.250200,0.221067,0.250867,0.277867,15.012,13.264,15.052,...,0.019194,0.929463,0.690942,0.177679,0.128834,0.264577,0.119851,2.0,1.995322,0.506433
4,P19,slow,guided,0.184766,0.358298,0.152085,0.304851,8.684,16.840,7.148,...,0.015634,0.927694,0.496745,0.117303,0.183319,0.078563,0.117561,2.0,1.916372,0.451174
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,P20,start,self-guided,0.244000,0.257043,0.195043,0.303913,11.224,11.824,8.972,...,0.030901,0.903004,0.767544,0.176991,0.271176,0.062497,0.256879,2.0,1.982478,0.660350
356,P20,fitness,guided,0.330703,0.315459,0.169297,0.184541,12.236,11.672,6.264,...,0.019918,0.878149,0.766467,0.319659,0.311013,0.043526,0.092270,2.0,1.936716,0.623212
357,P20,fitness,self-guided,0.347267,0.280000,0.120200,0.252533,20.836,16.800,7.212,...,0.009770,0.894375,0.805037,0.392133,0.248939,0.028345,0.135620,2.0,1.912893,0.626463
358,P20,your,guided,0.267800,0.352000,0.148500,0.231700,10.712,14.080,5.940,...,0.013379,0.889944,0.770506,0.248660,0.333379,0.035835,0.152632,2.0,1.936666,0.644831


In [16]:
base_df['activity'].unique()

array(['successful', 'training', 'slow', 'start', 'fitness', 'your'],
      dtype=object)

In [25]:
# Split the dataset based on the value of the 'imagining' column
df_guided = base_df[base_df['imagining'] == 'guided']
df_self_guided = base_df[base_df['imagining'] == 'self-guided']

df_guided.drop('imagining', axis=1, inplace=True)
df_self_guided.drop('imagining', axis=1, inplace=True)

df_guided.reset_index(drop=True, inplace=True)
df_self_guided.reset_index(drop=True, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [26]:
df_guided_successful = df_guided[df_guided['activity'] == 'successful']
df_guided_training = df_guided[df_guided['activity'] == 'training']
df_guided_slow = df_guided[df_guided['activity'] == 'slow']
df_guided_start = df_guided[df_guided['activity'] == 'start']
df_guided_fitness = df_guided[df_guided['activity'] == 'fitness']
df_guided_your = df_guided[df_guided['activity'] == 'your']

df_self_guided_successful = df_self_guided[df_self_guided['activity'] == 'successful']
df_self_guided_training = df_self_guided[df_self_guided['activity'] == 'training']
df_self_guided_slow = df_self_guided[df_self_guided['activity'] == 'slow']
df_self_guided_start = df_self_guided[df_self_guided['activity'] == 'start']
df_self_guided_fitness = df_self_guided[df_self_guided['activity'] == 'fitness']
df_self_guided_your = df_self_guided[df_self_guided['activity'] == 'your']

df_guided_successful.drop('activity', axis=1, inplace=True)
df_guided_training.drop('activity', axis=1, inplace=True)
df_guided_slow.drop('activity', axis=1, inplace=True)
df_guided_start.drop('activity', axis=1, inplace=True)
df_guided_fitness.drop('activity', axis=1, inplace=True)
df_guided_your.drop('activity', axis=1, inplace=True)
df_self_guided_successful.drop('activity', axis=1, inplace=True)
df_self_guided_training.drop('activity', axis=1, inplace=True)
df_self_guided_slow.drop('activity', axis=1, inplace=True)
df_self_guided_start.drop('activity', axis=1, inplace=True)
df_self_guided_fitness.drop('activity', axis=1, inplace=True)
df_self_guided_your.drop('activity', axis=1, inplace=True)


df_guided_successful.reset_index(drop=True, inplace=True)
df_guided_training.reset_index(drop=True, inplace=True)
df_guided_slow.reset_index(drop=True, inplace=True)
df_guided_start.reset_index(drop=True, inplace=True)
df_guided_fitness.reset_index(drop=True, inplace=True)
df_guided_your.reset_index(drop=True, inplace=True)
df_self_guided_successful.reset_index(drop=True, inplace=True)
df_self_guided_training.reset_index(drop=True, inplace=True)
df_self_guided_slow.reset_index(drop=True, inplace=True)
df_self_guided_start.reset_index(drop=True, inplace=True)
df_self_guided_fitness.reset_index(drop=True, inplace=True)
df_self_guided_your.reset_index(drop=True, inplace=True)




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/i

## Compare corr by imagining

### Guided

In [27]:
categorical_columns_guided = df_guided.select_dtypes(include=['object']).columns.tolist()
encoder_gd = OneHotEncoder(sparse_output=False)

one_hot_encoded_gd = encoder_gd.fit_transform(df_guided[categorical_columns_guided])
one_hot_df_guided = pd.DataFrame(one_hot_encoded_gd, columns=encoder_gd.get_feature_names_out(categorical_columns_guided))
df_encoded_guided = pd.concat([df_guided, one_hot_df_guided], axis=1)

df_encoded_guided = df_encoded_guided.drop(categorical_columns_guided, axis=1)

In [33]:
import plotly.express as px

# Calculate the correlation matrix
correlation_matrix_guided = df_encoded_guided.corr()

# Create the heatmap
fig = px.imshow(correlation_matrix, 
                labels=dict(color="Correlation"),
                x=correlation_matrix.columns,
                y=correlation_matrix.columns,
                color_continuous_scale='RdBu_r',
                zmin=-1, zmax=1)

# Update layout for better readability
fig.update_layout(
    title='Correlation Matrix Heatmap',
    xaxis_title='Features',
    yaxis_title='Features',
    width=1000,
    height=800
)

# Show the plot
fig.show()

### Self-guided

In [34]:
categorical_columns_self_guided = df_self_guided.select_dtypes(include=['object']).columns.tolist()
encoder_sgd = OneHotEncoder(sparse_output=False)

one_hot_encoded_sgd = encoder_sgd.fit_transform(df_self_guided[categorical_columns_self_guided])
one_hot_df_self_guided = pd.DataFrame(one_hot_encoded_sgd, columns=encoder_sgd.get_feature_names_out(categorical_columns_self_guided))
df_encoded_self_guided = pd.concat([df_self_guided, one_hot_df_self_guided], axis=1)

df_encoded_self_guided = df_encoded_self_guided.drop(categorical_columns_self_guided, axis=1)

In [35]:
import plotly.express as px

# Calculate the correlation matrix
correlation_matrix_self_guided = df_encoded_self_guided.corr()

# Create the heatmap
fig = px.imshow(correlation_matrix, 
                labels=dict(color="Correlation"),
                x=correlation_matrix.columns,
                y=correlation_matrix.columns,
                color_continuous_scale='RdBu_r',
                zmin=-1, zmax=1)

# Update layout for better readability
fig.update_layout(
    title='Correlation Matrix Heatmap',
    xaxis_title='Features',
    yaxis_title='Features',
    width=1000,
    height=800
)

# Show the plot
fig.show()

In [None]:
# Calculate the absolute difference between the correlation matrices
correlation_diff = abs(correlation_matrix_guided - correlation_matrix_self_guided)

# Display the absolute difference matrix
correlation_diff

Unnamed: 0,probability_A,probability_B,probability_C,probability_D,time_A,time_B,time_C,time_D,A_min,A_max,...,name_P27,name_P28,name_P29,name_P30,activity_fitness,activity_slow,activity_start,activity_successful,activity_training,activity_your
probability_A,0.000000,0.118581,0.122963,0.187787,0.004248,0.105085,0.123828,0.177212,0.073801,0.042322,...,0.042842,0.036042,0.084381,0.016857,0.044337,0.031964,0.043448,0.005136,0.042334,0.008351
probability_B,0.118581,0.000000,0.117728,0.065155,0.139395,0.013360,0.079328,0.040012,0.117202,0.110011,...,0.034627,0.051432,0.066254,0.062265,0.169230,0.110463,0.133979,0.053057,0.038278,0.101411
probability_C,0.122963,0.117728,0.000000,0.015509,0.108562,0.087796,0.013832,0.023520,0.152182,0.047057,...,0.166969,0.082876,0.059359,0.063526,0.013567,0.059553,0.082789,0.033805,0.044670,0.115278
probability_D,0.187787,0.065155,0.015509,0.000000,0.153197,0.089158,0.058645,0.007592,0.091199,0.069540,...,0.147567,0.067251,0.036736,0.013737,0.185655,0.073679,0.114274,0.089706,0.128107,0.008436
time_A,0.004248,0.139395,0.108562,0.153197,0.000000,0.120632,0.114655,0.157343,0.081568,0.032632,...,0.035814,0.024235,0.090118,0.015023,0.108438,0.058671,0.450748,0.133300,0.185838,0.081842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
activity_slow,0.031964,0.110463,0.059553,0.073679,0.058671,0.009632,0.180692,0.172054,0.030185,0.221588,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
activity_start,0.043448,0.133979,0.082789,0.114274,0.450748,0.504461,0.292901,0.319340,0.105070,0.212574,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
activity_successful,0.005136,0.053057,0.033805,0.089706,0.133300,0.168425,0.157017,0.048301,0.014453,0.024260,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
activity_training,0.042334,0.038278,0.044670,0.128107,0.185838,0.178828,0.196150,0.044068,0.014453,0.027917,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [37]:
import plotly.express as px

# Create the heatmap for the absolute difference between the correlation matrices
fig = px.imshow(correlation_diff, 
                labels=dict(color="Absolute Correlation Difference"),
                x=correlation_diff.columns,
                y=correlation_diff.index,
                color_continuous_scale='RdBu_r',
                zmin=0, zmax=1)

# Update layout for better readability
fig.update_layout(
    title='Absolute Difference in Correlation Matrices',
    xaxis_title='Features',
    yaxis_title='Features',
    width=1000,
    height=800
)

# Show the plot
fig.show()