In [183]:
import pandas as pd
import os
import yaml
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn.manifold import MDS
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Global figure styling: 12 pt text, rendered through the PGF backend.
font = dict(size=12)
matplotlib.rc('font', **font)
matplotlib.use("pgf")

# LaTeX-related rcParams applied on top of the font size set above.
pgf_settings = {
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
}
matplotlib.rcParams.update(pgf_settings)

In [184]:
# Load the per-trial records.
df = pd.read_csv("final.csv")
# Engagement used to be the mean of `eng_qt` and `eng_tab`; there is no more
# engagement signal from the tablet, so `eng_qt` alone is used as `eng`.
# df['eng'] = df[['eng_qt', 'eng_tab']].mean(axis=1)
df['eng'] = df['eng_qt']
del df['eng_qt']

# Mean correctness and engagement per (participant, length).
# The two feature columns are selected *before* aggregating so that the mean is
# never attempted on unrelated (possibly non-numeric) columns.
features = (
    df.groupby(['participant_id', 'length'])[['correct', 'eng']]
    .mean()
    .reset_index()
)
features

Unnamed: 0,participant_id,length,correct,eng
0,03DEQR1O,3,1.0,0.5961
1,03DEQR1O,5,0.5,0.49785
2,03DEQR1O,7,0.75,0.389925
3,1CZ1CL1P,3,0.666667,0.7139
4,1CZ1CL1P,5,0.25,0.394925
5,1CZ1CL1P,7,0.0,0.5087
6,1MBU59SJ,3,0.666667,-0.469133
7,1MBU59SJ,5,1.0,-0.601225
8,1MBU59SJ,7,0.333333,-0.442033
9,1PE38CJI,3,1.0,-0.160525


In [185]:
# One row per participant: that participant's ('correct', 'eng') pairs from
# `features`, flattened into a single vector in the row order of `features`.
feature_columns = ['correct', 'eng']
user_vectors = np.array([
    features.loc[features['participant_id'] == participant, feature_columns].to_numpy().flatten()
    for participant in features['participant_id'].unique()
])
user_vectors

array([[ 1.        ,  0.5961    ,  0.5       ,  0.49785   ,  0.75      ,
         0.389925  ],
       [ 0.66666667,  0.7139    ,  0.25      ,  0.394925  ,  0.        ,
         0.5087    ],
       [ 0.66666667, -0.46913333,  1.        , -0.601225  ,  0.33333333,
        -0.44203333],
       [ 1.        , -0.160525  ,  1.        , -0.6674    ,  0.33333333,
        -0.7275    ],
       [ 1.        , -0.5534    ,  0.75      , -0.213275  ,  0.66666667,
        -0.8146    ],
       [ 1.        , -0.7723    ,  1.        , -0.2941    ,  0.75      ,
        -0.46405   ],
       [ 1.        , -0.905125  ,  1.        , -0.9792    ,  0.66666667,
        -0.95126667],
       [ 1.        ,  0.600725  ,  1.        ,  0.3014    ,  0.        ,
        -0.0469    ],
       [ 1.        , -0.575     ,  1.        ,  0.5181    ,  0.33333333,
        -0.59173333],
       [ 1.        ,  0.88815   ,  1.        ,  0.986925  ,  0.5       ,
         0.97305   ],
       [ 1.        , -0.45415   ,  1.        , -0.

In [186]:
# Embed the participant vectors in 2-D with MDS, then split them into two clusters.
# MDS starts from a random configuration, so `random_state` is fixed (as it already
# is for KMeans) to make the embedding and the clusters reproducible across runs.
embedding = MDS(n_components=2, random_state=0)
X = embedding.fit_transform(user_vectors)
clusters = KMeans(n_clusters=2, random_state=0).fit(X)

In [187]:
# Same pipeline with PCA as the 2-D projection.
# The projection must come from `embedding_pca`; calling `embedding.fit_transform`
# here would silently re-run MDS and make the PCA/MDS comparison meaningless.
embedding_pca = PCA(n_components=2)
X_pca = embedding_pca.fit_transform(user_vectors)
clusters_pca = KMeans(n_clusters=2, random_state=0).fit(X_pca)

In [188]:
# KMeans label ids are arbitrary: two fits can find the same grouping with the ids
# swapped. Compare the partitions up to relabelling: in the contingency table every
# MDS label must co-occur with exactly one PCA label, and vice versa.
label_overlap = pd.crosstab(clusters.labels_, clusters_pca.labels_) > 0
same_partition = bool(
    (label_overlap.sum(axis=0) == 1).all() and (label_overlap.sum(axis=1) == 1).all()
)
print(f"PCA gives same result as MDS: {same_partition}")

PCA gives same result as MDS: True


In [189]:
# Plotting projection of both user clusters
# The labels come from `clusters_pca` and the axes are labelled as principal
# components, so the coordinates plotted must be the PCA projection `X_pca`.
os.makedirs('output', exist_ok=True)  # savefig does not create missing directories
label_0 = [idx for idx, label in enumerate(clusters_pca.labels_) if label == 0]
label_1 = [idx for idx, label in enumerate(clusters_pca.labels_) if label == 1]
plt.figure(figsize=(3, 3), dpi=100)
plt.scatter(X_pca[label_0, 0], X_pca[label_0, 1], c='r', label=r'$C_1$')
plt.scatter(X_pca[label_1, 0], X_pca[label_1, 1], c='b', label=r'$C_2$')
plt.xlabel("FPC")
plt.ylabel("SPC")
plt.xticks([-1, 0, 1])
plt.yticks([-1, 0, 1])
plt.gca().set_aspect('equal', adjustable='box')
plt.grid(zorder=0)
plt.legend(frameon=True)
plt.savefig(os.path.join('output', "clusters.pdf"),
            bbox_inches='tight')

In [190]:
# Attach each participant's cluster label to all of their rows in `df`.
# `clusters_pca.labels_` is paired positionally with the unique participant ids of
# `features`, i.e. the same order used to build `user_vectors`.
df['cluster'] = 0
participant_order = features['participant_id'].unique()
for participant, assigned_label in zip(participant_order, clusters_pca.labels_):
    participant_rows = df['participant_id'] == participant
    df.loc[participant_rows, 'cluster'] = assigned_label

In [191]:
# Plotting statistics of both cluster users
os.makedirs('output', exist_ok=True)  # savefig does not create missing directories
df_cluster_plt = df.copy(deep=True)

# Sequence length -> difficulty level; any length outside the map gets difficulty 0.
length_difficulty_map = {3:1, 5:2, 7:3}
df_cluster_plt['difficulty'] = (
    df_cluster_plt['length'].map(length_difficulty_map).fillna(0).astype(int)
)

# Mean and standard deviation of the two plotted features per (cluster, difficulty).
# Columns are selected before aggregating so the string `participant_id` column is
# never fed to mean()/std().
grouped_stats = df_cluster_plt.groupby(['cluster', 'difficulty'])[['correct', 'eng']]
cluster_features_plot = grouped_stats.mean().reset_index()
cluster_features_std_plot = grouped_stats.std().reset_index()

fig, axes = plt.subplots(2, sharex=True, figsize=(5, 3), dpi=100)
label_map = {'correct': r'$P($success$|L)$', 'eng': 'Engagement'}
# Error bars are clipped to these lower bounds (and to 1 above) for each feature.
lower_bound = {'eng': -1, 'correct': 0}
y_ticks = {'eng': [-1, 0, 1], 'correct': [0, 0.5, 1]}

for feature, ax in zip(['eng', 'correct'], axes):
    for cluster, color in zip([0, 1], ['red', 'blue']):
        # .copy() so the helper columns below are added to an independent frame,
        # not to a slice of `cluster_features_plot`.
        plt_data = cluster_features_plot.loc[cluster_features_plot['cluster']==cluster, ['difficulty', feature]].copy()
        plt_data['std'] = cluster_features_std_plot.loc[cluster_features_std_plot['cluster']==cluster, feature]
        plt_data['min'] = (plt_data[feature] - plt_data['std']).clip(lower=lower_bound[feature])
        plt_data['max'] = (plt_data[feature] + plt_data['std']).clip(upper=1)

        # The legend label needs both '$' delimiters to be valid math text.
        ax.errorbar(plt_data['difficulty'], plt_data[feature],
                    yerr=(plt_data[feature]-plt_data['min'], plt_data['max']-plt_data[feature]),
                    fmt='--o', linewidth=1, color=color,
                    ecolor=f"tab:{color}", elinewidth=1, capsize=4, label=f"$C_{cluster+1}$")

    ax.set(ylabel=label_map[feature], xticks=[1, 2, 3], yticks=y_ticks[feature])
    ax.grid(zorder=0)
fig.supxlabel('Difficulty Level')
plt.legend(frameon=True)
plt.savefig(os.path.join('output', "clusters_statistics.pdf"),
            bbox_inches='tight')

In [192]:
# Checking if giving feedback has any effect
final_df_cpy = df.copy(deep=True)

# Change in engagement relative to the same participant's previous row.
# Differencing inside each participant avoids comparing a participant's first trial
# with the last trial of whoever precedes them in the file (or with a non-existent
# row -1). Assumes each participant's rows are stored in chronological order -- TODO confirm.
eng_change = final_df_cpy.groupby('participant_id')['eng'].diff()

# Rows without feedback keep 0.0. A feedback row with no previous trial stays NaN,
# so it is counted neither as a positive change nor in count()-based totals.
got_feedback = final_df_cpy['feedback'] > 0
final_df_cpy['eng_diff'] = eng_change.where(got_feedback, 0.0)

# Kept for anything that still refers to these names: labels of the feedback rows
# and their engagement changes.
f_id = np.array(final_df_cpy.index[got_feedback])
eng_diff = np.array(final_df_cpy.loc[f_id, 'eng_diff'])

In [193]:
# What is the probability that the change in engagement is positive after giving feedback?
# Per cluster: (rows with a positive eng_diff) / (rows where feedback was given).
report_columns = ['cluster', 'eng_diff']
positive_change_rows = final_df_cpy.loc[final_df_cpy['eng_diff'] > 0, report_columns]
feedback_rows = final_df_cpy.loc[final_df_cpy['feedback'] > 0, report_columns]
positive_per_cluster = positive_change_rows.groupby('cluster').count()
feedback_per_cluster = feedback_rows.groupby('cluster').count()
positive_per_cluster / feedback_per_cluster

Unnamed: 0_level_0,eng_diff
cluster,Unnamed: 1_level_1
0,0.477273
1,0.555556


In [194]:
# Lookup tables used to encode each trial.
# task_map: signed length (length * result) -> signed level.
task_map = {-3: -1, -5:-2, -7:-3, 3: 1, 5:2, 7:3}
# result_map: `correct` flag -> +1 / -1.
result_map = {1:1, 0:-1}
# action_map: (length, feedback) -> action id; with feedback 0 the id depends on the
# length, while feedback 1 and 2 map to 3 and 4 regardless of length.
action_map = {(3, 0): 0, (5, 0): 1, (7, 0): 2, (3, 1): 3, (5, 1): 3, (7, 1): 3, (3, 2): 4, (5, 2): 4, (7, 2): 4}

df_fin = df.copy(deep=True)

# Create the derived columns up front with a 0 placeholder.
for new_column in ('current_score', 'previous_score', 'current_result', 'action'):
    df_fin[new_column] = 0

df_fin['current_result'] = [result_map[outcome] for outcome in df_fin['correct'].tolist()]
df_fin['current_score'] = [
    task_map[trial_length * outcome]
    for trial_length, outcome in zip(df_fin['length'].tolist(), df_fin['current_result'].tolist())
]

# Rename eng -> engagement and feedback -> robot_feedback; `correct` is dropped
# because it is now encoded in current_result.
df_fin['engagement'] = df_fin['eng']
df_fin['robot_feedback'] = df_fin['feedback']
df_fin = df_fin.drop(columns=['eng', 'correct', 'feedback'])

# Add previous score and action fields
for user in df_fin['participant_id'].unique():
    user_rows = df_fin['participant_id'] == user

    # previous_score: the participant's current_score shifted down one row (0 on their first row).
    df_fin.loc[user_rows, 'previous_score'] = df_fin.loc[user_rows, 'current_score'].shift(periods=1, fill_value=0)

    # action: the action id of the participant's *next* row; their last row gets -1.
    own_actions = [
        action_map[(trial_length, feedback)]
        for trial_length, feedback in zip(df_fin.loc[user_rows, 'length'].tolist(),
                                          df_fin.loc[user_rows, 'robot_feedback'].tolist())
    ]
    actions = np.array(own_actions[1:] + [-1])
    df_fin.loc[user_rows, 'action'] = actions

output_columns = ['cluster', 'participant_id', 'secs', 'length', 'current_score', 'previous_score',
                  'current_result', 'robot_feedback', 'action', 'engagement', 'duration', 'id']
df_fin = df_fin.reindex(columns=output_columns)

In [195]:
# Display the assembled per-trial table.
df_fin

Unnamed: 0,cluster,participant_id,secs,length,current_score,previous_score,current_result,robot_feedback,action,engagement,duration,id
0,0,1CZ1CL1P,1648134409,7,-3,0,-1,0,2,1.0000,17.0509,0
1,0,1CZ1CL1P,1648134452,7,-3,-3,-1,0,3,0.3114,12.4892,1
2,0,1CZ1CL1P,1648134490,5,-2,-3,-1,1,1,0.6612,6.6176,2
3,0,1CZ1CL1P,1648134519,5,-2,-2,-1,0,4,0.6728,5.2145,3
4,0,1CZ1CL1P,1648134550,3,1,-2,1,2,0,0.6064,6.2580,4
...,...,...,...,...,...,...,...,...,...,...,...,...
195,1,1PE38CJI,1647955491,7,-3,3,-1,0,2,-0.7966,8.5898,5
196,1,1PE38CJI,1647955532,7,-3,-3,-1,0,4,-0.6516,11.3520,6
197,1,1PE38CJI,1647955578,5,2,-3,1,2,1,-0.4588,13.0154,7
198,1,1PE38CJI,1647955612,5,2,2,1,0,3,-0.7897,9.0707,8


In [196]:
# Write the table without the index. '%10.4f' writes every float with 4 decimals,
# right-aligned in a 10-character field, so float cells are space-padded.
df_fin.to_csv(
    "output/final_clustered.csv",
    sep=',',
    header=True,
    index=None,
    float_format='%10.4f',
    mode='w',
)