In [None]:
'''
    Copyright 2022 by Michał Stolarz <michal.stolarz@h-brs.de>

    This file is part of migrave_personalised_behaviour_model.
    It creates the user vectors, projects them onto a 2D space, clusters them into two groups, saves the results to a CSV file and plots them.

    migrave_personalised_behaviour_model is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    migrave_personalised_behaviour_model is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.
    You should have received a copy of the GNU Affero General Public License
    along with migrave_personalised_behaviour_model. If not, see <http://www.gnu.org/licenses/>.
'''

import pandas as pd
import os
import yaml
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from sklearn.manifold import MDS
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Global typography for every figure produced by this notebook.
font = {'size': 12}
matplotlib.rc('font', **font)

# Switch to the pgf backend and configure it to typeset text with pdflatex.
matplotlib.use("pgf")
pgf_settings = {
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
}
matplotlib.rcParams.update(pgf_settings)

# Shared scaler instance, fitted later on the user vectors.
scaler = StandardScaler()

In [None]:
# Load the recorded data; the engagement column is stored as 'eng_qt' in the
# file and is called 'eng' in the rest of this notebook.
df = pd.read_csv("final.csv")
df['eng'] = df.pop('eng_qt')

# Mean success rate and engagement per participant and sequence length.
# The two feature columns are selected *before* aggregating so that unrelated
# (possibly non-numeric) columns are never averaged; averaging them raises a
# TypeError in pandas >= 2.0.
features = (
    df.groupby(['participant_id', 'length'])[['correct', 'eng']]
    .mean()
    .reset_index()
)
features

In [None]:
# Creating user vectors: for every participant, the ('correct', 'eng') rows of
# `features` are flattened into a single 1-D vector.
participant_ids = features['participant_id'].unique()
user_vectors = np.array([
    features.loc[features['participant_id'] == pid, ['correct', 'eng']].to_numpy().ravel()
    for pid in participant_ids
])
user_vectors

In [None]:
# Standardise every component of the user vectors with the shared scaler.
scaler.fit(user_vectors)
user_vectors_scaled = scaler.transform(user_vectors)

In [None]:
# Projecting the user vectors into 2D space with the use of MDS and clustering.
# MDS starts from a random initial configuration, so the seed is fixed to make
# the embedding (and the PCA-vs-MDS comparison that follows) reproducible.
embedding_mds = MDS(n_components=2, random_state=0)
X_mds = embedding_mds.fit_transform(user_vectors_scaled)
clusters_mds = KMeans(n_clusters=2, random_state=0).fit(X_mds)

In [None]:
# Projecting the user vectors into 2D space with the use of PCA and clustering:
# keep the first two principal components, then split them into two groups
# with a seeded K-means.
embedding_pca = PCA(n_components=2)
X_pca = embedding_pca.fit_transform(user_vectors_scaled)

clusters_pca = KMeans(n_clusters=2, random_state=0)
clusters_pca.fit(X_pca)

In [None]:
# K-means numbers its clusters arbitrarily, so two identical partitions can
# come back with the labels 0 and 1 swapped. The comparison therefore accepts
# either an exact match or a match after flipping the (binary) labels.
labels_mds = clusters_mds.labels_
labels_pca = clusters_pca.labels_
same_partition = bool(
    np.array_equal(labels_mds, labels_pca)
    or np.array_equal(labels_mds, 1 - labels_pca)
)
print(f"PCA gives same result as MDS: {same_partition}")

In [None]:
# Plotting projection of both user clusters
# Row indices of the users assigned to each PCA-based cluster.
label_0 = [idx for idx, label in enumerate(clusters_pca.labels_) if label == 0]
label_1 = [idx for idx, label in enumerate(clusters_pca.labels_) if label == 1]

fig, ax = plt.subplots(figsize=(3, 3), dpi=100)
ax.scatter(X_pca[label_0, 0], X_pca[label_0, 1], c='r', label=r'$C_1$')
ax.scatter(X_pca[label_1, 0], X_pca[label_1, 1], c='b', label=r'$C_2$')
ax.set_xlabel("FPC")
ax.set_ylabel("SPC")
#plt.xticks([-2, -1, 0, 1, 2])
#plt.yticks([-2, -1, 0, 1, 2])
ax.set_aspect('equal', adjustable='box')
ax.grid(zorder=0)
ax.legend(frameon=True)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('output', exist_ok=True)
fig.savefig(os.path.join('output', "clusters.pdf"),
            bbox_inches='tight')

In [None]:
# Assigning user to cluster: the i-th participant in `features` received the
# i-th K-means label. Rows whose participant has no label keep cluster 0.
cluster_by_user = dict(zip(features['participant_id'].unique(), clusters_pca.labels_))
df['cluster'] = df['participant_id'].map(cluster_by_user).fillna(0).astype('int64')

In [None]:
# Plotting statistics of both cluster users
df_cluster_plt = df.copy(deep=True)
length_difficulty_map = {3:1, 5:2, 7:3}
# Sequence length -> difficulty level; lengths missing from the map keep 0.
df_cluster_plt['difficulty'] = (
    df_cluster_plt['length'].map(length_difficulty_map).fillna(0).astype(int)
)

# Select the two feature columns before aggregating so that other (possibly
# non-numeric) columns are never averaged.
grouped_features = df_cluster_plt.groupby(['cluster', 'difficulty'])[['correct', 'eng']]
cluster_features_plot = grouped_features.mean().reset_index()
cluster_features_std_plot = grouped_features.std().reset_index()

fig, axes = plt.subplots(2, sharex=True, figsize=(5, 3), dpi=100)
label_map = {'correct': r'$P($success$|L)$', 'eng': 'Engagement'}
# Range each feature is clipped to when drawing error bars, and matching ticks.
value_range = {'eng': (-1, 1), 'correct': (0, 1)}
yticks_map = {'eng': [-1, 0, 1], 'correct': [0, 0.5, 1]}

for feature, ax in zip(['eng', 'correct'], axes):
    lower, upper = value_range[feature]
    for cluster, color in zip([0, 1], ['red', 'blue']):
        # .copy() so the helper columns below are added to an independent frame
        plt_data = cluster_features_plot.loc[cluster_features_plot['cluster']==cluster, ['difficulty', feature]].copy()
        plt_data['std'] = cluster_features_std_plot.loc[cluster_features_std_plot['cluster']==cluster, feature]
        # mean +/- one standard deviation, clipped to the feature's range
        plt_data['min'] = (plt_data[feature] - plt_data['std']).clip(lower=lower)
        plt_data['max'] = (plt_data[feature] + plt_data['std']).clip(upper=upper)

        # The label needs both '$' delimiters: with text.usetex enabled an
        # unterminated math expression is invalid LaTeX.
        ax.errorbar(plt_data['difficulty'], plt_data[feature],
                    yerr=(plt_data[feature]-plt_data['min'], plt_data['max']-plt_data[feature]),
                    fmt='--o', linewidth=1, color=color,
                    ecolor=f"tab:{color}", elinewidth=1, capsize=4, label=f"$C_{cluster+1}$")

    ax.set(ylabel=label_map[feature], xticks=[1, 2, 3], yticks=yticks_map[feature])
    ax.grid(zorder=0)
fig.supxlabel('Difficulty Level')
plt.legend(frameon=True)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('output', exist_ok=True)
plt.savefig(os.path.join('output', "clusters_statistics.pdf"),
            bbox_inches='tight')

In [None]:
# Saving clustering results in the pandas dataframe
# Lookup tables: signed length -> signed score, correctness flag -> +/-1 result,
# (length, feedback) pair -> action id.
task_map = {-3: -1, -5:-2, -7:-3, 3: 1, 5:2, 7:3}
result_map = {1:1, 0:-1}
action_map = {(3, 0): 0, (5, 0): 1, (7, 0): 2, (3, 1): 3, (5, 1): 3, (7, 1): 3, (3, 2): 4, (5, 2): 4, (7, 2): 4}

df_fin = df.copy(deep=True)

# Integer placeholders that are filled participant by participant below
df_fin['previous_score'] = 0
df_fin['action'] = 0

# Result is +1 / -1; the score is the task level carrying the sign of the result
df_fin['current_result'] = [result_map[outcome] for outcome in df_fin['correct']]
df_fin['current_score'] = [
    task_map[seq_length * outcome]
    for seq_length, outcome in zip(df_fin['length'], df_fin['current_result'])
]

# Output column names; the raw correctness flag is no longer needed
df_fin = df_fin.rename(columns={'eng': 'engagement', 'feedback': 'robot_feedback'})
df_fin = df_fin.drop(columns=['correct'])

# Add previous score and action fields
for user in df_fin['participant_id'].unique():
    own_rows = df_fin['participant_id'] == user

    # Score of the participant's preceding row (0 for the first one)
    df_fin.loc[own_rows, 'previous_score'] = (
        df_fin.loc[own_rows, 'current_score'].shift(periods=1, fill_value=0)
    )

    # Action id of the participant's following row; the last row gets -1
    own_actions = np.array([
        action_map[pair]
        for pair in zip(df_fin.loc[own_rows, 'length'], df_fin.loc[own_rows, 'robot_feedback'])
    ])
    upcoming_actions = np.roll(own_actions, -1)
    upcoming_actions[-1] = -1
    df_fin.loc[own_rows, 'action'] = upcoming_actions

output_columns = ['cluster','participant_id','secs','length','current_score', 'previous_score', 'current_result', 'robot_feedback', 'action', 'engagement', 'duration', 'id']
df_fin = df_fin.reindex(columns=output_columns)

In [None]:
# Display the assembled frame for a visual check before it is written to disk.
df_fin

In [None]:
# Write the clustered data as a comma-separated file without the row index;
# floats are formatted with four decimals in a ten-character field.
df_fin.to_csv(
    "output/final_clustered.csv",
    index=False,
    float_format='%10.4f',
)