In [1]:
import os
import pandas as pd
import numpy as np

from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

from matplotlib import pyplot as plt


In [2]:
input_dir_path = '../analysis/ready_group/width_wise/All/all/'

In [3]:
filename = "final.csv"
input_df = pd.read_csv(os.path.join(input_dir_path, filename))
input_df['ID'] = input_df['ID'].str.replace(r'^f', '9', regex=True)
input_df['window_id'] = input_df['ID'].astype(str) + '_' + input_df['window'].astype(int).apply(lambda x: f'{x:04d}')

Y = input_df[["window_id",'ID','success','success_categorical']].copy()
mapping = {'high': 1, 'low': 0}
Y['success_categorical'] = Y['success_categorical'].map(mapping)
display(Y)
input_df.drop("success_categorical", inplace=True, axis=1)
input_df.drop("success", inplace=True, axis=1)
display(input_df)


Unnamed: 0,window_id,ID,success,success_categorical
0,910031_0001,910031,5,1
1,910031_0002,910031,5,1
2,910031_0003,910031,5,1
3,910031_0004,910031,5,1
4,910031_0005,910031,5,1
...,...,...,...,...
5885,910006_0063,910006,3,0
5886,910006_0064,910006,3,0
5887,910006_0065,910006,3,0
5888,910006_0066,910006,3,0


Unnamed: 0,window,ID,user_2_num_fixations,user_2_fixation_ratio,user_2_fixation_durations_mean,user_2_fixation_durations_median,user_2_fixation_durations_min,user_2_fixation_durations_max,user_2_fixation_durations_std,user_2_fixation_dispersions_mean,...,time_diff,neutral_change,happy_change,sad_change,angry_change,fearful_change,disgusted_change,surprised_change,time_diff_change,window_id
0,1.0,910031,142.0,0.258652,140.425946,89.538574,18.950195,933.806885,145.537797,39.281922,...,14.766104,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,910031_0001
1,2.0,910031,136.0,0.291845,161.609962,93.777954,18.950195,1772.697998,218.578415,46.115280,...,18.990136,-0.001627,0.001542,-0.000116,0.000067,7.948154e-07,1.792059e-04,-0.000046,4.224032,910031_0002
2,3.0,910031,111.0,0.212237,173.819613,96.728027,18.950195,1772.697998,236.315603,50.217409,...,24.440235,-0.050757,0.060207,-0.009528,0.000070,-3.268232e-08,1.818222e-05,-0.000010,5.450100,910031_0003
3,4.0,910031,94.0,0.175373,191.474344,120.198486,20.920898,1772.697998,253.898080,56.490419,...,29.721428,-0.035757,0.035753,-0.000032,-0.000078,1.070729e-06,1.332743e-05,0.000099,5.281192,910031_0004
4,5.0,910031,84.0,0.171779,212.406238,145.857910,21.939941,1772.697998,265.778521,60.947407,...,34.808612,-0.072987,0.073023,-0.000005,-0.000027,1.648964e-06,2.159010e-05,-0.000028,5.087184,910031_0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5885,63.0,910006,237.0,0.279152,73.593671,48.973877,17.952148,534.600830,70.110923,22.753659,...,324.911382,-0.017148,-0.000051,0.017036,-0.000008,-7.047028e-08,-2.662790e-07,0.000170,4.969743,910006_0063
5886,64.0,910006,231.0,0.286600,77.907588,47.848145,17.952148,1033.776123,95.532309,26.949262,...,329.912501,0.011785,-0.002855,-0.009012,0.000043,-3.716472e-05,3.091413e-06,0.000072,5.001119,910006_0064
5887,65.0,910006,224.0,0.284264,81.282426,51.088867,17.952148,1033.776123,96.210971,28.187855,...,334.948692,-0.078595,0.062743,0.001859,0.000043,1.170641e-05,1.904388e-05,0.013920,5.036191,910006_0065
5888,66.0,910006,200.0,0.298063,98.499094,57.062134,17.952148,1192.486084,139.831733,30.458274,...,339.810184,0.019549,0.001163,-0.029053,-0.000058,-2.348487e-05,2.853321e-07,0.008421,4.861492,910006_0066


In [4]:
input_df = input_df.loc[:, ~input_df.columns.str.endswith('min')]
input_df = input_df.loc[:, ~input_df.columns.str.endswith('median')]
input_df = input_df.loc[:, ~input_df.columns.str.endswith('max')]

display(input_df)

Unnamed: 0,window,ID,user_2_num_fixations,user_2_fixation_ratio,user_2_fixation_durations_mean,user_2_fixation_durations_std,user_2_fixation_dispersions_mean,user_2_fixation_dispersions_std,user_2_num_saccades,user_2_saccade_mean_distance,...,time_diff,neutral_change,happy_change,sad_change,angry_change,fearful_change,disgusted_change,surprised_change,time_diff_change,window_id
0,1.0,910031,142.0,0.258652,140.425946,145.537797,39.281922,64.722229,407.0,102.470916,...,14.766104,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,910031_0001
1,2.0,910031,136.0,0.291845,161.609962,218.578415,46.115280,107.730445,330.0,107.939090,...,18.990136,-0.001627,0.001542,-0.000116,0.000067,7.948154e-07,1.792059e-04,-0.000046,4.224032,910031_0002
2,3.0,910031,111.0,0.212237,173.819613,236.315603,50.217409,118.597660,412.0,126.752343,...,24.440235,-0.050757,0.060207,-0.009528,0.000070,-3.268232e-08,1.818222e-05,-0.000010,5.450100,910031_0003
3,4.0,910031,94.0,0.175373,191.474344,253.898080,56.490419,128.274834,442.0,131.650923,...,29.721428,-0.035757,0.035753,-0.000032,-0.000078,1.070729e-06,1.332743e-05,0.000099,5.281192,910031_0004
4,5.0,910031,84.0,0.171779,212.406238,265.778521,60.947407,135.024030,405.0,133.101932,...,34.808612,-0.072987,0.073023,-0.000005,-0.000027,1.648964e-06,2.159010e-05,-0.000028,5.087184,910031_0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5885,63.0,910006,237.0,0.279152,73.593671,70.110923,22.753659,64.052163,612.0,54.441640,...,324.911382,-0.017148,-0.000051,0.017036,-0.000008,-7.047028e-08,-2.662790e-07,0.000170,4.969743,910006_0063
5886,64.0,910006,231.0,0.286600,77.907588,95.532309,26.949262,87.341035,575.0,56.370294,...,329.912501,0.011785,-0.002855,-0.009012,0.000043,-3.716472e-05,3.091413e-06,0.000072,5.001119,910006_0064
5887,65.0,910006,224.0,0.284264,81.282426,96.210971,28.187855,88.576113,564.0,61.779183,...,334.948692,-0.078595,0.062743,0.001859,0.000043,1.170641e-05,1.904388e-05,0.013920,5.036191,910006_0065
5888,66.0,910006,200.0,0.298063,98.499094,139.831733,30.458274,93.507148,471.0,65.755164,...,339.810184,0.019549,0.001163,-0.029053,-0.000058,-2.348487e-05,2.853321e-07,0.008421,4.861492,910006_0066


In [5]:
loc_faceApi = input_df.columns.get_loc("window.5")
print(loc_faceApi)
faceApi_df = input_df.iloc[:, 164:]
display(faceApi_df)
faceApi_df.set_index("window_id", inplace=True)

163


Unnamed: 0,neutral,happy,sad,angry,fearful,disgusted,surprised,time_diff,neutral_change,happy_change,sad_change,angry_change,fearful_change,disgusted_change,surprised_change,time_diff_change,window_id
0,0.745381,0.239552,0.009881,0.004027,6.784240e-07,0.000101,0.001058,14.766104,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00,0.000000,0.000000,910031_0001
1,0.743753,0.241094,0.009765,0.004094,1.473239e-06,0.000280,0.001012,18.990136,-0.001627,0.001542,-0.000116,0.000067,7.948154e-07,1.792059e-04,-0.000046,4.224032,910031_0002
2,0.692996,0.301301,0.000237,0.004164,1.440557e-06,0.000298,0.001002,24.440235,-0.050757,0.060207,-0.009528,0.000070,-3.268232e-08,1.818222e-05,-0.000010,5.450100,910031_0003
3,0.657239,0.337054,0.000206,0.004087,2.511287e-06,0.000311,0.001101,29.721428,-0.035757,0.035753,-0.000032,-0.000078,1.070729e-06,1.332743e-05,0.000099,5.281192,910031_0004
4,0.584252,0.410077,0.000200,0.004060,4.160250e-06,0.000333,0.001073,34.808612,-0.072987,0.073023,-0.000005,-0.000027,1.648964e-06,2.159010e-05,-0.000028,5.087184,910031_0005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5885,0.903912,0.004739,0.086136,0.000344,7.145080e-05,0.000001,0.004797,324.911382,-0.017148,-0.000051,0.017036,-0.000008,-7.047028e-08,-2.662790e-07,0.000170,4.969743,910006_0063
5886,0.915696,0.001884,0.077124,0.000387,3.428609e-05,0.000005,0.004870,329.912501,0.011785,-0.002855,-0.009012,0.000043,-3.716472e-05,3.091413e-06,0.000072,5.001119,910006_0064
5887,0.837101,0.064627,0.078983,0.000430,4.599250e-05,0.000024,0.018790,334.948692,-0.078595,0.062743,0.001859,0.000043,1.170641e-05,1.904388e-05,0.013920,5.036191,910006_0065
5888,0.856650,0.065790,0.049930,0.000372,2.250762e-05,0.000024,0.027211,339.810184,0.019549,0.001163,-0.029053,-0.000058,-2.348487e-05,2.853321e-07,0.008421,4.861492,910006_0066


In [6]:
loc_eye_tracking_end = input_df.columns.get_loc('window.1')
loc_eye_ = input_df.columns.get_loc('window.4')

eye_tracking_df = input_df.iloc[:, 4:loc_eye_tracking_end]
eye_tracking_df_2 = input_df.iloc[:, loc_eye_:162]

eye_df = pd.concat([eye_tracking_df, eye_tracking_df_2], axis=1)
eye_df.insert(0, 'window_id', input_df['window_id'])
eye_df.set_index("window_id", inplace=True)
eye_df.drop('window.4', inplace=True, axis=1)


display(eye_df)


Unnamed: 0_level_0,user_2_fixation_durations_mean,user_2_fixation_durations_std,user_2_fixation_dispersions_mean,user_2_fixation_dispersions_std,user_2_num_saccades,user_2_saccade_mean_distance,user_2_saccade_median_distance,user_2_saccade_min_distance,user_2_saccade_max_distance,user_2_saccade_std_distance,...,user_1_pupil_diam_mean,user_1_pupil_diam_std,cognitive_load_mean_users,cognitive_load_diff_users,pupil_diam_mean,pupil_diam_std,pupil_diam_mean_diff,pupil_diam_med_diff,pupil_diam_min_diff,pupil_diam_max_diff
window_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
910031_0001,140.425946,145.537797,39.281922,64.722229,407.0,102.470916,92.885984,15.227447,556.198350,69.385452,...,2.743819,0.268769,0.000000,0.000000,3.491106,0.275511,1.494574,1.521896,1.231598,1.176598
910031_0002,161.609962,218.578415,46.115280,107.730445,330.0,107.939090,105.792693,15.227447,329.898273,64.776887,...,2.713145,0.210178,0.000000,0.000000,3.412119,0.248689,1.397949,1.343826,0.609123,1.176598
910031_0003,173.819613,236.315603,50.217409,118.597660,412.0,126.752343,126.455455,15.227447,322.882461,71.462710,...,2.679538,0.186468,0.000000,0.000000,3.352677,0.235120,1.346277,1.305588,0.609123,1.176598
910031_0004,191.474344,253.898080,56.490419,128.274834,442.0,131.650923,127.392026,15.793274,749.876513,74.007842,...,2.682115,0.180730,0.000000,0.000000,3.323992,0.231009,1.283755,1.256332,0.953796,1.176598
910031_0005,212.406238,265.778521,60.947407,135.024030,405.0,133.101932,125.345914,15.793274,749.876513,73.606402,...,2.733904,0.207563,1.631004,0.315882,3.328806,0.241770,1.189803,1.160912,0.953796,1.176598
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
910006_0063,73.593671,70.110923,22.753659,64.052163,612.0,54.441640,49.704304,13.707895,296.106966,29.567561,...,4.121134,1.003054,2.106011,0.721400,3.543791,0.603100,1.154686,1.291161,0.130585,2.199776
910006_0064,77.907588,95.532309,26.949262,87.341035,575.0,56.370294,48.547525,13.707895,310.994461,37.016351,...,3.913352,0.907191,2.018107,0.633257,3.430668,0.543780,0.965368,1.115486,0.130585,2.126556
910006_0065,81.282426,96.210971,28.187855,88.576113,564.0,61.779183,50.348130,13.707895,720.426972,49.300116,...,3.686351,0.865299,2.009176,0.785544,3.325117,0.525720,0.722468,0.960224,0.130585,2.126556
910006_0066,98.499094,139.831733,30.458274,93.507148,471.0,65.755164,53.006244,13.707895,720.426972,55.540236,...,3.740577,0.844012,1.768679,0.600778,3.360914,0.511618,0.759326,0.967552,0.130585,2.126556


In [7]:
loc_wristband_data_start = input_df.columns.get_loc('window.1')
loc_wristband_data_end = input_df.columns.get_loc('window.4')

wristband_df = input_df.iloc[:, loc_wristband_data_start + 1 : (loc_wristband_data_end - 1)]
wristband_df.insert(0, 'window_id', input_df['window_id'])
wristband_df.set_index("window_id", inplace=True)
wristband_df.drop('window.3', inplace=True, axis=1)
wristband_df.drop('window.2', inplace=True, axis=1)

display(wristband_df)


Unnamed: 0_level_0,user_2_stress_values,user_1_stress_values,stress_values_mean,stress_values_diff,user_2_avg_hr,user_2_hr_variance,user_2_max_hr,user_2_min_hr,user_2_hr_diff,user_2_autocorrelation_1,...,user_1_arousal,user_1_engagement_amplitude,user_1_engagement_nr_peaks,user_1_engagement_auc,engagement_amplitude_mean,engagement_nr_peaks_mean,engagement_auc_mean,arousal,engagement_amplitude_diff,engagement_nr_peaks_diff
window_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
910031_0001,-5.761697e-15,-0.005333,-0.002667,0.005333,102.546060,63.086856,112.93600,93.65418,19.28182,1.0,...,0.007687,0.000000,0.0,0.340793,0.000000,0.0,0.665168,0.001282,0.000000,0.0
910031_0002,-1.379310e-03,0.003626,0.001123,0.005005,96.082152,103.916891,112.93600,83.47441,29.46159,1.0,...,0.019218,0.000000,0.0,1.192267,0.013892,0.5,2.284386,0.032026,0.027784,1.0
910031_0003,-1.379592e-03,-0.001649,-0.001514,0.000269,98.300560,106.093929,112.93600,83.47441,29.46159,1.0,...,0.029467,0.000000,0.0,2.035947,0.042620,1.5,3.865110,0.062771,0.085240,3.0
910031_0004,-1.613445e-03,-0.000590,-0.001102,0.001023,94.367075,120.479366,106.66170,81.69833,24.96337,1.0,...,0.043560,0.000000,0.0,2.774940,0.041218,1.5,5.398991,0.070457,0.082436,3.0
910031_0005,-1.387811e-03,-0.000406,-0.000897,0.000982,92.358975,105.098912,106.66170,78.36371,28.29799,1.0,...,0.051247,0.000000,0.0,3.595436,0.031049,1.0,6.931472,0.088393,0.062098,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
910006_0063,-5.417043e-04,-0.002476,-0.001509,0.001934,69.645675,2.924932,72.44947,67.36530,5.08417,1.0,...,2.554014,3.005183,5.0,197.278432,1.516530,3.0,108.114340,2.392595,2.977306,4.0
910006_0064,-4.422529e-04,-0.001396,-0.000919,0.000953,68.037958,5.223049,72.44947,63.99704,8.45243,1.0,...,2.802547,2.888339,4.0,199.305570,1.444170,2.0,109.062423,2.632161,2.888339,4.0
910006_0065,-6.019862e-04,-0.000582,-0.000592,0.000020,68.136041,4.668457,72.44947,63.99704,8.45243,1.0,...,2.305481,1.940700,2.0,199.808765,0.982582,1.5,109.319754,2.145344,1.916236,1.0
910006_0066,-6.219876e-04,0.000280,-0.000171,0.000902,67.814535,5.055654,72.44947,63.99704,8.45243,1.0,...,3.117202,3.103094,3.0,196.761234,1.551547,1.5,107.845954,2.953221,3.103094,3.0


In [17]:
X_train, X_test, y_train, y_test = train_test_split(faceApi_df, Y['success'], test_size=0.4)

In [18]:
rf_face = RandomForestClassifier(n_estimators=100)
rf_face.fit(X_train, y_train)

In [19]:
y_pred = rf_face.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
sorted_features = sorted(zip(faceApi_df.columns, rf_face.feature_importances_), key=lambda x: x[1])

last_30_features = sorted_features[-30:][::-1]

print("{:<5s} | {:<30s} | {:<10s}".format("Rank", "Feature", "Importance"))

# Print the separator
print("-"*50)

# Print each feature
for i, (feature, importance) in enumerate(last_30_features, start=1):
    print("{:<5d} | {:<30s} | {:<10.6f}".format(i, feature, importance))

print(accuracy)

Rank  | Feature                        | Importance
--------------------------------------------------
1     | surprised                      | 0.119290  
2     | sad                            | 0.115141  
3     | fearful                        | 0.094511  
4     | disgusted                      | 0.090020  
5     | angry                          | 0.082920  
6     | time_diff                      | 0.079689  
7     | happy                          | 0.075827  
8     | neutral                        | 0.066688  
9     | time_diff_change               | 0.046472  
10    | sad_change                     | 0.036576  
11    | disgusted_change               | 0.035228  
12    | surprised_change               | 0.033650  
13    | happy_change                   | 0.032501  
14    | fearful_change                 | 0.031176  
15    | angry_change                   | 0.030864  
16    | neutral_change                 | 0.029447  
0.8314940577249575


In [11]:
X_train, X_test, y_train, y_test = train_test_split(eye_df, Y['success_categorical'], test_size=0.4)

In [12]:
rf_eye = RandomForestClassifier(n_estimators=100)
rf_eye.fit(X_train, y_train)

In [13]:
y_pred = rf_eye.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
sorted_features = sorted(zip(eye_df.columns, rf_eye.feature_importances_), key=lambda x: x[1])

last_30_features = sorted_features[-30:][::-1]

print("{:<5s} | {:<30s} | {:<10s}".format("Rank", "Feature", "Importance"))

# Print the separator
print("-"*50)

# Print each feature
for i, (feature, importance) in enumerate(last_30_features, start=1):
    print("{:<5d} | {:<30s} | {:<10.6f}".format(i, feature, importance))

print(accuracy)

Rank  | Feature                        | Importance
--------------------------------------------------
1     | user_1_IPI                     | 0.057690  
2     | pupil_diam_mean_diff           | 0.039389  
3     | user_1_pupil_diam_mean         | 0.032981  
4     | pupil_diam_med_diff            | 0.032647  
5     | user_2_saccade_median_distance | 0.031547  
6     | user_2_saccade_mean_distance   | 0.029334  
7     | pupil_diam_max_diff            | 0.025643  
8     | pupil_diam_mean                | 0.025231  
9     | user_2_percieved_difficulty_std | 0.022552  
10    | percieved_difficulty_std       | 0.019844  
11    | user_1_percieved_difficulty_std | 0.019696  
12    | user_2_pupil_diam_mean         | 0.019489  
13    | user_1_ipa_value               | 0.019175  
14    | saccade_mean_velocity_diff     | 0.019031  
15    | user_1_pupil_diam_std          | 0.018559  
16    | saccade_median_distance        | 0.015058  
17    | pupil_diam_std                 | 0.014321  
18    | IPI

In [14]:
X_train, X_test, y_train, y_test = train_test_split(wristband_df, Y['success_categorical'], test_size=0.4)

In [15]:
rf_wrist = RandomForestClassifier(n_estimators=100)
rf_wrist.fit(X_train, y_train)

In [16]:
y_pred = rf_wrist.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
sorted_features = sorted(zip(wristband_df.columns, rf_wrist.feature_importances_), key=lambda x: x[1])

last_30_features = sorted_features[-30:][::-1]

print("{:<5s} | {:<30s} | {:<10s}".format("Rank", "Feature", "Importance"))

# Print the separator
print("-"*50)

# Print each feature
for i, (feature, importance) in enumerate(last_30_features, start=1):
    print("{:<5d} | {:<30s} | {:<10.6f}".format(i, feature, importance))

print(accuracy)

Rank  | Feature                        | Importance
--------------------------------------------------
1     | engagement_auc_mean            | 0.124787  
2     | user_1_engagement_auc          | 0.114357  
3     | user_2_engagement_auc          | 0.095900  
4     | autocorrelation_2_mean         | 0.060119  
5     | arousal                        | 0.032488  
6     | user_2_engagement_amplitude    | 0.032326  
7     | user_1_autocorrelation_2       | 0.026018  
8     | user_2_arousal                 | 0.025171  
9     | autocorrelation_1_mean         | 0.024929  
10    | engagement_amplitude_diff      | 0.023375  
11    | hr_variance                    | 0.022725  
12    | engagement_amplitude_mean      | 0.020949  
13    | user_1_min_hr                  | 0.020494  
14    | user_1_max_hr                  | 0.019404  
15    | stress_values_diff             | 0.019385  
16    | hr_diff                        | 0.018687  
17    | user_1_arousal                 | 0.018323  
18    | min_h