In [1]:
# Basic libraries
import pandas as pd
import numpy as np
import math

# Models
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

# Cross-Validation
from sklearn.model_selection import GroupKFold

# Prediction Scoring
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import make_scorer
from statistics import mean

# Ignore warnings
import warnings
warnings.simplefilter('ignore')  

### Loading the test group for prediction

In [2]:
# Load features from control  
df = pd.read_csv('Final_Features_control.csv')

# Scale data for better perfomance
scaler = StandardScaler()

# Extract name of columns in df as a list
columns = list(df.columns)

# Eliminate both participant name and empathy column because they are not a features
columns.pop(0)  
columns.pop(-1)

# Scale data by columns 
df[columns] = scaler.fit_transform(df[columns])

### Categorizing our label
We are passing our label from a continuous to a discrete variable for a better __interpretation__ of the results and for the __simplicity__ of the predictions. We use the `pd.cut` function from pandas to create the bins and the respective label. In __Table 1__, we explain the process done.

In [3]:
# Discretizing my label/target
df['Empathy Score'] = pd.cut(df["Empathy Score"],
       bins=[80, 100, 110, 120, 130], 
       labels=[0, 1, 2, 3])

### <center> Table 1
| Empathy Range | Class | Label |
| :-: | :-: | :-: |
| <100 | Bad | 0 |
| 100-110 | Average | 1 |
| 110-120 | Good | 2 |
| >120 | Outstanding | 3 |

In [4]:
# Print DataFrame to see the results
df.head()

Unnamed: 0,Participant name,Mean Pupil diameter left,Std Pupil diameter left,Mean Pupil diameter right,Std Pupil diameter right,Num. of Fixations,Num. of Saccades,Num. of Unclassified,Recording duration (s),Mean Gaze event duration (s),Mean Fixation point X,Std Fixation point X,Mean Fixation point Y,Std Fixation point Y,Mean Gaze point X,Std Gaze point X,Mean Gaze point Y,Std Gaze point Y,Empathy Score
0,2.0,-0.432974,0.419118,-0.421194,0.464759,1.587351,1.18904,-0.146779,1.194853,-0.207512,-0.634649,-0.13446,-0.991371,0.10938,-0.779279,-0.242149,-0.64718,-0.143647,2
1,2.0,-0.879696,-0.040243,-0.905519,0.324219,-1.40753,-0.923646,-1.105699,-1.694376,-0.244112,0.577072,-1.720481,-0.303356,0.415151,0.549348,-1.829771,0.165859,0.930937,2
2,2.0,-0.617033,-0.581028,-0.705775,-0.525981,-1.541145,-0.881219,-0.809038,-1.657819,-0.247426,0.403903,-1.32023,-0.103427,-1.240654,0.527018,-1.203205,-0.003172,-1.156548,2
3,2.0,-0.633217,-0.44787,-0.617493,-0.373333,-1.564468,-1.079856,-1.144688,-1.825463,-0.126731,-0.448392,-0.947675,-0.038253,-0.04615,-0.476786,-1.194187,0.082513,-0.25764,2
4,2.0,-0.556424,-0.003391,-0.456996,0.333323,3.975823,4.980497,2.40224,5.0252,-0.283657,-0.484989,0.319513,-0.290735,0.524766,-0.619729,0.26881,0.013531,0.388368,2


### Split data using KFolds
We are splitting our training and testing data using Group KFolds because of the nature of our labels, in essence we have a bunch of rows that that share the same label so we dont want any leakage of data from the testing data to the training data. In this code we separate the features and our label first and then create the k-folds.

In [5]:
# Drop participant name and empathy column because they are not a feature
X = df.drop(['Participant name','Empathy Score'], axis=1)

# Store label in variable 'y'
y = df['Empathy Score']

# Store group indexes in 'gps'
gps = df['Participant name']

# Create Group KFolds
kf = GroupKFold(n_splits=5)

In [6]:
for train_index, test_index in kf.split(X, y, groups=gps):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [7]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(116, 17)
(31, 17)
(116,)
(31,)


## Classification Models

In [8]:
# DUMMY CLASSIFIER
dummy = DummyClassifier(strategy ='prior') 
dummy.fit(X_train, y_train)
y_pred_dummy = cross_val_score(dummy, X, y, cv=kf, groups=gps, scoring='balanced_accuracy')

# DECISION TREE CLASSIFIER
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_pred_dt = cross_val_score(dt, X, y, cv=kf, groups=gps, scoring='balanced_accuracy')

# ADABOOST CLASSIFIER
ada = AdaBoostClassifier(n_estimators=100)
ada.fit(X_train, y_train)
y_pred_ada = cross_val_score(ada, X, y, cv=kf, groups=gps, scoring='balanced_accuracy')

# NN CLASSIFIER
nn = MLPClassifier(solver='lbfgs',alpha=1e-5,hidden_layer_sizes=(5, 2),random_state=1)
nn.fit(X_train, y_train)
y_pred_nn = cross_val_score(nn, X, y, cv=kf, groups=gps, scoring='balanced_accuracy')

# SVM CLASSIFIER
svc = SVC(random_state=13)
svc.fit(X_train, y_train)
y_pred_svc = cross_val_score(svc, X, y, cv=kf, groups=gps, scoring='balanced_accuracy')

# SGD CLASSIFIER
sgd = SGDClassifier(random_state=13)
sgd.fit(X_train, y_train)
y_pred_sgd = cross_val_score(sgd, X, y, cv=kf, groups=gps, scoring='balanced_accuracy')

# KNN CLASSIFIER
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred_knn = cross_val_score(knn, X, y, cv=kf, groups=gps, scoring='balanced_accuracy')

# NEAREST CENTROID
ncc = NearestCentroid()
ncc.fit(X_train, y_train)
y_pred_ncc = cross_val_score(ncc, X, y, cv=kf, groups=gps, scoring='balanced_accuracy')

# RANDOM FORREST
rfr = RandomForestClassifier(n_estimators=300, random_state=13)
rfr.fit(X_train, y_train)
y_pred_rfr = cross_val_score(rfr, X, y, cv=kf, groups=gps, scoring='balanced_accuracy')

# LOGISTIC REGRESSION
logreg = LogisticRegression(random_state=13)
logreg.fit(X_train, y_train)
y_pred_logreg = cross_val_score(logreg, X, y, cv=kf, groups=gps, scoring='balanced_accuracy')

# RESULTS
print('Accuracy SGDC:     %.2f +/- %.2f' % ((y_pred_sgd.mean()),y_pred_sgd.std()),'\n')
print('Accuracy NCC:      %.2f +/- %.2f' % ((y_pred_ncc.mean()),y_pred_ncc.std()))
print('Accuracy NN:       %.2f +/- %.2f' % ((y_pred_nn.mean()),y_pred_nn.std()))
print('Accuracy LOGR:     %.2f +/- %.2f' % ((y_pred_logreg.mean()),y_pred_logreg.std()),'***','\n')
print('Accuracy Adaboost: %.2f +/- %.2f' % ((y_pred_ada.mean()),y_pred_ada.std()))
print('Accuracy DT:       %.2f +/- %.2f' % ((y_pred_dt.mean()),y_pred_dt.std()))
print('Accuracy RFR:      %.2f +/- %.2f' % ((y_pred_rfr.mean()),y_pred_rfr.std()),'\n')
print('Accuracy KNN:      %.2f +/- %.2f' % ((y_pred_knn.mean()),y_pred_knn.std()))
print('Accuracy SVC:      %.2f +/- %.2f' % ((y_pred_svc.mean()),y_pred_svc.std()))
print('Accuracy Dummy:    %.2f +/- %.2f' % ((y_pred_dummy.mean()),y_pred_dummy.std()))

Accuracy SGDC:     0.37 +/- 0.05 

Accuracy NCC:      0.28 +/- 0.07
Accuracy NN:       0.35 +/- 0.09
Accuracy LOGR:     0.39 +/- 0.10 *** 

Accuracy Adaboost: 0.28 +/- 0.08
Accuracy DT:       0.30 +/- 0.08
Accuracy RFR:      0.33 +/- 0.09 

Accuracy KNN:      0.38 +/- 0.11
Accuracy SVC:      0.32 +/- 0.06
Accuracy Dummy:    0.40 +/- 0.08


## Explainability of results

In the case of the __test group__, the __best classifier__ was `LogisticRegression`, so we are going to extract the best __features__ to understand the __behaviour__ of our predictor per class. With this in mind, we are plotting the __best predictor__ (__feature__ in this case) for each class, and their respective __coefficient or weight__.

In [25]:
max_class = logreg.coef_.argmax(axis=1)

for i in range(logreg.coef_.shape[0]):
    print('Feature:', logreg.feature_names_in_[max_class[i]])
    print('Coef: %.2f' % logreg.coef_[i,max_class[i]])


Feature: Std Gaze point X
Coef: 0.81
Feature: Mean Fixation point X
Coef: 0.71
Feature: Num. of Saccades
Coef: 0.94


^^^ __The outcome of this coefficient corresponds to how true is the feature to the class prediction.__

***CONTROL GROUP ONLY HAS EMPATHY SCORES FOR THE FIRST THREE LABELS***

In [28]:
df['Empathy Score'].unique()

[2, 1, 0]
Categories (4, int64): [0 < 1 < 2 < 3]

In [26]:
# logreg.coef_

In [11]:
# sgd.coef_[3,:].max()

In [12]:
# sgd.feature_names_in_[14]

In [13]:
# max_class = sgd.coef_.argmax(axis=1)
# max_class