In [None]:
import mne 
import pandas as pd 
import numpy as np 
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix,classification_report
import seaborn as sn 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV

# Each emotion to be predicted is stored in its own CSV file, so every class
# is read into a separate dataframe.

allDataNoStim = pd.read_csv("ExperimentoMarioPostFeaturesSim/nada_procesedSimulador.csv")
allDataAngry = pd.read_csv("ExperimentoMarioPostFeaturesSim/angry_procesedSimulador.csv")
allDataNeutral = pd.read_csv("ExperimentoMarioPostFeaturesSim/neutral_procesedSimulador.csv")
allDataJoy = pd.read_csv("ExperimentoMarioPostFeaturesSim/joy_procesedSimulador.csv")

In [None]:
# Tag every per-emotion dataframe with its integer class label:
# 0 = no stimulus, 1 = angry, 2 = neutral, 3 = joy.

allDataNoStim, allDataAngry, allDataNeutral, allDataJoy = (
    frame.assign(Emotion=label)
    for label, frame in enumerate(
        (allDataNoStim, allDataAngry, allDataNeutral, allDataJoy)
    )
)


Next, we remove highly correlated features and apply PCA for dimensionality reduction.

In [None]:
# Stack the four labelled dataframes into a single one. ignore_index=True gives
# the result a fresh 0..n-1 index instead of repeating each file's own row
# numbers, so index-aligned operations on it cannot pair up the wrong rows.
allData = pd.concat(
    [allDataNoStim, allDataAngry, allDataNeutral, allDataJoy],
    ignore_index=True,
)

In [None]:
# Renumber the rows 0..n-1, discarding the old index instead of keeping it as
# an extra column.
allData = allData.reset_index(drop=True)


In [None]:
# Highly correlated features carry redundant information, so one feature of
# every such pair is removed. Pairs above this absolute correlation count as
# "highly correlated".
CORRELATION_THRESHOLD = 0.90

# Absolute correlation between the features only. The "Emotion" label is left
# out so that it can never be selected for removal, however strongly it
# correlates with a feature (it is still needed as the target afterwards).
cor_matrix = allData.drop(columns=["Emotion"]).corr().abs()

# Keep the strict upper triangle so each pair is inspected once and the
# diagonal (self-correlation of 1) is ignored. The builtin `bool` is used
# because the `np.bool` alias no longer exists in current NumPy releases.
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape, dtype=bool), k=1))

# A column is dropped when it is correlated above the threshold with any
# column that precedes it.
to_drop = [
    column
    for column in upper_tri.columns
    if (upper_tri[column] > CORRELATION_THRESHOLD).any()
]

# Remove the selected columns from the dataframe
allDataPostCor = allData.drop(columns=to_drop)

In [None]:
# Set the target aside so that only the features are scaled and projected.
labels = allDataPostCor["Emotion"]
allDataPostCor = allDataPostCor.drop(columns="Emotion")

# Standardise the features before PCA.
# NOTE(review): the scaler and the PCA are fitted on every row, including the
# rows that later cells assign to the test sets — confirm this is intended.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(allDataPostCor)

# Keep as many principal components as needed to explain 95% of the variance
pca = PCA(n_components=0.95)
pca_data = pca.fit_transform(scaled_data)

# Wrap the projected data in a new dataframe
allDataPCA = pd.DataFrame(pca_data)


In [None]:
# Add the label column. The values are assigned by position (as a plain array)
# because allDataPCA carries a brand-new 0..n-1 index: this keeps every row
# paired with its own label even if the index of `labels` is not 0..n-1.

allDataPCA["Emotion"] = labels.to_numpy()

In [None]:
# Split the PCA dataframe back into one dataframe per emotion so that the
# train/test division can be carried out separately for each class.

allDataJoy = allDataPCA[allDataPCA["Emotion"].eq(3)]
allDataNoStim = allDataPCA[allDataPCA["Emotion"].eq(0)]
allDataNeutral = allDataPCA[allDataPCA["Emotion"].eq(2)]
allDataAngry = allDataPCA[allDataPCA["Emotion"].eq(1)]

In [None]:
# Every class is divided into train and test on its own so that the training
# and validation sets stay balanced across emotions.

joyCopia, angerCopia, neutralCopia, nadaCopia = (
    frame.copy()
    for frame in (allDataJoy, allDataAngry, allDataNeutral, allDataNoStim)
)

# Joy: features vs. target (the target is kept as a one-column dataframe)
xJoy = joyCopia.drop(columns="Emotion")
yJoy = joyCopia.loc[:, ["Emotion"]]

# shuffle=False keeps the row order: the leading 80% of the rows become the
# training set and the trailing 20% the test set.
X_trainJoy, X_testJoy, y_trainJoy, y_testJoy = train_test_split(
    xJoy,
    yJoy,
    test_size=0.20,
    random_state=42,
    shuffle=False,
)

In [None]:
# Angry: separate the target column from the PCA features
xAngry = angerCopia.drop(columns="Emotion")
yAngry = angerCopia.loc[:, ["Emotion"]]

# Ordered 80/20 split (rows are not shuffled)
X_trainAngry, X_testAngry, y_trainAngry, y_testAngry = train_test_split(
    xAngry, yAngry, test_size=0.20, random_state=42, shuffle=False
)

In [None]:
# Neutral: separate the target column from the PCA features
xNeutral = neutralCopia.drop(columns="Emotion")
yNeutral = neutralCopia.loc[:, ["Emotion"]]

# Ordered 80/20 split (rows are not shuffled)
X_trainNeutral, X_testNeutral, y_trainNeutral, y_testNeutral = train_test_split(
    xNeutral, yNeutral, test_size=0.20, random_state=42, shuffle=False
)

In [None]:
# No stimulus ("nada"): separate the target column from the PCA features
xNada = nadaCopia.drop(columns="Emotion")
yNada = nadaCopia.loc[:, ["Emotion"]]

# Ordered 80/20 split (rows are not shuffled)
X_trainNada, X_testNada, y_trainNada, y_testNada = train_test_split(
    xNada, yNada, test_size=0.20, random_state=42, shuffle=False
)

In [None]:
# Four-class problem: stack the per-emotion splits (joy, angry, neutral,
# no stimulus) into the final training and test sets.
x_train = pd.concat([X_trainJoy, X_trainAngry, X_trainNeutral, X_trainNada])
y_train = pd.concat([y_trainJoy, y_trainAngry, y_trainNeutral, y_trainNada])
x_test = pd.concat([X_testJoy, X_testAngry, X_testNeutral, X_testNada])
y_test = pd.concat([y_testJoy, y_testAngry, y_testNeutral, y_testNada])

Training with the random forest algorithm

In [None]:
# A fixed random_state makes the forest reproducible from run to run;
# n_jobs=-1 builds the trees on all available cores.
rnd_clf = RandomForestClassifier(random_state=42, n_jobs=-1)

# y_train is a one-column dataframe; pass the column itself so scikit-learn
# receives the 1-D target it expects instead of warning about a column vector.
rnd_clf.fit(x_train, y_train["Emotion"])

In [None]:
# Predicted emotion label for every row of the test set
prediccion = rnd_clf.predict(X=x_test)

In [None]:
# Per-class precision, recall and F1 on the test set
print(classification_report(y_true=y_test, y_pred=prediccion))

Hyperparameter search is very expensive with this kind of data: a single random forest training run takes around 7 minutes, so a grid search over 9 different parameters would take months, even when parallelized.

This is followed by training with the XGBoost algorithm.

In [None]:

# Wrap the train and test sets in XGBoost's DMatrix container
train_mat = xgb.DMatrix(data=x_train, label=y_train)
test_mat = xgb.DMatrix(data=x_test, label=y_test)

In [None]:
# Booster configuration for the four-class problem
params = {
    "booster": "gbtree",
    "max_depth": 2,
    "eta": 0.3,
    "objective": "multi:softmax",
    "nthread": 2,
    "num_class": 4,
}

# Number of boosting rounds
rounds = 10

In [None]:
# Datasets whose metric is printed after every boosting round. The list is
# named `evals` (not `eval`) so that the Python builtin eval() is not shadowed
# for the rest of the session.
evals = [(test_mat, "eval"), (train_mat, "train")]

# Train the booster. `num_boost_round` and `evals` are passed by keyword;
# passing `evals` positionally is deprecated in recent XGBoost releases.
modelo = xgb.train(params, train_mat, num_boost_round=rounds, evals=evals)

In [None]:
# Predicted class for every row of the test matrix
prediccion = modelo.predict(data=test_mat)

In [None]:
# Per-class precision, recall and F1 of the XGBoost model on the test set
print(classification_report(y_true=y_test, y_pred=prediccion))

Next, the training of the kNN model is performed.

In [None]:


# Candidate values for the hyperparameter k (number of neighbours)
param_grid = {'n_neighbors': [5, 7, 9, 11, 13]}

knn = KNeighborsClassifier()

# 10-fold cross-validated grid search over k, scored by accuracy
grid_search = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')

# Run the hyperparameter search. y_train is a one-column dataframe, so the
# column itself is passed to give scikit-learn the 1-D target it expects.
grid_search.fit(x_train, y_train["Emotion"])

# Print the best value of k found
print(grid_search.best_params_)

In [None]:

# Build the final classifier from the k selected by the grid search instead of
# a hard-coded value, so the model always matches the search result.
knn = KNeighborsClassifier(**grid_search.best_params_)

# Fit on the 1-D target column (y_train itself is a one-column dataframe).
knn.fit(x_train, y_train["Emotion"])

In [None]:
# Predicted emotion label for every row of the test set
prediccion = knn.predict(X=x_test)

In [None]:
# Per-class precision, recall and F1 of the kNN model on the test set
print(classification_report(y_true=y_test, y_pred=prediccion))

Next, we train the ternary models on the classes Angry, Neutral and No-Stimuli.

In [None]:
# Three-class problem: angry, neutral and no stimulus (joy is left out)

x_train = pd.concat([X_trainAngry, X_trainNeutral, X_trainNada])
y_train = pd.concat([y_trainAngry, y_trainNeutral, y_trainNada])
x_test = pd.concat([X_testAngry, X_testNeutral, X_testNada])
y_test = pd.concat([y_testAngry, y_testNeutral, y_testNada])

In [None]:
# Random forest model

# A fixed random_state makes the forest reproducible from run to run;
# n_jobs=-1 builds the trees on all available cores.
rnd_clf = RandomForestClassifier(random_state=42, n_jobs=-1)

# y_train is a one-column dataframe; pass the column itself so scikit-learn
# receives the 1-D target it expects instead of warning about a column vector.
rnd_clf.fit(x_train, y_train["Emotion"])

prediccion = rnd_clf.predict(x_test)

print(classification_report(y_test, prediccion))

In [None]:
# XGBoost model

train_mat = xgb.DMatrix(x_train, label=y_train)
test_mat = xgb.DMatrix(x_test, label=y_test)

# Three classes here: the labels present are 0 (no stimulus), 1 (angry) and
# 2 (neutral).
params = {"booster":"gbtree", "max_depth": 2, "eta": 0.3, "objective": "multi:softmax", "nthread":2, 'num_class':3}
rounds = 10

# Datasets whose metric is printed after every boosting round. The list is
# named `evals` (not `eval`) so this cell does not shadow the builtin eval().
evals = [(test_mat, "eval"), (train_mat, "train")]

# `num_boost_round` and `evals` are passed by keyword; passing `evals`
# positionally is deprecated in recent XGBoost releases.
model = xgb.train(params, train_mat, num_boost_round=rounds, evals=evals)

prediction = model.predict(test_mat)

print(classification_report(y_test, prediction))

In [None]:
# kNN model

knn = KNeighborsClassifier(n_neighbors=9)

# y_train is a one-column dataframe; pass the column itself so scikit-learn
# receives the 1-D target it expects instead of warning about a column vector.
knn.fit(x_train, y_train["Emotion"])

prediction = knn.predict(x_test)

print(classification_report(y_test, prediction))

Binary models (Angry vs. No-Stimuli):

In [None]:
# Two-class problem: angry vs. no stimulus
x_train = pd.concat([X_trainAngry, X_trainNada])
y_train = pd.concat([y_trainAngry, y_trainNada])
x_test = pd.concat([X_testAngry, X_testNada])
y_test = pd.concat([y_testAngry, y_testNada])

In [None]:
# Define the hyperparameter space to search over
param_dist = {"n_estimators": [10, 100, 1000],
              "max_depth": [3, 5, 10],
              "min_samples_split": [2, 3, 10]}

# Create a random forest classifier; the fixed seed makes every candidate
# forest reproducible.
rfc = RandomForestClassifier(random_state=42)

# RandomizedSearchCV only evaluates a random sample of the combinations above,
# so it is seeded as well: otherwise a different subset is tried on each run
# and the "best" hyperparameters can change between executions.
search = RandomizedSearchCV(rfc, param_dist, n_jobs=4, random_state=42)

# Fit the model on the training data. y_train is a one-column dataframe, so
# the column itself is passed to give scikit-learn a 1-D target.
search.fit(x_train, y_train["Emotion"])

# Print the best hyperparameters found
print("Best hyperparameters:", search.best_params_)

In [None]:
# Random forest model

# NOTE(review): n_estimators=20 is not one of the values explored by the
# randomized search ([10, 100, 1000]) — confirm where this setting comes from.
# A fixed random_state makes the forest reproducible from run to run.
rnd_clf = RandomForestClassifier(n_estimators=20, max_depth=10, random_state=42)

# y_train is a one-column dataframe; pass the column itself so scikit-learn
# receives the 1-D target it expects instead of warning about a column vector.
rnd_clf.fit(x_train, y_train["Emotion"])

prediccion = rnd_clf.predict(x_test)

print(classification_report(y_test, prediccion))

In [None]:
# XGBoost model (scikit-learn style wrapper, binary logistic objective)

# Build the classifier
xgb_cl = xgb.XGBClassifier(objective="binary:logistic")

# Train on the angry / no-stimulus training set
xgb_cl.fit(X=x_train, y=y_train)

# Classify the test rows and report per-class metrics
prediction = xgb_cl.predict(X=x_test)

print(classification_report(y_true=y_test, y_pred=prediction))

In [None]:
# kNN model

knn = KNeighborsClassifier(n_neighbors=9)

# y_train is a one-column dataframe; pass the column itself so scikit-learn
# receives the 1-D target it expects instead of warning about a column vector.
knn.fit(x_train, y_train["Emotion"])

prediction = knn.predict(x_test)

print(classification_report(y_test, prediction))