# Imports

In [21]:
import numpy as np
import pandas as pd
import pickle
from matplotlib import pyplot as plt
import seaborn as sns 

# check the version of these modules
print(np.__version__)
print(pickle.format_version)

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

from tensorflow.keras.preprocessing.sequence import pad_sequences


1.23.4
4.0


2023-06-08 09:59:26.925809: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Loading data 

In [2]:
# load DE features named '1_123.npz'
data_npz = np.load('../data/1_123.npz')
print(data_npz.files)

['data', 'label']


In [3]:
# get data and label
# ** both 'data' and 'label' are pickled dict **

data = pickle.loads(data_npz['data'])
label = pickle.loads(data_npz['label'])

label_dict = {0:'Disgust', 1:'Fear', 2:'Sad', 3:'Neutral', 4:'Happy'}

print(data.keys())
print(label.keys())

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44])
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44])


In [4]:
# As we can see, there are 45 keys in both 'data' and 'label'.
# Each participant took part in our experiments for 3 sessions, and he/she watched 15 movie clips (i.e. 15 trials) during each session.
# Therefore, we could extract 3 * 15 = 45 DE feature matrices.

# The key indexes [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] belong to Session 1.
# The key indexes [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29] belong to Session 2.
# The key indexes [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44] belong to Session 3.

# We will print the emotion labels for each trial.
label_dict = {0:'Disgust', 1:'Fear', 2:'Sad', 3:'Neutral', 4:'Happy'}
for i in range(45):
    print('Session {} -- Trial {} -- EmotionLabel : {}'.format(i//15+1, i%15+1, label_dict[label[i][0]]))

Session 1 -- Trial 1 -- EmotionLabel : Happy
Session 1 -- Trial 2 -- EmotionLabel : Fear
Session 1 -- Trial 3 -- EmotionLabel : Neutral
Session 1 -- Trial 4 -- EmotionLabel : Sad
Session 1 -- Trial 5 -- EmotionLabel : Disgust
Session 1 -- Trial 6 -- EmotionLabel : Happy
Session 1 -- Trial 7 -- EmotionLabel : Fear
Session 1 -- Trial 8 -- EmotionLabel : Neutral
Session 1 -- Trial 9 -- EmotionLabel : Sad
Session 1 -- Trial 10 -- EmotionLabel : Disgust
Session 1 -- Trial 11 -- EmotionLabel : Happy
Session 1 -- Trial 12 -- EmotionLabel : Fear
Session 1 -- Trial 13 -- EmotionLabel : Neutral
Session 1 -- Trial 14 -- EmotionLabel : Sad
Session 1 -- Trial 15 -- EmotionLabel : Disgust
Session 2 -- Trial 1 -- EmotionLabel : Sad
Session 2 -- Trial 2 -- EmotionLabel : Fear
Session 2 -- Trial 3 -- EmotionLabel : Neutral
Session 2 -- Trial 4 -- EmotionLabel : Disgust
Session 2 -- Trial 5 -- EmotionLabel : Happy
Session 2 -- Trial 6 -- EmotionLabel : Happy
Session 2 -- Trial 7 -- EmotionLabel : Disgus

# Transforming dataset 

## Transform dataset to 2D Data

<u>Here we transform the data in a train and test split, mainly for our baseline model SVM:</u>

In [5]:
#Split Data Function
def splitdata(data, label, ntrainbatch):
    nbatch=ntrainbatch-1
    trainframes=[]
    testframes =[]
    for i in range(45):
        if i%15-1<nbatch:
            trainframes.append(pd.concat([pd.DataFrame(data[i]), pd.DataFrame(label[i])], axis=1))
        if i%15>nbatch:
            testframes.append(pd.concat([pd.DataFrame(data[i]), pd.DataFrame(label[i])], axis=1))
    train = pd.concat(trainframes)
    test  = pd.concat(testframes)
    return train, test


In [6]:
#fulldfsplit() function for Train-Test Split with Full DF
def fulldfsplit(nsubjects=16):
    '''Files must be labelled as {subject#}_123.npz’ and should be inside a Data folder within the Project'''
    data16  = {}
    label16 = {}
    Xytrain16_list = []
    Xytest16_list  = []
    for i in range(1,nsubjects+1):
        # Load all 16 files data into a Dict named ‘i_123.npz’ using a for loop
        data16[i]  = pickle.loads(np.load(f'../data/{i}_123.npz')['data'])
        label16[i] = pickle.loads(np.load(f'../data/{i}_123.npz')['label'])
    for i in range(1,nsubjects+1):
        #apply all data to the splitdata func to create lists of DFs
        train, test = splitdata(data16[i], label16[i], 10)
        Xytrain16_list.append(train)
        Xytest16_list.append(test)
    #create a unified DF from every list with pd.concat(trainframes)
    Xytrain16_DF = pd.concat(Xytrain16_list)
    Xytest16_DF  = pd.concat(Xytest16_list)
    return Xytrain16_DF, Xytest16_DF
Xtrain,Xtest = fulldfsplit()
Xtrain.head(50)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,301,302,303,304,305,306,307,308,309,0.1
0,11.082522,8.91599,7.894088,8.393629,8.576055,10.450283,8.682803,7.425018,6.833982,6.431147,...,7.484101,6.889748,5.47804,4.247615,8.838729,7.736448,7.071955,5.657081,4.361506,4.0
1,11.081816,8.915104,7.893646,8.393852,8.575908,10.44977,8.681773,7.424501,6.833948,6.43086,...,7.483864,6.889452,5.477954,4.247919,8.838477,7.736124,7.071776,5.656987,4.361624,4.0
2,11.081139,8.914245,7.893116,8.394334,8.575671,10.449391,8.680753,7.424007,6.834211,6.430407,...,7.483311,6.888881,5.477563,4.248155,8.83838,7.735444,7.071462,5.656746,4.36178,4.0
3,11.080857,8.913749,7.893061,8.394893,8.575294,10.449425,8.6802,7.424111,6.834612,6.429797,...,7.48254,6.887955,5.476832,4.248294,8.838502,7.734421,7.070929,5.656415,4.362126,4.0
4,11.081297,8.913816,7.893386,8.395509,8.575009,10.449813,8.680219,7.424706,6.835073,6.429158,...,7.481893,6.886515,5.475891,4.248291,8.839146,7.733523,7.069984,5.656045,4.362404,4.0
5,11.082385,8.914133,7.893591,8.395986,8.574627,10.45064,8.680702,7.425287,6.835468,6.428375,...,7.481358,6.884889,5.474986,4.248107,8.840121,7.732843,7.068976,5.655815,4.362619,4.0
6,11.083825,8.914366,7.893565,8.39613,8.573922,10.451684,8.681255,7.42567,6.835498,6.427334,...,7.480719,6.883047,5.474118,4.247806,8.841397,7.73229,7.067766,5.655584,4.36283,4.0
7,11.085158,8.914192,7.892964,8.395734,8.572777,10.452605,8.681552,7.42542,6.834936,6.425751,...,7.480014,6.881331,5.473169,4.247491,8.842948,7.731709,7.066447,5.655036,4.363043,4.0
8,11.085906,8.91345,7.891825,8.394749,8.571212,10.452949,8.681341,7.42456,6.833846,6.423801,...,7.479337,6.879698,5.472189,4.247235,8.844387,7.731218,7.0651,5.654262,4.363255,4.0
9,11.086201,8.911913,7.890041,8.393327,8.569228,10.45264,8.680249,7.422915,6.832204,6.421476,...,7.478602,6.877927,5.471179,4.24717,8.845474,7.730684,7.063503,5.653381,4.363512,4.0


<u> Here we concatenate all the trials together as well as add the target into one whole dataframe! </u>

In [7]:
def gatherdata(X, y):
    Xyframes=[]
    for i in range(45):
        Xyframes.append(pd.concat([pd.DataFrame(data[i]), pd.DataFrame(label[i])], axis=1))
    XyDF = pd.concat(Xyframes)
    return XyDF

In [12]:
#New function to collect all Data across all 16 subjects without split
#Full DF no split
def fulldf(nsubjects=16):
    data16  = {}
    label16 = {}
    for i in range(1,nsubjects+1):
        # Load all 16 files data into a Dict named ‘i_123.npz’ using a for loop
        data16[i]  = pickle.loads(np.load(f'../data/{i}_123.npz')['data'])
        label16[i] = pickle.loads(np.load(f'../data/{i}_123.npz')['label'])
    Xy16_list = []
    for i in range(1,nsubjects+1):
        #apply all data to the gather data func to create lists of DFs
        Xy = gatherdata(data16[i], label16[i])
        Xy16_list.append(Xy)
    XyDF = pd.concat(Xy16_list)
    XyDF.columns = [*XyDF.columns[:-1], 'target']
    return XyDF
Xytotal = fulldf()
Xytotal

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,301,302,303,304,305,306,307,308,309,target
0,11.082522,8.915990,7.894088,8.393629,8.576055,10.450283,8.682803,7.425018,6.833982,6.431147,...,7.484101,6.889748,5.478040,4.247615,8.838729,7.736448,7.071955,5.657081,4.361506,4.0
1,11.081816,8.915104,7.893646,8.393852,8.575908,10.449770,8.681773,7.424501,6.833948,6.430860,...,7.483864,6.889452,5.477954,4.247919,8.838477,7.736124,7.071776,5.656987,4.361624,4.0
2,11.081139,8.914245,7.893116,8.394334,8.575671,10.449391,8.680753,7.424007,6.834211,6.430407,...,7.483311,6.888881,5.477563,4.248155,8.838380,7.735444,7.071462,5.656746,4.361780,4.0
3,11.080857,8.913749,7.893061,8.394893,8.575294,10.449425,8.680200,7.424111,6.834612,6.429797,...,7.482540,6.887955,5.476832,4.248294,8.838502,7.734421,7.070929,5.656415,4.362126,4.0
4,11.081297,8.913816,7.893386,8.395509,8.575009,10.449813,8.680219,7.424706,6.835073,6.429158,...,7.481893,6.886515,5.475891,4.248291,8.839146,7.733523,7.069984,5.656045,4.362404,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39,9.865708,8.181951,6.968030,7.457957,7.581654,9.842531,8.273131,6.920100,7.634970,8.088547,...,7.113831,6.602746,5.948172,5.670912,8.973436,7.394573,6.659146,5.869473,5.465647,0.0
40,9.873385,8.184243,6.968518,7.458047,7.581070,9.851169,8.275693,6.921697,7.637877,8.090925,...,7.114508,6.604563,5.949810,5.674434,8.973164,7.395748,6.659279,5.869647,5.467477,0.0
41,9.878899,8.185655,6.968883,7.458021,7.580267,9.857436,8.277476,6.923256,7.640097,8.092470,...,7.115406,6.605937,5.950904,5.676970,8.972556,7.396990,6.659370,5.869539,5.468438,0.0
42,9.882484,8.186410,6.969143,7.457973,7.579572,9.861518,8.278689,6.924505,7.641712,8.093218,...,7.116213,6.606916,5.951629,5.678651,8.971836,7.398021,6.659404,5.869355,5.468909,0.0


In [13]:
Xytotal = Xytotal.drop('target', axis=1)

<u> Here we create an an numpy array, containing the input data to the RNN </u>

In [14]:
def rnn_df(nsubjects=16):
    data16  = [pickle.loads(np.load(f'../data/{i}_123.npz')['data']) for i in range(1,nsubjects+1)]
    return data16

# EDA 


In [None]:
# Rachels notebook EDA 06 June for heatmap 

# Preprocessing of data 

## Basemodel 

In [15]:
Xtrain.columns = [*Xtrain.columns[:-1], 'target']
Xtest.columns = [*Xtest.columns[:-1], 'target']

In [16]:
# splitting test and train
X_test = Xtest.iloc[:, :-1].values
y_test = Xtest.iloc[:, -1].values
X_train = Xtrain.iloc[:, :-1].values
y_train = Xtrain.iloc[:, -1].values

In [17]:
# scaling of the dataset
from sklearn.preprocessing import MinMaxScaler

scaler_M = MinMaxScaler()

scaler_M.fit(X_train)
X_train_minmaxed = scaler_M.transform(X_train)

scaler_M.fit(X_test)
X_test_minmaxed = scaler_M.transform(X_test)

## RNN 

In [18]:
# scaling of the dataset for full: here we take the full take the full data set and fit it on a MinMax scaler 
scaler_com = MinMaxScaler()
scaler_com.fit(Xytotal)

In [22]:
# here, we load only the ndarray of the experiments values, and transform each trial with the previously fitted scaler of the whole dataset 
ddd = rnn_df()
lst = []
for i in range(16): 
    each_participant = list(ddd[i].values())
    for j in each_participant: 
        sequence_scaled = scaler_com.transform(j)
        lst.append(sequence_scaled)
# afterwards, we pad the whole dataset, so we have a uniformous input to our dataset - therefore, we can use the RNN 
X_pad = pad_sequences(lst, value=-42069, padding="post", dtype='float32') # int32 by default
X_pad


array([[[ 9.6168005e-01,  8.6337322e-01,  1.0000000e+00, ...,
          9.6979851e-01,  8.1723666e-01,  5.6124020e-01],
        [ 9.6134311e-01,  8.6279356e-01,  9.9968964e-01, ...,
          9.6964586e-01,  8.1715584e-01,  5.6128687e-01],
        [ 9.6101934e-01,  8.6223119e-01,  9.9931687e-01, ...,
          9.6937764e-01,  8.1694877e-01,  5.6134856e-01],
        ...,
        [-4.2069000e+04, -4.2069000e+04, -4.2069000e+04, ...,
         -4.2069000e+04, -4.2069000e+04, -4.2069000e+04],
        [-4.2069000e+04, -4.2069000e+04, -4.2069000e+04, ...,
         -4.2069000e+04, -4.2069000e+04, -4.2069000e+04],
        [-4.2069000e+04, -4.2069000e+04, -4.2069000e+04, ...,
         -4.2069000e+04, -4.2069000e+04, -4.2069000e+04]],

       [[ 7.0245230e-01,  7.4099392e-01,  7.7079725e-01, ...,
          6.0812610e-01,  5.5691975e-01,  4.2503238e-01],
        [ 7.0197886e-01,  7.4065709e-01,  7.7060425e-01, ...,
          6.0804862e-01,  5.5691218e-01,  4.2506051e-01],
        [ 7.0092267e-01, 

In [23]:
# y value needs to be one hot encoded for the RNN: therefore, we first take the single value of each trial to a an ndarray and reshape it 
yunique = []
nsubjects=16
for i in range(1,nsubjects+1):
    y=pickle.loads(np.load(f'../data/{i}_123.npz')['label'])
    for e in range(45):
        yunique.append(int(np.unique(y[e])))
y = np.array(yunique)
y

y_re = y.reshape(-1, 1)
y_re

array([[4],
       [1],
       [3],
       [2],
       [0],
       [4],
       [1],
       [3],
       [2],
       [0],
       [4],
       [1],
       [3],
       [2],
       [0],
       [2],
       [1],
       [3],
       [0],
       [4],
       [4],
       [0],
       [3],
       [2],
       [1],
       [3],
       [4],
       [1],
       [2],
       [0],
       [2],
       [1],
       [3],
       [0],
       [4],
       [4],
       [0],
       [3],
       [2],
       [1],
       [3],
       [4],
       [1],
       [2],
       [0],
       [4],
       [1],
       [3],
       [2],
       [0],
       [4],
       [1],
       [3],
       [2],
       [0],
       [4],
       [1],
       [3],
       [2],
       [0],
       [2],
       [1],
       [3],
       [0],
       [4],
       [4],
       [0],
       [3],
       [2],
       [1],
       [3],
       [4],
       [1],
       [2],
       [0],
       [2],
       [1],
       [3],
       [0],
       [4],
       [4],
       [0],
       [3],
    

In [24]:
# afterwards we OHE the target 
from sklearn.preprocessing import OneHotEncoder
# Instantiate the OneHotEncoder
ohe = OneHotEncoder(sparse = False) 
# Fit encoder
ohe.fit(y_re) 
y_OHE = ohe.transform(y_re)




## ARCHIVE: PCA - we do not use PCA, but maybe for future reference 

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=0.9)
pca.fit(X_full_scaled)

In [None]:
W = pca.components_

# Print PCs as COLUMNS
W = pd.DataFrame(W.T,
                 columns=[f'PC{i}' for i in range(1, 12 )])

W

In [None]:
pca.explained_variance_ratio_


In [None]:
pca.explained_variance_ratio_

plt.plot(pca.explained_variance_ratio_)
plt.xlabel('Principal Component'); plt.ylabel('% explained variance');

In [None]:
# check info about dataset 

# check class distribution of emotion in large data set 

# create feature analysis: checking for outliers, distribution of numerical features using histograms or boxplots, 

# correlation analysis between the independent variables 

# create visualisations to gain further insight 

# Modelling 

## Baseline Score

In [None]:
# as the dataset is balanced and each emotion category is equally shown 
unique_y = len(y_train.unique())
baseline_score = 1 / unique_y 
baseline_score

## Baseline Model: SVM 

In [None]:
# Define the parameter grid
param_grid = {
    'kernel': ['rbf'],
    'C': [10]
}

# Create the SVC classifier
svc = SVC()

# Create the GridSearchCV object
grid_search = GridSearchCV(svc, param_grid, cv=5)

# Fit the GridSearchCV object on the training data
grid_search.fit(X_train_minmaxed, y_train)

# Get the best hyperparameters and the corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test_minmaxed)

# Evaluate the accuracy of the best model
accuracy = accuracy_score(y_test, y_pred)
print("Best Accuracy:", accuracy)
print("Best Hyperparameters:", best_params)

# Best Accuracy: 0.6376201923076923
# Best Hyperparameters: {'C': 10, 'kernel': 'rbf'}

## Random Forest Model (if we still have time) 

## RNN 

In [None]:
# input X and y for the RNN, plus input shape 
input_shape = X_pad.shape[1:]
X_rnn = X_pad
y_rnn = y_OHE 

In [None]:
# architecture of the RNN 
from tensorflow.keras.layers import Masking
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import layers

#1. Model Architecture
model = models.Sequential()
model.add(layers.Masking(mask_value=-42069., input_shape=input_shape))
model.add(layers.SimpleRNN(units=2, activation='tanh'))
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(5, activation='softmax'))

# 2. Model Compilation
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])
    
# –– Fit
history = model.fit(
    X_rnn,
    y_rnn,
    epochs = 500,         # Notice that we are not using any Early Stopping Criterion
    batch_size = 32, 
    verbose=0
)

