# Imports

In [1]:
import numpy as np
import pandas as pd
import pickle
from matplotlib import pyplot as plt
import seaborn as sns 
import random

# check the version of these modules
print(np.__version__)
print(pickle.format_version)

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

from tensorflow.keras.preprocessing.sequence import pad_sequences


1.23.4
4.0


2023-06-12 15:31:41.990591: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Loading data 

In [2]:
# load DE features named '1_123.npz'
data_npz = np.load('../data/1_123.npz')
print(data_npz.files)

['data', 'label']


In [3]:
# get data and label
# ** both 'data' and 'label' are pickled dict **

data = pickle.loads(data_npz['data'])
label = pickle.loads(data_npz['label'])

label_dict = {0:'Disgust', 1:'Fear', 2:'Sad', 3:'Neutral', 4:'Happy'}

print(data.keys())
print(label.keys())


np.loads()


dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44])
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44])


AttributeError: module 'numpy' has no attribute 'loads'

In [None]:
data[0]

In [None]:
# As we can see, there are 45 keys in both 'data' and 'label'.
# Each participant took part in our experiments for 3 sessions, and he/she watched 15 movie clips (i.e. 15 trials) during each session.
# Therefore, we could extract 3 * 15 = 45 DE feature matrices.

# The key indexes [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] belong to Session 1.
# The key indexes [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29] belong to Session 2.
# The key indexes [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44] belong to Session 3.

# We will print the emotion labels for each trial.
label_dict = {0:'Disgust', 1:'Fear', 2:'Sad', 3:'Neutral', 4:'Happy'}
for i in range(45):
    print('Session {} -- Trial {} -- EmotionLabel : {}'.format(i//15+1, i%15+1, label_dict[label[i][0]]))

# Transforming dataset 

## Transform dataset to 2D Data

<u>Here we transform the data in a train and test split, mainly for our baseline model SVM:</u>

In [None]:
#Split Data Function
def splitdata(data, label, ntrainbatch):
    nbatch=ntrainbatch-1
    trainframes=[]
    testframes =[]
    for i in range(45):
        if i%15-1<nbatch:
            trainframes.append(pd.concat([pd.DataFrame(data[i]), pd.DataFrame(label[i])], axis=1))
        if i%15>nbatch:
            testframes.append(pd.concat([pd.DataFrame(data[i]), pd.DataFrame(label[i])], axis=1))
    train = pd.concat(trainframes)
    test  = pd.concat(testframes)
    return train, test


In [None]:
#fulldfsplit() function for Train-Test Split with Full DF
def fulldfsplit(nsubjects=16):
    '''Files must be labelled as {subject#}_123.npz’ and should be inside a Data folder within the Project'''
    data16  = {}
    label16 = {}
    Xytrain16_list = []
    Xytest16_list  = []
    for i in range(1,nsubjects+1):
        # Load all 16 files data into a Dict named ‘i_123.npz’ using a for loop
        data16[i]  = pickle.loads(np.load(f'../data/{i}_123.npz')['data'])
        label16[i] = pickle.loads(np.load(f'../data/{i}_123.npz')['label'])
    for i in range(1,nsubjects+1):
        #apply all data to the splitdata func to create lists of DFs
        train, test = splitdata(data16[i], label16[i], 10)
        Xytrain16_list.append(train)
        Xytest16_list.append(test)
    #create a unified DF from every list with pd.concat(trainframes)
    Xytrain16_DF = pd.concat(Xytrain16_list)
    Xytest16_DF  = pd.concat(Xytest16_list)
    return Xytrain16_DF, Xytest16_DF
Xtrain,Xtest = fulldfsplit()
Xtrain.head(50)

<u> Here we concatenate all the trials together as well as add the target into one whole dataframe! </u>

In [None]:
def gatherdata(X, y):
    Xyframes=[]
    for i in range(45):
        Xyframes.append(pd.concat([pd.DataFrame(data[i]), pd.DataFrame(label[i])], axis=1))
    XyDF = pd.concat(Xyframes)
    return XyDF

In [None]:
#New function to collect all Data across all 16 subjects without split
#Full DF no split
def fulldf(nsubjects=16):
    data16  = {}
    label16 = {}
    for i in range(1,nsubjects+1):
        # Load all 16 files data into a Dict named ‘i_123.npz’ using a for loop
        data16[i]  = pickle.loads(np.load(f'../data/{i}_123.npz')['data'])
        label16[i] = pickle.loads(np.load(f'../data/{i}_123.npz')['label'])
    Xy16_list = []
    for i in range(1,nsubjects+1):
        #apply all data to the gather data func to create lists of DFs
        Xy = gatherdata(data16[i], label16[i])
        Xy16_list.append(Xy)
    XyDF = pd.concat(Xy16_list)
    XyDF.columns = [*XyDF.columns[:-1], 'target']
    return XyDF
Xytotal = fulldf()
Xytotal

In [None]:
Xytotal = Xytotal.drop('target', axis=1)

<u> Here we create an an numpy array, containing the input data to the RNN </u>

In [None]:
def rnn_df(nsubjects=16):
    data16  = [pickle.loads(np.load(f'../data/{i}_123.npz')['data']) for i in range(1,nsubjects+1)]
    return data16

# EDA 


In [None]:
# Rachels notebook EDA 06 June for heatmap 

# Preprocessing of data 

## Basemodel 

In [None]:
Xtrain.columns = [*Xtrain.columns[:-1], 'target']
Xtest.columns = [*Xtest.columns[:-1], 'target']

In [None]:
# splitting test and train
X_test = Xtest.iloc[:, :-1].values
y_test = Xtest.iloc[:, -1].values
X_train = Xtrain.iloc[:, :-1].values
y_train = Xtrain.iloc[:, -1].values

In [None]:
# scaling of the dataset
from sklearn.preprocessing import MinMaxScaler

scaler_M = MinMaxScaler()

scaler_M.fit(X_train)
X_train_minmaxed = scaler_M.transform(X_train)

scaler_M.fit(X_test)
X_test_minmaxed = scaler_M.transform(X_test)

## RNN 

In [None]:
# scaling of the dataset for full: here we take the full take the full data set and fit it on a MinMax scaler 
from sklearn.decomposition import PCA

pca = PCA(n_components=0.9)
scaler_com = MinMaxScaler()
scaler_com.fit(Xytotal)
pca.fit(Xytotal)

In [None]:
# here, we load only the ndarray of the experiments values, and transform each trial with the previously fitted scaler of the whole dataset 
ddd = rnn_df()
lst = []
for i in range(16): 
    each_participant = list(ddd[i].values())
    for j in each_participant: 
        sequence_scaled = scaler_com.transform(j)
        pca_sequence = pca.transform(sequence_scaled)
        lst.append(pca_sequence)
# afterwards, we pad the whole dataset, so we have a uniformous input to our dataset - therefore, we can use the RNN 
X_pad = pad_sequences(lst, value=-42069, padding="post", dtype='float32') # int32 by default
X_pad.shape

X_pad[0]


In [None]:
# y value needs to be one hot encoded for the RNN: therefore, we first take the single value of each trial to a an ndarray and reshape it 
yunique = []
nsubjects=16
for i in range(1,nsubjects+1):
    y=pickle.loads(np.load(f'../data/{i}_123.npz')['label'])
    for e in range(45):
        yunique.append(int(np.unique(y[e])))
y = np.array(yunique)
y

y_re = y.reshape(-1, 1)
y_re

In [None]:
# afterwards we OHE the target 
from sklearn.preprocessing import OneHotEncoder
# Instantiate the OneHotEncoder
ohe = OneHotEncoder(sparse = False) 
# Fit encoder
ohe.fit(y_re) 
y_OHE = ohe.transform(y_re)


## ARCHIVE: PCA - we do not use PCA, but maybe for future reference 

In [None]:
from sklearn.decomposition import PCA

#pca = PCA(n_components=0.9)
#pca.fit(X_full_scaled)

In [None]:
W = pca.components_

# Print PCs as COLUMNS
W = pd.DataFrame(W.T,
                 columns=[f'PC{i}' for i in range(1, 12 )])

W

In [None]:
pca.explained_variance_ratio_


In [None]:
pca.explained_variance_ratio_

plt.plot(pca.explained_variance_ratio_)
plt.xlabel('Principal Component'); plt.ylabel('% explained variance');

In [None]:
# check info about dataset 

# check class distribution of emotion in large data set 

# create feature analysis: checking for outliers, distribution of numerical features using histograms or boxplots, 

# correlation analysis between the independent variables 

# create visualisations to gain further insight 

# Modelling 

## Baseline Score

In [None]:
# as the dataset is balanced and each emotion category is equally shown 
unique_y = len(np.unique(y_train))
baseline_score = 1 / unique_y 
baseline_score

## Baseline Model: SVM 

In [None]:
# Define the parameter grid
param_grid = {
    'kernel': ['rbf'],
    'C': [10]
}

# Create the SVC classifier
svc = SVC()

# Create the GridSearchCV object
grid_search = GridSearchCV(svc, param_grid, cv=5)

# Fit the GridSearchCV object on the training data
grid_search.fit(X_train_minmaxed, y_train)

# Get the best hyperparameters and the corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test_minmaxed)

# Evaluate the accuracy of the best model
accuracy = accuracy_score(y_test, y_pred)
print("Best Accuracy:", accuracy)
print("Best Hyperparameters:", best_params)

# Best Accuracy: 0.6376201923076923
# Best Hyperparameters: {'C': 10, 'kernel': 'rbf'}

## Random Forest Model (if we still have time) 

## RNN 

In [None]:
# input X and y for the RNN, plus input shape 
input_shape = X_pad.shape[1:]
X_rnn = X_pad
y_rnn = y_OHE 


random_list = list(range(720))
train = []
validation = []
test = []
for i in range(720): 
    val = random.choice(random_list) #520 
    random_list = [x for x in random_list if x != val] # 719 
    if len(random_list) > 216: 
        train.append(X_rnn[val])
    if len (random_list) < 216 and len (random_list) > 36: 
        validation.append(X_rnn[val]) 
    else: 
        test.append(X_rnn[val])

np.array(test).shape

In [None]:
len(random_list)
random_list

## RNN Val_accuracy of up to 81% after applying PCA  

In [None]:
#1. Model Architecture
model = Sequential()
model.add(layers.Masking(mask_value=-42069.,input_shape=input_shape))

model.add(layers.LSTM(units=20, activation='tanh', return_sequences=True))
model.add(layers.LSTM(units=20, activation='tanh', return_sequences=True))
#model.add(layers.LSTM(units=10, activation='tanh', return_sequences=True))

model.add(layers.LSTM(units=20, activation='tanh', return_sequences=False))

#model.add(layers.Dense(20, activation='relu'))
#model.add(layers.Dense(20, activation='relu'))
model.add(layers.Dense(5, activation='softmax'))

optimizer = AdamW(learning_rate=0.005, 
                  #weight_decay=0.1, 
                  beta_1=0.9)

# 2. Model Compilation
model.compile(
    loss='categorical_crossentropy', 
    optimizer=optimizer,
    metrics=['accuracy'])
    
# –– Fit
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs = 200,         # Notice that we are not using any Early Stopping Criterion
    batch_size = 32, 
    verbose=2
)




# Data Storytelling

## How to make a nice presentation? 