# Import data

In [1]:
import numpy as np
import pandas as pd
import pickle
from matplotlib import pyplot as plt

# check the version of these modules
print(np.__version__)
print(pickle.format_version)


1.23.4
4.0


In [2]:
# load DE features named '1_123.npz'
data_npz = np.load('../data/1_123.npz')
print(data_npz.files)

['data', 'label']


In [3]:
# get data and label
# ** both 'data' and 'label' are pickled dict **

data = pickle.loads(data_npz['data'])
label = pickle.loads(data_npz['label'])

label_dict = {0: 'Disgust', 1: 'Fear', 2: 'Sad', 3: 'Neutral', 4: 'Happy'}

print(data.keys())
print(label.keys())


dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44])
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44])


In [4]:
# load DE features named '1_123.npz'
data_npz_16 = np.load('../data/16_123.npz')
print(data_npz_16.files)

# get data and label
# ** both 'data' and 'label' are pickled dict **

data_16 = pickle.loads(data_npz_16['data'])
label_16 = pickle.loads(data_npz_16['label'])

label_dict = {0: 'Disgust', 1: 'Fear', 2: 'Sad', 3: 'Neutral', 4: 'Happy'}

print(data_16.keys())
print(label_16.keys())

['data', 'label']
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44])
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44])


## Merging all 16 files into a single dataframe

In [5]:
def splitdata(data, label, ntrainbatch):
    nbatch = ntrainbatch - 1
    trainframes = []
    testframes = []
    for i in range(45):
        if i % 15 - 1 < nbatch:
            trainframes.append(
                pd.concat([pd.DataFrame(data[i]), pd.DataFrame(label[i], columns=['emotion_label'])], axis=1))
        if i % 15 > nbatch:
            testframes.append(
                pd.concat([pd.DataFrame(data[i]), pd.DataFrame(label[i], columns=['emotion_label'])], axis=1))
    train = pd.concat(trainframes)
    test = pd.concat(testframes)
    return train, test


def fulldf(nsubjects=16):
    data16 = {}
    label16 = {}
    Xytrain16_list = []
    Xytest16_list = []
    for i in range(1, nsubjects+1):
        data16[i] = pickle.loads(np.load(f'../data/{i}_123.npz')['data'])
        label16[i] = pickle.loads(np.load(f'../data/{i}_123.npz')['label'])
    for i in range(1, 17):
        X, y = splitdata(data16[i], label16[i], 10)
        Xytrain16_list.append(X)
        Xytest16_list.append(y)
    Xytrain16_DF = pd.concat(Xytrain16_list, keys=[
                             f'data{i}' for i in range(1, 17)])
    Xytest16_DF = pd.concat(Xytest16_list, keys=[
                            f'data{i}' for i in range(1, 17)])
    return Xytrain16_DF, Xytest16_DF


Xytrain16_DF, Xytest16_DF = fulldf()

In [6]:
Xytrain16_DF


Unnamed: 0,Unnamed: 1,0,1,2,3,4,5,6,7,8,9,...,301,302,303,304,305,306,307,308,309,emotion_label
data1,0,11.082522,8.915990,7.894088,8.393629,8.576055,10.450283,8.682803,7.425018,6.833982,6.431147,...,7.484101,6.889748,5.478040,4.247615,8.838729,7.736448,7.071955,5.657081,4.361506,4.0
data1,1,11.081816,8.915104,7.893646,8.393852,8.575908,10.449770,8.681773,7.424501,6.833948,6.430860,...,7.483864,6.889452,5.477954,4.247919,8.838477,7.736124,7.071776,5.656987,4.361624,4.0
data1,2,11.081139,8.914245,7.893116,8.394334,8.575671,10.449391,8.680753,7.424007,6.834211,6.430407,...,7.483311,6.888881,5.477563,4.248155,8.838380,7.735444,7.071462,5.656746,4.361780,4.0
data1,3,11.080857,8.913749,7.893061,8.394893,8.575294,10.449425,8.680200,7.424111,6.834612,6.429797,...,7.482540,6.887955,5.476832,4.248294,8.838502,7.734421,7.070929,5.656415,4.362126,4.0
data1,4,11.081297,8.913816,7.893386,8.395509,8.575009,10.449813,8.680219,7.424706,6.835073,6.429158,...,7.481893,6.886515,5.475891,4.248291,8.839146,7.733523,7.069984,5.656045,4.362404,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
data16,66,10.417180,8.864372,7.691342,9.950288,9.345040,10.578348,9.099482,8.039421,10.086082,9.358139,...,7.946895,6.709683,5.413670,3.931630,9.183664,7.950299,6.711681,5.416452,3.940586,1.0
data16,67,10.414650,8.863123,7.689675,9.951519,9.345441,10.575614,9.097878,8.037844,10.086540,9.358437,...,7.947575,6.709925,5.413248,3.930947,9.183818,7.950983,6.711942,5.416054,3.939858,1.0
data16,68,10.412856,8.862420,7.688383,9.952600,9.345814,10.573764,9.096836,8.036566,10.086782,9.358630,...,7.948361,6.710299,5.413007,3.930544,9.183925,7.951765,6.712333,5.415831,3.939416,1.0
data16,69,10.411692,8.861999,7.687442,9.953487,9.346075,10.572644,9.096131,8.035535,10.086850,9.358650,...,7.949014,6.710540,5.412962,3.930273,9.184343,7.952411,6.712577,5.415793,3.939127,1.0


In [10]:
Xytrain16_DF[0]


data1   0     11.082522
        1     11.081816
        2     11.081139
        3     11.080857
        4     11.081297
                ...    
data16  66    10.417180
        67    10.414650
        68    10.412856
        69    10.411692
        70    10.410973
Name: 0, Length: 19184, dtype: float64

#### check missing data

In [7]:
has_missing_values = Xytrain16_DF.isnull().values.any()
if has_missing_values:
    print("The data has missing values.")
else:
    print("The data does not have missing values.")

# for i in range(45):
has_missing_values = Xytest16_DF.isnull().values.any()
if has_missing_values:
    print("The data has missing values.")
else:
    print("The data does not have missing values.")

The data does not have missing values.
The data does not have missing values.


#### splitting data

In [8]:
# splitting test and train
X_test = Xytest16_DF.iloc[:, :-1].values
y_test = Xytest16_DF.iloc[:, -1].values
X_train = Xytrain16_DF.iloc[:, :-1].values
y_train = Xytrain16_DF.iloc[:, -1].values

# Preprocessing data with two different scalers

In [None]:
# scaling of the dataset from Dataframe to Numpy
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler

scaler_S = StandardScaler()
X_train_standardized = scaler_S.fit_transform(X_train)
X_test_standardized = scaler_S.fit_transform(X_test)

scaler_R = RobustScaler()
X_train_robusted = scaler_R.fit_transform(X_train)
X_test_robusted = scaler_R.fit_transform(X_test)

scaler_M = MinMaxScaler()
X_train_minmaxed = scaler_M.fit_transform(X_train)
X_test_minmaxed = scaler_M.fit_transform(X_test)

# Training Data

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

Optional params (pc will take a while to compute)

param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10, 100]
}

#### with Robust Scaler

In [None]:
# Define the parameter grid
param_grid = {
    'kernel': ['rbf'],
    'C': [10]
}

# Create the SVC classifier
svc = SVC()

# Create the GridSearchCV object
grid_search = GridSearchCV(svc, param_grid, cv=5)

# Fit the GridSearchCV object on the training data
grid_search.fit(X_train_robusted, y_train)

# Get the best hyperparameters and the corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test_robusted)

# Evaluate the accuracy of the best model
accuracy = accuracy_score(y_test, y_pred)
print("Best Accuracy:", accuracy)
print("Best Hyperparameters:", best_params)

# Best Accuracy: 0.6042668269230769
# Best Hyperparameters: {'C': 10, 'kernel': 'rbf'}

#### with MinMax Scaler

In [None]:
# Define the parameter grid
param_grid = {
    'kernel': ['rbf'],
    'C': [10]
}

# Create the SVC classifier
svc = SVC()

# Create the GridSearchCV object
grid_search = GridSearchCV(svc, param_grid, cv=5)

# Fit the GridSearchCV object on the training data
grid_search.fit(X_train_minmaxed, y_train)

# Get the best hyperparameters and the corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test_minmaxed)

# Evaluate the accuracy of the best model
accuracy = accuracy_score(y_test, y_pred)
print("Best Accuracy:", accuracy)
print("Best Hyperparameters:", best_params)

# Best Accuracy: 0.6376201923076923
# Best Hyperparameters: {'C': 10, 'kernel': 'rbf'}