In [3]:
# Imports
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from pyeeg import *
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
import tsfresh
import pandas as pd
from utils.split import split_features_and_labels

In [15]:
# Things associated with the GAMEEMO dataset
electrodes = 'AF3 AF4 F3 F4 F7 F8 FC5 FC6 O1 O2 P7 P8 T7 T8'.split(' ')
sample_rate = 128
bands = [0.5, 4, 7, 12, 30, 50]

In [5]:
# In summary, the resulting features are an array containing raw EEG data split into 5-second intervals. 
# (dimensionality of n subjects*n trials*slice_amt x n_electrodes x 128*5), which I'll call batch size.
# and the labels are an array containing valence and arousal for each 5-second interval.
# (dimensionality of n subjects*n trials*slice_amt x 2)

eeg_splitted_features = np.load('gameemo_features.npy')
eeg_splitted_labels = np.load('gameemo_labels.npy')
print(eeg_splitted_features.shape)

(112, 14, 38252)


In [6]:
eeg_splitted_features, eeg_splitted_labels = split_features_and_labels(eeg_splitted_features, eeg_splitted_labels, sample_rate, 5)

drop_locs = np.any(eeg_splitted_labels <0, axis=-1)
eeg_splitted_features = eeg_splitted_features[np.logical_not(drop_locs)]
eeg_splitted_labels = eeg_splitted_labels[np.logical_not(drop_locs)]

In [7]:
# Labels are originally scaled between 1 and 5. We transform them between 0 and 1.
scaled_labels = eeg_splitted_labels
#scaled_labels = MinMaxScaler().fit_transform(eeg_splitted_labels)
valence_labels = scaled_labels[:, 0]
arousal_labels = scaled_labels[:, 1]

In [8]:

# Extract power bands; we first flatten the first two dimensions to obtain a 2D array of voltage timeseries, we calculate band power for each electrode separately, then get the electrode dimension back. This gives us a dimensionality of (batch size x n electrodes x n bands)
reshaped_features = np.reshape(eeg_splitted_features, [eeg_splitted_features.shape[0]*eeg_splitted_features.shape[1], eeg_splitted_features.shape[2]])
extracted_powers = np.asarray([bin_power(X=feature, Band=bands,  Fs=sample_rate) for feature in reshaped_features])[:, 0, :]
extracted_powers = np.reshape(extracted_powers, [eeg_splitted_features.shape[0], eeg_splitted_features.shape[1], extracted_powers.shape[1]])
print(extracted_powers.shape)


(6549, 14, 5)


In [25]:

feature_names = np.asarray([[str(bands[x]) + '-' +str(bands[x+1]) + '|' + e for x in range(len(bands)-1)] for e in electrodes], dtype='object')
feature_names = np.ravel(feature_names)
print(feature_names)

['0.5-4|AF3' '4-7|AF3' '7-12|AF3' '12-30|AF3' '30-50|AF3' '0.5-4|AF4'
 '4-7|AF4' '7-12|AF4' '12-30|AF4' '30-50|AF4' '0.5-4|F3' '4-7|F3'
 '7-12|F3' '12-30|F3' '30-50|F3' '0.5-4|F4' '4-7|F4' '7-12|F4' '12-30|F4'
 '30-50|F4' '0.5-4|F7' '4-7|F7' '7-12|F7' '12-30|F7' '30-50|F7' '0.5-4|F8'
 '4-7|F8' '7-12|F8' '12-30|F8' '30-50|F8' '0.5-4|FC5' '4-7|FC5' '7-12|FC5'
 '12-30|FC5' '30-50|FC5' '0.5-4|FC6' '4-7|FC6' '7-12|FC6' '12-30|FC6'
 '30-50|FC6' '0.5-4|O1' '4-7|O1' '7-12|O1' '12-30|O1' '30-50|O1'
 '0.5-4|O2' '4-7|O2' '7-12|O2' '12-30|O2' '30-50|O2' '0.5-4|P7' '4-7|P7'
 '7-12|P7' '12-30|P7' '30-50|P7' '0.5-4|P8' '4-7|P8' '7-12|P8' '12-30|P8'
 '30-50|P8' '0.5-4|T7' '4-7|T7' '7-12|T7' '12-30|T7' '30-50|T7' '0.5-4|T8'
 '4-7|T8' '7-12|T8' '12-30|T8' '30-50|T8']


In [None]:
#extracted_entropy = np.asarray([dfa(feature) for feature in reshaped_features])
#print(extracted_entropy.shape)

In [10]:
# Then we flatten the last two dimensions, leaving some number of bands per electrode as the final feature for each data point.
final_features = np.reshape(extracted_powers, [extracted_powers.shape[0], extracted_powers.shape[1]*extracted_powers.shape[2]])

In [11]:
# We do k-fold cross validation, meaning that we hold out 10% of the data, train on the other 90%, and evaluate the model performance on the original 10%.
# We repeat this process k times. 
# As we can see, there is some overfitting as the train R^2 is much higher than the test R^2.
kf = KFold(n_splits=10, shuffle=True)
i = 0
importances =[]
all_predictions_arousal = np.zeros((final_features.shape[0], ))
for train_index, test_index in kf.split(final_features):
    x_train = final_features[train_index]
    y_train = arousal_labels[train_index]
    x_test = final_features[test_index]
    y_test = arousal_labels[test_index]

    xgb =  XGBRegressor()
    xgb.fit(x_train, y_train)
    

    print('Fold number ' + str(i))
    y_pred = xgb.predict(x_train)
  #  print('Train R^2: ' + str(r2_score(y_train, y_pred)))
    y_pred = xgb.predict(x_test)
  #  print('Test R^2: ' + str(r2_score(y_test, y_pred)))
    importances.append(xgb.feature_importances_)
    all_predictions_arousal[test_index] = y_pred
    i += 1


Fold number 0
Fold number 1
Fold number 2
Fold number 3
Fold number 4
Fold number 5
Fold number 6
Fold number 7
Fold number 8
Fold number 9


TypeError: r2_score() missing 1 required positional argument: 'y_pred'

In [12]:
print(r2_score(arousal_labels,all_predictions_arousal))

0.6362681425062975


In [28]:
df = pd.DataFrame()
df['feature_name'] = feature_names
df['mean_importances'] = np.sum(np.asarray(importances), axis=0)
df['electrode'] = [x.split('|')[1] for x in df['feature_name']]
df.groupby('electrode').mean().sort_values('mean_importances', ascending=False)
#df = df.sort_values('mean_importances', ascending=False)
#df


Unnamed: 0_level_0,mean_importances
electrode,Unnamed: 1_level_1
O2,0.202465
T7,0.193444
F7,0.178142
O1,0.161607
F8,0.161147
P7,0.156777
FC5,0.149632
FC6,0.140386
AF3,0.136241
T8,0.130562


# A plot of the label space, the valence and arousal.
# The darker the blue, the more total labels are stacked on top of that point.

plt.figure(figsize=(10, 10))
plt.scatter(scaled_labels[:, 0], scaled_labels[:, 1], alpha=0.005)
plt.xlabel('Valence')
plt.ylabel('Arousal')

plt.figure(figsize=(10, 10))

plt.scatter(arousal_labels, np.expand_dims(all_predictions_arousal,axis=-1), alpha=0.02)
plt.xlabel('True arousal')
plt.ylabel('Predicted arousal')