In [33]:
import detection
from detection import SubjectData
import utils
from utils import train_subjects, test_subjects
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.metrics import roc_curve, auc, make_scorer
from imblearn.over_sampling import SMOTE, ADASYN

In [None]:
# Run this cell when you modify the detection or utils module.
# This will reload the modules so you don't have to restart the kernel.
import importlib
importlib.reload(detection)
importlib.reload(utils)

<module 'utils' from '/home/nikolas-spyropoulos/Documents/uni/8_semester/e-health/inria-bci-challenge/utils.py'>

In [2]:
RAND_STATE = 42

In [3]:
labels = pd.read_csv('data/TrainLabels.csv')
y = labels['Prediction'].to_numpy()
print(y.shape)

(5440,)


In [40]:
test_set = []
for i in range(len(test_subjects)):
    sd = SubjectData(test_subjects[i], train=False)
    test_set.append(sd.errp_features)

test_set = np.array(test_set)
test_set = test_set.reshape(test_set.shape[0]*test_set.shape[1], test_set.shape[2])
print(f'test_set: {test_set.shape}')

test_set: (3400, 80)


In [4]:
X = []
for i in range(len(train_subjects)):
    sd = SubjectData(train_subjects[i])
    X.append(sd.errp_features)

In [6]:
X = np.array(X)
X = X.reshape(X.shape[0]*X.shape[1], X.shape[2])
X.shape

(5440, 80)

ErrP features shape: (n_samples, n_features)

n_features: ch1_check, ch1_mean, ch1_peak, ch1_latency, ch1_amplitude, ch2_check, ch2_mean, ..., ch16_amplitude

Idea is to get the mean for each feature across all channels

Check will be 1 if most checks are 1 otherwise 0


In [41]:
def transform_errp_data(X, verbose=False):
    X_reshaped = X.reshape(X.shape[0], 16, 5)
    if verbose: print(f'X_reshaped: {X_reshaped.shape}')

    # Apply majority vote: 1 if sum > 8, else 0
    check = X_reshaped[:, :, 0]  # shape (n_samples, 16)
    majority_check = (check.sum(axis=1) > 8).astype(int)  # shape (n_samples,)

    mean_mean = X_reshaped[:, :, 1].mean(axis=1)
    mean_peak = X_reshaped[:, :, 2].mean(axis=1)
    mean_latency = X_reshaped[:, :, 3].mean(axis=1)
    mean_amplitude = X_reshaped[:, :, 4].mean(axis=1)

    final_X = np.stack([majority_check, mean_mean, mean_peak, mean_latency, mean_amplitude], axis=1)
    if verbose: print(f'final_X: {final_X.shape}')

    return final_X

In [None]:
final_X = transform_errp_data(X, verbose=True)

X_reshaped: (5440, 16, 5)
final_X: (5440, 5)


In [42]:
test_X = transform_errp_data(test_set, verbose=True)

X_reshaped: (3400, 16, 5)
final_X: (3400, 5)


In [18]:
X_train, X_val, y_train, y_val = train_test_split(final_X, y, test_size=0.3, random_state=RAND_STATE)
print(f'X_train: {X_train.shape}\nX_val: {X_val.shape}')

X_train: (3808, 5)
X_val: (1632, 5)


In [9]:
# Create custom scorer for GridSearchCV
# y_scores is the predicted probabilities
def auc_scorer(y_true, y_scores):
    fpr, tpr, _ = roc_curve(y_true, y_scores)
    return auc(fpr, tpr)
auc_score = make_scorer(auc_scorer, needs_proba=True)



In [10]:
mlp_grid = {
    'hidden_layer_sizes': [(10,), (20,), (50,), (100,), (200,)],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate_init': [0.001, 0.01]
}

In [None]:
gscv = GridSearchCV(estimator=MLPClassifier(random_state=RAND_STATE), 
                    param_grid=mlp_grid, 
                    cv=5, 
                    scoring=auc_score,
                    n_jobs=-1, 
                    verbose=2)

model = gscv.fit(X_train, y_train)
print(f'Best parameters: {model.best_params_}')
print(f'Best score: {model.best_score_}')

# Predict probabilities on the test set (for the positive class)
y_proba = model.predict_proba(X_val)[:, 1]

# Compute FPR, TPR
fpr, tpr, thresholds = roc_curve(y_val, y_proba)

# Compute AUC
final_auc = auc(fpr, tpr)

print("Final AUC score on val set:", final_auc)

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV] END alpha=0.0001, hidden_layer_sizes=(10,), learning_rate_init=0.01; total time=   0.4s
[CV] END alpha=0.0001, hidden_layer_sizes=(10,), learning_rate_init=0.01; total time=   0.6s
[CV] END alpha=0.0001, hidden_layer_sizes=(10,), learning_rate_init=0.01; total time=   0.3s
[CV] END alpha=0.0001, hidden_layer_sizes=(10,), learning_rate_init=0.01; total time=   0.5s
[CV] END alpha=0.0001, hidden_layer_sizes=(10,), learning_rate_init=0.01; total time=   0.4s
[CV] END alpha=0.0001, hidden_layer_sizes=(10,), learning_rate_init=0.001; total time=   1.5s
[CV] END alpha=0.0001, hidden_layer_sizes=(10,), learning_rate_init=0.001; total time=   1.6s
[CV] END alpha=0.0001, hidden_layer_sizes=(10,), learning_rate_init=0.001; total time=   1.4s
[CV] END alpha=0.0001, hidden_layer_sizes=(10,), learning_rate_init=0.001; total time=   1.5s
[CV] END alpha=0.0001, hidden_layer_sizes=(10,), learning_rate_init=0.001; total time=   1.7s
[CV

In [14]:
layer_search_grid = {
    'hidden_layer_sizes': [(200,), (200, 50), (300,), (300, 25)],
    'alpha': [0.0001, 0.00005],
    'learning_rate_init': [0.001, 0.0005],
    'activation': ['relu', 'tanh'],
    'solver': ['adam', 'sgd'],
}
gscv = GridSearchCV(estimator=MLPClassifier(random_state=RAND_STATE),
                    param_grid=layer_search_grid,
                    cv=5,
                    scoring=auc_score,
                    n_jobs=-1,
                    verbose=1)
model = gscv.fit(X_train, y_train)
print(f'Best parameters: {model.best_params_}')
print(f'Best score: {model.best_score_}')
# Predict probabilities on the test set (for the positive class)
y_proba = model.predict_proba(X_val)[:, 1]
# Compute FPR, TPR
fpr, tpr, thresholds = roc_curve(y_val, y_proba)
# Compute AUC
final_auc = auc(fpr, tpr)
print("Final AUC score on val set:", final_auc)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
Best parameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (300, 25), 'learning_rate_init': 0.0005, 'solver': 'adam'}
Best score: 0.5736804944346171
Final AUC score on val set: 0.5131454963644457


In [17]:
smote = SMOTE(random_state=RAND_STATE)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
print(f'X_train_smote: {X_train_smote.shape}')

adasyn = ADASYN(random_state=RAND_STATE)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)
print(f'X_train_adasyn: {X_train_adasyn.shape}')

X_train_smote: (5356, 5)
X_train_adasyn: (5490, 5)


In [None]:
oversample_grid = {
    'hidden_layer_sizes': [(200,), (300, 25), (300, 50)],
    'alpha': [0.0001, 0.0002],
    'learning_rate_init': [0.0005, 0.0007],
    'activation': ['tanh'],
    'solver': ['adam']
}
gscv = GridSearchCV(estimator=MLPClassifier(random_state=RAND_STATE),
                    param_grid=oversample_grid,
                    cv=5,
                    scoring=auc_score,
                    n_jobs=-1,
                    verbose=1)
model = gscv.fit(X_train_smote, y_train_smote)
print('SMOTE')
print(f'| Best parameters: {model.best_params_}')
print(f'| Best score: {model.best_score_}')
# Predict probabilities on the test set (for the positive class)
y_proba = model.predict_proba(X_val)[:, 1]
# Compute FPR, TPR
fpr, tpr, thresholds = roc_curve(y_val, y_proba)
# Compute AUC
final_auc = auc(fpr, tpr)
print("| Final AUC score on val set:", final_auc)
print('-' * 30)

model_ada = gscv.fit(X_train_adasyn, y_train_adasyn)
print('ADASYN')
print(f'| Best parameters: {model_ada.best_params_}')
print(f'| Best score: {model_ada.best_score_}')
# Predict probabilities on the test set (for the positive class)
y_proba = model_ada.predict_proba(X_val)[:, 1]
# Compute FPR, TPR
fpr, tpr, thresholds = roc_curve(y_val, y_proba)
# Compute AUC
final_auc = auc(fpr, tpr)
print("| Final AUC score on val set:", final_auc)
print('-' * 30)
print('Done')

Fitting 5 folds for each of 12 candidates, totalling 60 fits
SMOTE
| Best parameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (300, 50), 'learning_rate_init': 0.0005, 'solver': 'adam'}
| Best score: 0.5925837522354749
| Final AUC score on val set: 0.524769995548301
------------------------------
Fitting 5 folds for each of 12 candidates, totalling 60 fits




ADASYN
| Best parameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (300, 50), 'learning_rate_init': 0.0007, 'solver': 'adam'}
| Best score: 0.5787866894464226
| Final AUC score on val set: 0.5400986793292774
------------------------------
Done


Current best model:
{
    'hidden_layer_sizes': (300,50),
    'alpha': 0.0001,
    'learning_rate_init': 0.0007,
    'solver: 'adam',
    'activation': 'tanh'
},
Oversampling with ADASYN

In [20]:
best_model = MLPClassifier(
    hidden_layer_sizes=(300, 50),
    alpha=0.0001,
    learning_rate_init=0.0007,
    activation='tanh',
    solver='adam',
)

In [43]:
best_model.fit(X_train_adasyn, y_train_adasyn)
pred = best_model.predict_proba(test_X)[:, 1]

In [45]:
print(f'pred: {pred.shape}')
print(pred)

pred: (3400,)
[0.38139176 0.60302282 0.53106223 ... 0.4828223  0.45095179 0.53559621]


In [46]:
# Construct IdFeedback column
ids = []
for i in range(len(test_subjects)):
    name = f'S{test_subjects[i]}'
    for session in range(1, 5):
        for fid in range(1, 61):
            ids.append(f'{name}_Sess{session:02}_FB{fid:03}')
    for fid in range(1, 101):
        ids.append(f'{name}_Sess05_FB{fid:03}')
ids = np.array(ids)
print(f'ids: {ids.shape}')
print(ids)

ids: (3400,)
['S01_Sess01_FB001' 'S01_Sess01_FB002' 'S01_Sess01_FB003' ...
 'S25_Sess05_FB098' 'S25_Sess05_FB099' 'S25_Sess05_FB100']


In [48]:
result = pd.DataFrame({
    'IdFeedback': ids,
    'Prediction': pred
})
result.to_csv('submissions/mlp_adasyn1.csv', sep=',', index=False)