In [47]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, LeaveOneOut
from sklearn.metrics import roc_auc_score
import numpy as np
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_curve, auc, make_scorer
from imblearn.over_sampling import SMOTE, ADASYN
import matplotlib.pyplot as plt
from detection import SubjectData
from utils import train_subjects, test_subjects
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

In [46]:
# Seed passed as `random_state` to the resamplers, the CV splitter and the
# classifiers below, so that they all draw from the same fixed seed.
RAND_STATE = 42

In [49]:
# Load the training labels; the 'Prediction' column is used as the target.
labels = pd.read_csv('./data/TrainLabels.csv')
# One flat array for all subjects. tune_subject_models slices it per subject
# with a running offset, so the row order must follow the subject order used there.
global_y = labels['Prediction'].to_numpy()
# Quick sanity check on how many labels were loaded.
print(global_y.shape)

(5440,)


In [50]:
def get_session_and_trial(verbose=False):
    """Build the (session, trial) index pair for every trial of one subject.

    Sessions 1-4 contribute trials 1..60 each and session 5 contributes
    trials 1..100, giving 340 rows in session-major order.

    Args:
        verbose: When true, print the shape of the generated array.

    Returns:
        Integer ndarray of shape (340, 2); column 0 is the session number and
        column 1 is the 1-based trial number within that session.
    """
    # Insertion order of the dict fixes the session-major row order.
    trials_per_session = {1: 60, 2: 60, 3: 60, 4: 60, 5: 100}
    pairs = [
        [session, trial]
        for session, n_trials in trials_per_session.items()
        for trial in range(1, n_trials + 1)
    ]
    features = np.array(pairs)
    if verbose:
        print(f'session_and_trial: {features.shape}')
    return features

def transform_data(X, verbose=False):
    """Prepend session and trial-number columns to a feature matrix.

    The (session, trial) table from ``get_session_and_trial`` is repeated once
    for every whole copy that fits in ``X`` and stacked to the left of the
    original features, so the result has two more columns than ``X``.

    Args:
        X: 2-D feature array whose row count is a positive multiple of the
            number of rows returned by ``get_session_and_trial``.
        verbose: When true, print the shapes of the intermediate arrays.

    Returns:
        ndarray with the same number of rows as ``X``; columns 0 and 1 hold
        session and trial, followed by the columns of ``X``.

    Raises:
        ValueError: If ``X`` has no rows or its row count is not a whole
            multiple of the session/trial table length.
    """
    extra_features = get_session_and_trial(verbose=verbose)
    block_rows = extra_features.shape[0]
    n_rows = X.shape[0]

    # Fail early with a clear message; otherwise a mismatch only surfaces as
    # an opaque indexing/shape error further down.
    if n_rows == 0 or n_rows % block_rows != 0:
        raise ValueError(
            f'X has {n_rows} rows; expected a positive multiple of {block_rows}'
        )

    # Repeat the table vertically, once per block of rows in X.
    extra_cols = np.tile(extra_features, (n_rows // block_rows, 1))
    if verbose:
        print(f'extra_cols: {extra_cols.shape}')
    return np.hstack((extra_cols, X))

In [51]:
def remove_high_correlation_columns(X, verbose=False):
    """Return a copy of ``X`` without six hard-coded column positions.

    Columns 8, 10, 16, 18, 24 and 26 (0-based, counted on the array as it is
    passed in) are dropped. The function name indicates they were chosen for
    being highly correlated with other features; that analysis is not part of
    this function.

    Args:
        X: 2-D array with at least 27 columns.
        verbose: When true, print the shape of the reduced array.

    Returns:
        New ndarray with the same rows as ``X`` and six fewer columns.
    """
    dropped = sorted([10, 8, 18, 16, 26, 24])
    reduced = np.delete(X, dropped, axis=1)
    if verbose:
        print(f'X after removing columns: {reduced.shape}')
    return reduced

In [53]:
def loocv_with_resampling(X, y, model):
    """Leave-one-out evaluation with class rebalancing inside every fold.

    For each sample, the remaining samples are rebalanced (RandomUnderSampler
    followed by SMOTE), a fresh copy of ``model`` is fitted on the rebalanced
    data, and that copy scores the held-out sample. Resampling happens inside
    the fold, so the held-out sample never takes part in it.

    Args:
        X: 2-D feature array, one row per sample.
        y: 1-D label array aligned with ``X``.
        model: Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
            It is used as a template only and is never fitted in place, so the
            caller's estimator keeps whatever state it had.

    Returns:
        Tuple ``(y_true, y_pred, y_prob)`` of 1-D arrays with one entry per
        sample. ``y_prob`` is column 1 of ``predict_proba``, i.e. the
        probability of the second entry of the fitted model's ``classes_``.
    """
    # Local import keeps this function self-contained.
    from sklearn.base import clone

    y_true, y_pred, y_prob = [], [], []

    for train_idx, test_idx in LeaveOneOut().split(X):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]

        # Under-sample, then SMOTE.
        # NOTE(review): with default settings the under-sampler already
        # equalises the class counts, so SMOTE probably has nothing left to
        # synthesise here - confirm this is the intended combination.
        rus = RandomUnderSampler(random_state=RAND_STATE)
        X_res, y_res = rus.fit_resample(X_train, y_train)

        smote = SMOTE(random_state=RAND_STATE)
        X_res, y_res = smote.fit_resample(X_res, y_res)

        # Fit a copy: fitting `model` itself would overwrite the caller's
        # estimator with one trained on the last fold. safe=False falls back
        # to a deep copy for objects that are not scikit-learn estimators.
        fold_model = clone(model, safe=False)
        fold_model.fit(X_res, y_res)

        # Each test fold holds exactly one sample, hence the [0] indexing.
        y_true.append(y_test[0])
        y_pred.append(fold_model.predict(X_test)[0])
        y_prob.append(fold_model.predict_proba(X_test)[0, 1])

    return np.array(y_true), np.array(y_pred), np.array(y_prob)


def tune_subject_models(subject_list):
    """Grid-search a RandomForest for every subject and score it with LOOCV.

    For each subject the features are loaded through ``SubjectData``, extended
    with session/trial columns, and stripped of the hard-coded correlated
    columns. The data are rebalanced (under-sampling, then SMOTE) and a
    5-fold stratified grid search picks the hyper-parameters by ROC AUC. The
    winning configuration is then evaluated with leave-one-out CV on the
    original, un-resampled data.

    Args:
        subject_list: Iterable of subject identifiers accepted by
            ``SubjectData``. Labels are taken from ``global_y`` in consecutive
            blocks, so the order must match the order of the label file.

    Returns:
        Tuple ``(best_models, best_scores, loocv_auc_scores)`` of dicts keyed
        by subject: the estimator refit by GridSearchCV, its best mean CV AUC,
        and the LOOCV AUC.

    Raises:
        ValueError: If ``global_y`` does not contain enough labels for a
            subject's feature rows.
    """
    # Local import keeps this function self-contained.
    from sklearn.base import clone

    param_grid = {
        'n_estimators': [50, 100, 150, 200],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 7],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None]
    }

    best_models = {}
    best_scores = {}
    loocv_auc_scores = {}

    # Running position of the current subject's first label in global_y.
    offset = 0
    for subject in subject_list:
        print(f"\n🎯 Tuning model for subject {subject}")

        # ---- Load & Preprocess ----
        sd = SubjectData(subject, train=True)
        X = np.hstack((
            sd.is_short.reshape(-1, 1),
            sd.get_green_similarity(),
            sd.get_feedback_similarity()
        ))
        X = transform_data(X)
        X = remove_high_correlation_columns(X)

        # Take as many labels as this subject has feature rows instead of
        # relying on a hard-coded per-subject trial count.
        n_trials = X.shape[0]
        y = global_y[offset : offset + n_trials]
        if y.shape[0] != n_trials:
            raise ValueError(
                f'subject {subject}: {n_trials} feature rows but only '
                f'{y.shape[0]} labels left in global_y at offset {offset}'
            )
        offset += n_trials

        # ---- Under + SMOTE on whole set before GridSearch ----
        # NOTE(review): the grid-search AUC below is measured on this
        # rebalanced subset only, so it is not directly comparable with the
        # LOOCV AUC, which scores every original sample.
        rus = RandomUnderSampler(random_state=RAND_STATE)
        X_bal, y_bal = rus.fit_resample(X, y)

        smote = SMOTE(random_state=RAND_STATE)
        X_bal, y_bal = smote.fit_resample(X_bal, y_bal)

        print(f"🔁 Resampled shape: {X_bal.shape}")

        # ---- Grid Search ----
        search = GridSearchCV(
            estimator=RandomForestClassifier(random_state=RAND_STATE),
            param_grid=param_grid,
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=RAND_STATE),
            scoring='roc_auc',
            n_jobs=-1,
            verbose=1
        )
        search.fit(X_bal, y_bal)

        best_model = search.best_estimator_
        best_models[subject] = best_model
        best_scores[subject] = search.best_score_

        print(f"✅ Best AUC (GridSearch): {search.best_score_:.4f}")
        print(f"   Best Params: {search.best_params_}")

        # ---- Final LOOCV Evaluation ----
        # Evaluate a clone so the estimator stored in best_models stays
        # exactly the one GridSearchCV refit.
        y_true, y_pred, y_proba = loocv_with_resampling(X, y, clone(best_model))
        loocv_auc = roc_auc_score(y_true, y_proba)
        loocv_auc_scores[subject] = loocv_auc

        print(f"📊 LOOCV AUC for subject {subject}: {loocv_auc:.4f}")
        print("-" * 50)

    return best_models, best_scores, loocv_auc_scores

In [54]:
# Bind the results to names: the search is expensive, and a bare call would
# leave the tuned models reachable only through the cell's Out[...] entry.
best_models, best_scores, loocv_auc_scores = tune_subject_models(train_subjects)


🎯 Tuning model for subject 02
🔁 Resampled shape: (240, 22)
Fitting 5 folds for each of 324 candidates, totalling 1620 fits


KeyboardInterrupt: 