In [10]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, LeaveOneOut
from sklearn.metrics import roc_auc_score
import numpy as np
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_curve, auc, make_scorer
from imblearn.over_sampling import SMOTE, ADASYN
import matplotlib.pyplot as plt
from detection import SubjectData
from utils import train_subjects, test_subjects
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Seed passed as random_state to the resamplers, the CV splitter and the
# random forests below.
RAND_STATE = 42

In [3]:
# Load the training labels. The 'Prediction' column is kept as one flat array
# (global_y) that tune_subject_models slices per subject by position.
labels = pd.read_csv('./data/TrainLabels.csv')
global_y = labels['Prediction'].to_numpy()
print(global_y.shape)

(5440,)


In [4]:
def get_session_and_trial(verbose=False):
    """Build the (session, trial) index pairs for one subject.

    Sessions 1-4 contribute trials 1..60 each and session 5 contributes
    trials 1..100, giving 340 rows in session-major order.

    Args:
        verbose: When true, print the shape of the generated array.

    Returns:
        Integer ndarray of shape (340, 2) with columns [session, trial].
    """
    # (session number, number of trials in that session)
    session_sizes = [(session, 60) for session in range(1, 5)] + [(5, 100)]
    pairs = [
        [session, trial]
        for session, n_trials in session_sizes
        for trial in range(1, n_trials + 1)
    ]
    features = np.array(pairs)
    if verbose:
        print(f'session_and_trial: {features.shape}')
    return features

def transform_data(X, verbose=False):
    """Prepend session and trial columns to a stacked feature matrix.

    The (session, trial) table returned by ``get_session_and_trial`` is
    repeated once per block of rows in ``X`` and placed in front of the
    existing columns.

    Args:
        X: 2-D array whose row count is a whole number of
            ``get_session_and_trial()`` blocks (340 rows each). An array with
            zero rows is accepted and yields zero rows.
        verbose: When true, print the shapes of the generated helper arrays.

    Returns:
        Array of shape ``(X.shape[0], X.shape[1] + 2)`` with the session and
        trial numbers in columns 0 and 1.

    Raises:
        ValueError: If the number of rows in ``X`` is not a multiple of the
            block size, since the index columns could not be aligned.
    """
    extra_features = get_session_and_trial(verbose=verbose)
    block_rows = extra_features.shape[0]
    n_rows = X.shape[0]

    # Fail early with a readable message instead of an IndexError from the
    # reshape or a shape mismatch inside hstack.
    if n_rows % block_rows != 0:
        raise ValueError(
            f'X has {n_rows} rows, which is not a multiple of the '
            f'{block_rows} (session, trial) rows per subject'
        )

    # Repeat the whole table vertically, once per block of rows in X.
    extra_cols = np.tile(extra_features, (n_rows // block_rows, 1))
    if verbose:
        print(f'extra_cols: {extra_cols.shape}')
    return np.hstack((extra_cols, X))

In [5]:
# Drops feature columns 8, 10, 16, 18, 24 and 26.
def remove_high_correlation_columns(X, verbose=False):
    """Return a copy of ``X`` without a fixed set of columns.

    The dropped column positions are 8, 10, 16, 18, 24 and 26. The function
    name indicates they were judged highly correlated with other features;
    that analysis is not part of this notebook section.

    Args:
        X: 2-D array with at least 27 columns.
        verbose: When true, print the shape after the columns are removed.

    Returns:
        New array with six fewer columns than ``X``.
    """
    dropped_columns = [8, 10, 16, 18, 24, 26]
    reduced = np.delete(X, dropped_columns, axis=1)
    if verbose:
        print(f'X after removing columns: {reduced.shape}')
    return reduced

In [7]:
def loocv_with_resampling(X, y, model):
    """Leave-one-out evaluation with per-fold class rebalancing.

    For every sample, the remaining samples are rebalanced
    (RandomUnderSampler followed by SMOTE), a fresh clone of ``model`` is
    fitted on them, and the held-out sample is scored.

    Args:
        X: Feature matrix, indexable by integer arrays along axis 0.
        y: Label array aligned with ``X``.
        model: Scikit-learn classifier exposing ``fit``, ``predict`` and
            ``predict_proba``. It is used as a template only: a clone is
            fitted in each fold, so a fitted estimator passed in keeps its
            fitted state.

    Returns:
        Tuple ``(y_true, y_pred, y_prob)`` of 1-D arrays with one entry per
        sample; ``y_prob`` is column 1 of ``predict_proba``.
    """
    # Imported here because the notebook's import cell does not provide it.
    from sklearn.base import clone

    loo = LeaveOneOut()
    y_true, y_pred, y_prob = [], [], []

    for train_idx, test_idx in loo.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Undersampling, then SMOTE, on the training fold only.
        # NOTE(review): with default settings the undersampler appears to
        # leave the classes balanced already, so SMOTE likely adds no
        # samples here -- confirm before relying on it.
        rus = RandomUnderSampler(random_state=RAND_STATE)
        X_res, y_res = rus.fit_resample(X_train, y_train)

        smote = SMOTE(random_state=RAND_STATE)
        X_res, y_res = smote.fit_resample(X_res, y_res)

        # Fit a clone so the caller's estimator is never refitted; otherwise
        # it would end up trained on the last fold's data.
        fold_model = clone(model)
        fold_model.fit(X_res, y_res)

        prob = fold_model.predict_proba(X_test)[:, 1]
        pred = fold_model.predict(X_test)

        y_true.append(y_test[0])
        y_pred.append(pred[0])
        y_prob.append(prob[0])

    return np.array(y_true), np.array(y_pred), np.array(y_prob)


def tune_subject_models(subject_list):
    """Grid-search a random forest per subject and score it with LOOCV.

    For each subject the features are built from ``SubjectData``, the labels
    are the next 340-entry slice of the module-level ``global_y``, the data
    is rebalanced (undersampling, then SMOTE) and a 5-fold stratified grid
    search scored by ROC AUC is run on the rebalanced set. The winning
    estimator is then passed to ``loocv_with_resampling`` together with the
    un-resampled data.

    Args:
        subject_list: Subject identifiers. Labels are sliced by position, so
            the order must match the order of the label blocks in
            ``global_y``.

    Returns:
        Tuple ``(best_models, best_scores, loocv_auc_scores)`` of dicts keyed
        by subject: best estimator, best grid-search AUC and LOOCV AUC.
    """
    rows_per_subject = 340
    search_space = {
        'n_estimators': [40, 50, 100, 150, 200],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 7],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2', None]
    }

    best_models, best_scores, loocv_auc_scores = {}, {}, {}

    for position, subject in enumerate(subject_list):
        print(f"\n🎯 Tuning model for subject {subject}")

        # ---- Load & preprocess ----
        sd = SubjectData(subject, train=True)
        raw_features = np.hstack((
            sd.is_short.reshape(-1, 1),
            sd.get_green_similarity(),
            sd.get_feedback_similarity()
        ))
        # The subject's labels are the position-th block of global_y.
        start = position * rows_per_subject
        y = global_y[start:start + rows_per_subject]

        X = remove_high_correlation_columns(transform_data(raw_features))

        # ---- Undersample, then SMOTE, on the whole set before the search ----
        X_bal, y_bal = RandomUnderSampler(random_state=RAND_STATE).fit_resample(X, y)
        X_bal, y_bal = SMOTE(random_state=RAND_STATE).fit_resample(X_bal, y_bal)

        print(f"🔁 Resampled shape: {X_bal.shape}")

        # ---- Grid search ----
        search = GridSearchCV(
            estimator=RandomForestClassifier(random_state=RAND_STATE),
            param_grid=search_space,
            cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=RAND_STATE),
            scoring='roc_auc',
            n_jobs=-1,
            verbose=1
        )
        search.fit(X_bal, y_bal)

        winner = search.best_estimator_
        best_models[subject], best_scores[subject] = winner, search.best_score_

        print(f"✅ Best AUC (GridSearch): {search.best_score_:.4f}")
        print(f"   Best Params: {search.best_params_}")

        # ---- Final LOOCV evaluation on the un-resampled data ----
        y_true, _, y_proba = loocv_with_resampling(X, y, winner)
        loocv_auc = roc_auc_score(y_true, y_proba)
        loocv_auc_scores[subject] = loocv_auc

        print(f"📊 LOOCV AUC for subject {subject}: {loocv_auc:.4f}")
        print("-" * 50)

    return best_models, best_scores, loocv_auc_scores

In [9]:
# Tune one random forest per training subject. This is the expensive cell:
# the captured output shows 405 candidates x 5 folds per subject, followed by LOOCV.
best_models, best_scores, loocv_auc_score = tune_subject_models(train_subjects)


🎯 Tuning model for subject 02
🔁 Resampled shape: (240, 22)
Fitting 5 folds for each of 405 candidates, totalling 2025 fits
✅ Best AUC (GridSearch): 0.7264
   Best Params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 7, 'n_estimators': 150}
📊 LOOCV AUC for subject 02: 0.6961
--------------------------------------------------

🎯 Tuning model for subject 06
🔁 Resampled shape: (48, 22)
Fitting 5 folds for each of 405 candidates, totalling 2025 fits
✅ Best AUC (GridSearch): 0.6760
   Best Params: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 7, 'n_estimators': 50}
📊 LOOCV AUC for subject 06: 0.5359
--------------------------------------------------

🎯 Tuning model for subject 07
🔁 Resampled shape: (66, 22)
Fitting 5 folds for each of 405 candidates, totalling 2025 fits
✅ Best AUC (GridSearch): 0.5918
   Best Params: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_es

  _data = np.array(data, dtype=dtype, copy=copy,


✅ Best AUC (GridSearch): 0.7901
   Best Params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 40}
📊 LOOCV AUC for subject 17: 0.8126
--------------------------------------------------

🎯 Tuning model for subject 18
🔁 Resampled shape: (158, 22)
Fitting 5 folds for each of 405 candidates, totalling 2025 fits
✅ Best AUC (GridSearch): 0.5547
   Best Params: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
📊 LOOCV AUC for subject 18: 0.5116
--------------------------------------------------

🎯 Tuning model for subject 20
🔁 Resampled shape: (208, 22)
Fitting 5 folds for each of 405 candidates, totalling 2025 fits


  _data = np.array(data, dtype=dtype, copy=copy,


✅ Best AUC (GridSearch): 0.6825
   Best Params: {'max_depth': 5, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
📊 LOOCV AUC for subject 20: 0.6540
--------------------------------------------------

🎯 Tuning model for subject 21
🔁 Resampled shape: (56, 22)
Fitting 5 folds for each of 405 candidates, totalling 2025 fits
✅ Best AUC (GridSearch): 0.7978
   Best Params: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
📊 LOOCV AUC for subject 21: 0.7382
--------------------------------------------------

🎯 Tuning model for subject 22
🔁 Resampled shape: (54, 22)
Fitting 5 folds for each of 405 candidates, totalling 2025 fits
✅ Best AUC (GridSearch): 0.8400
   Best Params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
📊 LOOCV AUC for subject 22: 0.7306
--------------------------------------------------

🎯 Tuning model for subj

In [18]:
def get_subject_mean_vector(sd):
    """Compute the mean feature vector for one subject.

    Args:
        sd: Subject data object exposing ``is_short``,
            ``get_green_similarity()`` and ``get_feedback_similarity()``.

    Returns:
        Array of shape ``(1, n_features)`` holding the column-wise mean of
        the subject's preprocessed feature matrix.
    """
    raw_parts = (
        sd.is_short.reshape(-1, 1),
        sd.get_green_similarity(),
        sd.get_feedback_similarity()
    )
    # Same preprocessing as used for model training: add the session/trial
    # columns, then drop the fixed set of columns.
    prepared = remove_high_correlation_columns(transform_data(np.hstack(raw_parts)))
    return prepared.mean(axis=0).reshape(1, -1)


def compute_similarity_matrix(train_subjects, test_subjects):
    """Compute cosine similarities between test and train subjects.

    Each subject is summarised by ``get_subject_mean_vector``; every test
    subject's vector is compared with every train subject's vector.

    Args:
        train_subjects: Identifiers loaded with ``SubjectData(..., train=True)``.
        test_subjects: Identifiers loaded with ``SubjectData(..., train=False)``.

    Returns:
        Nested dict ``{test_subject: {train_subject: similarity}}``.
    """
    # Train vectors are computed once and reused for every test subject.
    train_vectors = {}
    for subj in train_subjects:
        train_vectors[subj] = get_subject_mean_vector(SubjectData(subj, train=True))

    similarity_matrix = {}
    for test_subj in test_subjects:
        test_vec = get_subject_mean_vector(SubjectData(test_subj, train=False))
        similarity_matrix[test_subj] = {
            train_subj: cosine_similarity(test_vec, train_vec)[0, 0]
            for train_subj, train_vec in train_vectors.items()
        }

    return similarity_matrix


def predict_for_test_subjects(test_subjects, train_subjects, best_models, loocv_auc_scores, similarity_matrix):
    """Blend the per-train-subject models into predictions for each test subject.

    Every train subject's model scores the test subject's trials. The scores
    are averaged with weights proportional to ``similarity * LOOCV AUC``, so
    models from similar subjects and with better LOOCV results count more.

    Args:
        test_subjects: Identifiers loaded with ``SubjectData(..., train=False)``.
        train_subjects: Identifiers whose models take part in the blend.
        best_models: Dict ``train_subject -> fitted classifier`` exposing
            ``predict_proba``.
        loocv_auc_scores: Dict ``train_subject -> LOOCV AUC``.
        similarity_matrix: Nested dict ``{test_subject: {train_subject: similarity}}``.

    Returns:
        Tuple ``(final_predictions, final_probabilities)`` of dicts keyed by
        test subject, in ``test_subjects`` order. Probabilities are the
        weighted mean of column 1 of ``predict_proba``; predictions are those
        probabilities thresholded at 0.5.

    Raises:
        ValueError: If the weights for a test subject sum to zero or are not
            finite, which would otherwise yield NaN probabilities silently.
    """
    final_predictions = {}
    final_probabilities = {}

    for test_subj in test_subjects:
        print(f"\n🔎 Predicting for test subject: {test_subj}")
        sd_test = SubjectData(test_subj, train=False)
        X_test = np.hstack((
            sd_test.is_short.reshape(-1, 1),
            sd_test.get_green_similarity(),
            sd_test.get_feedback_similarity()
        ))
        X_test = transform_data(X_test)
        X_test = remove_high_correlation_columns(X_test)

        probs_per_model = []
        weights = []

        for train_subj in train_subjects:
            model = best_models[train_subj]
            # Named model_auc so the imported sklearn.metrics.auc function is
            # not shadowed inside this function.
            model_auc = loocv_auc_scores[train_subj]
            similarity = similarity_matrix[test_subj][train_subj]

            # weight = similarity * AUC
            weights.append(similarity * model_auc)
            probs_per_model.append(model.predict_proba(X_test)[:, 1])

        # Normalise the weights, refusing degenerate totals that would turn
        # every probability into NaN.
        weights = np.asarray(weights, dtype=float)
        total_weight = weights.sum()
        if not np.isfinite(total_weight) or total_weight == 0:
            raise ValueError(
                f'Cannot normalise ensemble weights for test subject '
                f'{test_subj}: weight sum is {total_weight}'
            )
        weights = weights / total_weight

        # Weighted-average prediction across the train-subject models.
        probs_per_model = np.array(probs_per_model)
        final_proba = np.average(probs_per_model, axis=0, weights=weights)
        final_pred = (final_proba >= 0.5).astype(int)

        final_predictions[test_subj] = final_pred
        final_probabilities[test_subj] = final_proba

        print(f"✅ Completed prediction for {test_subj}")

    return final_predictions, final_probabilities


In [13]:
# Cosine similarity between every test subject and every train subject,
# later used to weight the per-subject models.
sim_matrix = compute_similarity_matrix(train_subjects, test_subjects)

In [19]:
# Blend all per-subject models into predictions for each test subject.
# loocv_auc_score is the third value returned by tune_subject_models.
final_predictions, final_probabilities = predict_for_test_subjects(test_subjects, train_subjects, best_models, loocv_auc_score, sim_matrix)


🔎 Predicting for test subject: 01
✅ Completed prediction for 01

🔎 Predicting for test subject: 03
✅ Completed prediction for 03

🔎 Predicting for test subject: 04
✅ Completed prediction for 04

🔎 Predicting for test subject: 05
✅ Completed prediction for 05

🔎 Predicting for test subject: 08
✅ Completed prediction for 08

🔎 Predicting for test subject: 09
✅ Completed prediction for 09

🔎 Predicting for test subject: 10
✅ Completed prediction for 10

🔎 Predicting for test subject: 15
✅ Completed prediction for 15

🔎 Predicting for test subject: 19
✅ Completed prediction for 19

🔎 Predicting for test subject: 25
✅ Completed prediction for 25


In [20]:
# Build the IdFeedback column: one id per test subject, session and feedback
# trial, in test_subjects order.
ids = []
for subject in test_subjects:
    name = f'S{subject}'
    # Sessions 1-4 hold 60 feedback trials each ...
    ids.extend(
        f'{name}_Sess{session:02}_FB{fid:03}'
        for session in range(1, 5)
        for fid in range(1, 61)
    )
    # ... and session 5 holds 100.
    ids.extend(f'{name}_Sess05_FB{fid:03}' for fid in range(1, 101))
ids = np.array(ids)
print(f'ids: {ids.shape}')
print(ids)

ids: (3400,)
['S01_Sess01_FB001' 'S01_Sess01_FB002' 'S01_Sess01_FB003' ...
 'S25_Sess05_FB098' 'S25_Sess05_FB099' 'S25_Sess05_FB100']


In [22]:
# Save the results.
# NOTE(review): the frame is called xgb_result although the models tuned in
# this notebook are RandomForestClassifier instances -- consider renaming
# once no other cell depends on the name.
xgb_result = pd.DataFrame({
    'IdFeedback': ids,
    # The concatenation order follows the insertion order of
    # final_probabilities (test_subjects order), which is also the order
    # used to build ids.
    'Prediction': np.concatenate(list(final_probabilities.values())),
})
xgb_result.to_csv('submissions/rf_ensemble_loocv.csv', sep=',', index=False)