 Step 1: Install and Import Libraries

In [None]:
!pip install optuna -q
!pip install imbalanced-learn -q
!pip install lightgbm -q

In [None]:
from sklearn.experimental import enable_iterative_imputer  # Add this line to enable IterativeImputer
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.impute import IterativeImputer, SimpleImputer # Now you can import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from sklearn.multioutput import MultiOutputClassifier

from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTENC
from sklearn.ensemble import GradientBoostingClassifier

import optuna

Step 2: Load & Merge Data

In [None]:
# Load Data
# Every input file is read from one directory; change DATA_DIR to run the
# notebook somewhere other than the default '/content' location.
DATA_DIR = '/content'

# Training tables
train_cat = pd.read_excel(f'{DATA_DIR}/TRAIN_CATEGORICAL_METADATA.xlsx')
train_func = pd.read_csv(f'{DATA_DIR}/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv')
train_quant = pd.read_excel(f'{DATA_DIR}/TRAIN_QUANTITATIVE_METADATA.xlsx')
train_target = pd.read_excel(f'{DATA_DIR}/TRAINING_SOLUTIONS.xlsx')

# Test tables
test_cat = pd.read_excel(f'{DATA_DIR}/TEST_CATEGORICAL.xlsx')
test_func = pd.read_csv(f'{DATA_DIR}/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv')
test_quant = pd.read_excel(f'{DATA_DIR}/TEST_QUANTITATIVE_METADATA.xlsx')

# Variable list: the 'var' and 'label' columns are used below to group features.
var_list = pd.read_csv(f'{DATA_DIR}/VAR_LIST (1) (1).csv')

In [None]:
# Drop site feature from both categorical tables.
# errors='ignore' makes this a no-op when the column is already gone,
# so the cell can be re-run safely.
train_cat, test_cat = (
    frame.drop(columns=['Basic_Demos_Study_Site'], errors='ignore')
    for frame in (train_cat, test_cat)
)

In [None]:
# Variable grouping: split the variable list by its 'label' column.
var_labels = var_list['label']
is_site_var = var_list['var'] == 'Basic_Demos_Study_Site'

# Categorical variables, excluding the study-site column.
categ_vars = var_list.loc[(var_labels == 'categ') & ~is_site_var, 'var'].tolist()
quant_vars = var_list.loc[var_labels == 'quant', 'var'].tolist()
mri_vars = var_list.loc[var_labels == 'connectome', 'var'].tolist()

In [None]:
# Convert every categorical variable to float in both tables.
categ_float_dtypes = {col: float for col in categ_vars}
train_cat = train_cat.astype(categ_float_dtypes)
test_cat = test_cat.astype(categ_float_dtypes)

In [None]:
# Training features: quantitative, connectome and categorical tables,
# inner-joined on 'participant_id'.
df_train_merged = (
    train_quant
    .merge(train_func, how='inner', on='participant_id')
    .merge(train_cat, how='inner', on='participant_id')
)

# Test features: the same three-way inner join on 'participant_id'.
df_test_merged = (
    test_quant
    .merge(test_func, how='inner', on='participant_id')
    .merge(test_cat, how='inner', on='participant_id')
)

In [None]:
# Sort instances in both training and test datasets using participant_id
df_train_merged = df_train_merged.sort_values(by=['participant_id']).reset_index(drop=True)
df_test_merged = df_test_merged.sort_values(by=['participant_id']).reset_index(drop=True)

train_target = train_target.sort_values(by=['participant_id']).reset_index(drop=True)

# Features and labels are matched purely by row position from here on.
# The features come from inner merges (which can drop participants) while the
# labels are only sorted, so fail loudly if the two id columns ever diverge
# instead of silently training on misaligned labels.
assert df_train_merged['participant_id'].tolist() == train_target['participant_id'].tolist(), (
    "participant_id mismatch between merged training features and training targets"
)

In [None]:
# Stratified k-fold
# The stratification label is the two targets concatenated as strings,
# giving one class per (ADHD_Outcome, Sex_F) combination.
adhd_as_str = train_target['ADHD_Outcome'].astype('str')
sex_as_str = train_target['Sex_F'].astype('str')
train_stratify = adhd_as_str + sex_as_str

n_splits = 5
skf = StratifiedKFold(shuffle=True, random_state=40, n_splits=n_splits)

In [None]:
# Prepare the training and testing data by separating features (X) and labels (y).
# The identifier column is dropped by name rather than by position, so the split
# stays correct even if 'participant_id' is not the first column of the merged frame.
# reset_index(drop=True) keeps a clean 0..n-1 index on both feature frames.
X_train = df_train_merged.drop(columns=['participant_id']).reset_index(drop=True)
X_test = df_test_merged.drop(columns=['participant_id']).reset_index(drop=True)

# y_train contains only the 'ADHD_Outcome' and 'Sex_F' columns from the training labels DataFrame.
y_train = train_target[['ADHD_Outcome', 'Sex_F']]

In [None]:
# Zero-filled buffers for out-of-fold (one slot per training row) and
# test (one slot per test row) predictions, for each of the two targets.
oof_preds_adhd, oof_preds_sex = np.zeros(len(X_train)), np.zeros(len(X_train))
test_preds_adhd, test_preds_sex = np.zeros(len(X_test)), np.zeros(len(X_test))

In [None]:
# Positional column indices (within X_train) of the categorical variables and
# of all numerical variables (quantitative first, then connectome).
locate_column = X_train.columns.get_loc
categ_vars_inds = list(map(locate_column, categ_vars))
all_num_vars_inds = list(map(locate_column, quant_vars + mri_vars))

Step 3: Define Preprocessing Pipelines

In [None]:
# Constant fill value for each of the first seven categorical variables,
# in the order they appear in `categ_vars_inds`.
# NOTE(review): these presumably encode a dedicated "missing" level per variable —
# confirm against the data dictionary.
categ_fill_values = [3, 10, 99, 99, 99, 99, 99]
assert len(categ_vars_inds) >= len(categ_fill_values), (
    "expected at least one categorical column per configured fill value"
)

# One constant-value imputer per categorical column, named categ_imputer_01 .. _07.
categ_imputers = [
    (
        f'categ_imputer_{position:02d}',
        SimpleImputer(strategy='constant', fill_value=fill_value),
        [categ_vars_inds[position - 1]],
    )
    for position, fill_value in enumerate(categ_fill_values, start=1)
]

# Stage 1: impute missing values. Numerical columns are imputed iteratively with a
# linear model; categorical columns get their constant fill value; every other
# column is passed through untouched.
preprocessor_imputer = ColumnTransformer(transformers=[
    ('num_imputer', IterativeImputer(
        estimator=LinearRegression(),
        max_iter=20,
        n_nearest_features=500,
        initial_strategy='mean',
        random_state=123,
        skip_complete=True,
        tol=1e-2), all_num_vars_inds),
    *categ_imputers,
], remainder='passthrough')

# Stage 2: standardise the numerical block and keep the principal components that
# explain 95% of its variance, and one-hot encode the categorical columns.
# handle_unknown='ignore' encodes a category that was not present in the fitting
# fold as all zeros instead of raising at transform time.
# NOTE(review): ColumnTransformer emits its outputs in transformer order, so the
# positional indices reused here only match the imputed array if the input columns
# were already laid out as numerical-then-categorical — TODO confirm.
preprocessor_pca_encoding = ColumnTransformer(transformers=[
    ('pca', Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=0.95))]), all_num_vars_inds),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore'), categ_vars_inds)
], remainder='passthrough')

In [None]:
# Single-step pipelines wrapping each preprocessing stage.
preprocessor_pipeline_imputer = Pipeline(steps=[('preprocessor_imputer', preprocessor_imputer)])
preprocessor_pipeline_encoding = Pipeline(steps=[('preprocessor_pca_encoding', preprocessor_pca_encoding)])

Step 4: Define Optuna Objective Function

In [None]:
def objective(trial):
    """Optuna objective: mean cross-validated F1 of a two-target LightGBM model.

    Samples LightGBM hyper-parameters from ``trial``, then fits a
    ``MultiOutputClassifier`` on each of five stratified folds of the
    notebook-level ``X_train`` / ``y_train`` (stratified on ``train_stratify``)
    and scores the held-out part of every fold.

    Args:
        trial: The Optuna trial used to sample the hyper-parameters.

    Returns:
        The mean over folds of the average of the 'ADHD_Outcome' and 'Sex_F'
        F1 scores.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 15, 60),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 2.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 2.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        'random_state': 42,
        'n_jobs': -1
    }

    # LightGBM already uses every core (n_jobs=-1 above). Parallelising the
    # two-target wrapper as well would oversubscribe the CPU, so the wrapper
    # fits its targets sequentially.
    model = MultiOutputClassifier(LGBMClassifier(**params))

    # Dedicated splitter for tuning; named so it does not shadow the
    # notebook-level `skf` used for the final training loop.
    tuning_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    f1_scores = []
    for train_idx, val_idx in tuning_cv.split(X_train, train_stratify):
        # Fold indices are positional, hence .iloc.
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model.fit(X_tr, y_tr)
        preds = model.predict(X_val)  # column 0: ADHD_Outcome, column 1: Sex_F

        f1_adhd = f1_score(y_val['ADHD_Outcome'], preds[:, 0])
        f1_sex = f1_score(y_val['Sex_F'], preds[:, 1])
        f1_scores.append((f1_adhd + f1_sex) / 2)

    return np.mean(f1_scores)

Step 5: Run Hyperparameter Search

In [None]:
# Seed the sampler so repeated runs propose the same sequence of hyper-parameters.
# The search stops after 20 trials or 600 seconds, whichever comes first.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20, timeout=600)
print("Best parameters:", study.best_params)

[I 2025-04-30 01:25:30,198] A new study created in memory with name: no-name-491712a6-d25a-4ac8-affd-d0987f469377
[W 2025-04-30 01:25:36,596] Trial 0 failed with parameters: {'n_estimators': 193, 'learning_rate': 0.1814314028526615, 'max_depth': 10, 'num_leaves': 56, 'subsample': 0.6224535123919306, 'colsample_bytree': 0.796585848742142, 'reg_alpha': 1.2003861735038024, 'reg_lambda': 0.9410250655285446, 'min_child_samples': 26} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-41-558756a41382>", line 26, in objective
    model.fit(X_tr, y_tr)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/multioutput.py", line 543, in fit
    super().fit(X, Y, sample_weight=sample_weight, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", l

KeyboardInterrupt: 

In [None]:
# Disabled draft of `objective`: the whole definition is wrapped in a string
# literal, so running this cell defines nothing.
# Unlike the active `objective`, this variant fits the imputation and encoding
# pipelines inside every fold before fitting the model.
"""def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 15, 60),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 2.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 2.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        'random_state': 42,
        'n_jobs': -1
    }

    model = MultiOutputClassifier(LGBMClassifier(**params), n_jobs=-1)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    f1_scores = []

    for train_idx, val_idx in skf.split(X_train, train_stratify): # Changed X_train_final to X_train
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Apply preprocessing steps within the loop
        X_tr_imp = preprocessor_pipeline_imputer.fit_transform(X_tr)
        X_val_imp = preprocessor_pipeline_imputer.transform(X_val)

        X_tr_final = preprocessor_pipeline_encoding.fit_transform(X_tr_imp) # Apply encoding
        X_val_final = preprocessor_pipeline_encoding.transform(X_val_imp)    # Apply encoding


        model.fit(X_tr_final, y_tr) # Changed X_tr to X_tr_final
        preds = model.predict(X_val_final) # Changed X_val to X_val_final
        f1_adhd = f1_score(y_val['ADHD_Outcome'], preds[:, 0])
        f1_sex = f1_score(y_val['Sex_F'], preds[:, 1])
        f1_scores.append((f1_adhd + f1_sex) / 2)

    return np.mean(f1_scores)"""

In [None]:
# Disabled draft of `objective`: the whole definition is wrapped in a string
# literal, so running this cell defines nothing.
# This variant imputes, oversamples with SMOTENC and encodes inside the fold loop.
# Its `return` sits inside the loop, so only the first fold would be scored.
# NOTE(review): it references `preprocessor_encoder`, which no visible cell
# defines — it would raise NameError if re-enabled as is.
"""def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 15, 63),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        'random_state': 42,
        'n_jobs': -1
    }

    for train_idx, val_idx in skf.split(X_train, train_stratify):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        X_tr_imp = preprocessor_imputer.fit_transform(X_tr)
        X_val_imp = preprocessor_imputer.transform(X_val)

        smote_nc = SMOTENC(categorical_features=categ_vars_inds, random_state=0)
        strat = y_tr['ADHD_Outcome'].astype(str) + y_tr['Sex_F'].astype(str)
        X_tr_res, y_tr_str = smote_nc.fit_resample(X_tr_imp, strat)
        y_tr_final = pd.DataFrame({'ADHD_Outcome': y_tr_str.str[0].astype(int), 'Sex_F': y_tr_str.str[1].astype(int)})

        X_tr_enc = preprocessor_encoder.fit_transform(X_tr_res)
        X_val_enc = preprocessor_encoder.transform(X_val_imp)

        model = MultiOutputClassifier(LGBMClassifier(**params), n_jobs=-1)
        model.fit(X_tr_enc, y_tr_final)

        pred = model.predict(X_val_enc)
        f1_adhd = f1_score(y_val['ADHD_Outcome'], pred[:, 0])
        f1_sex = f1_score(y_val['Sex_F'], pred[:, 1])
        return (f1_adhd + f1_sex) / 2  # Only one fold needed for tuning"""

Step 6: Train Final Model with Best Params

In [None]:
# study.best_params holds only the sampled values, so add back the fixed
# settings that were used during tuning (seed and LightGBM thread count).
best_params = study.best_params
final_params = {**best_params, 'random_state': 42, 'n_jobs': -1}

# Column 0 holds P(ADHD_Outcome == 1), column 1 holds P(Sex_F == 1).
oof_preds = np.zeros((X_train.shape[0], 2))
test_preds = np.zeros((X_test.shape[0], 2))

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, train_stratify)):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Fit the imputer on the training part of the fold only, then apply it everywhere.
    X_tr_imp = preprocessor_imputer.fit_transform(X_tr)
    X_val_imp = preprocessor_imputer.transform(X_val)
    X_test_imp = preprocessor_imputer.transform(X_test)

    # Oversample on the combined two-character label so that every
    # (ADHD_Outcome, Sex_F) combination is balanced, then split it back
    # into the two target columns.
    smote_nc = SMOTENC(categorical_features=categ_vars_inds, random_state=0)
    strat = y_tr['ADHD_Outcome'].astype(str) + y_tr['Sex_F'].astype(str)
    X_tr_res, y_tr_str = smote_nc.fit_resample(X_tr_imp, strat)
    y_tr_final = pd.DataFrame({'ADHD_Outcome': y_tr_str.str[0].astype(int), 'Sex_F': y_tr_str.str[1].astype(int)})

    # Scale + PCA + one-hot encode, fitted on the resampled training data.
    X_tr_enc = preprocessor_pca_encoding.fit_transform(X_tr_res)
    X_val_enc = preprocessor_pca_encoding.transform(X_val_imp)
    X_test_enc = preprocessor_pca_encoding.transform(X_test_imp)

    # LightGBM parallelises internally (n_jobs=-1), so the two-target wrapper
    # is left sequential to avoid oversubscribing the CPU.
    model = MultiOutputClassifier(LGBMClassifier(**final_params))
    model.fit(X_tr_enc, y_tr_final)

    # predict_proba returns one (n_samples, n_classes) array per target;
    # keep the positive-class probability of each as one column.
    oof_preds[val_idx, :] = np.column_stack(
        [target_proba[:, 1] for target_proba in model.predict_proba(X_val_enc)]
    )
    # Average the test probabilities over the folds.
    test_preds += np.column_stack(
        [target_proba[:, 1] for target_proba in model.predict_proba(X_test_enc)]
    ) / skf.n_splits


ValueError: No trials are completed yet.

Step 7: Threshold Optimization

In [None]:
def weighted_f1_score(y_true_adhd, y_pred_adhd, y_true_sex, y_pred_sex):
    """Average of the sample-weighted F1 scores of the ADHD and sex predictions.

    Samples whose true ADHD label and true sex label are both 1 get weight 2;
    every other sample gets weight 1. The same weights are used for both targets.

    The computation is vectorised with NumPy and works purely by position, so
    lists, arrays and pandas Series with any index are all accepted.

    Args:
        y_true_adhd: True 0/1 ADHD labels.
        y_pred_adhd: Predicted 0/1 ADHD labels.
        y_true_sex: True 0/1 sex labels.
        y_pred_sex: Predicted 0/1 sex labels.

    Returns:
        The mean of the two weighted F1 scores, as a float.
    """
    true_adhd = np.asarray(y_true_adhd)
    true_sex = np.asarray(y_true_sex)
    weights = np.where((true_adhd == 1) & (true_sex == 1), 2, 1)

    def compute_f1(y_true, y_pred, weights):
        """Weighted F1 of one binary target."""
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        actual_pos = y_true == 1
        predicted_pos = y_pred == 1
        TP = weights[actual_pos & predicted_pos].sum()
        FP = weights[(y_true == 0) & predicted_pos].sum()
        FN = weights[actual_pos & (y_pred == 0)].sum()
        # The 1e-6 terms keep every division defined when a count is zero.
        precision = TP / (TP + FP + 1e-6)
        recall = TP / (TP + FN + 1e-6)
        return 2 * precision * recall / (precision + recall + 1e-6)

    f1_adhd = compute_f1(true_adhd, y_pred_adhd, weights)
    f1_sex = compute_f1(true_sex, y_pred_sex, weights)
    return float((f1_adhd + f1_sex) / 2)

# Exhaustive search over a 101 x 101 grid of decision thresholds, scored on the
# out-of-fold probabilities. The first pair reaching the highest score is kept.
thresholds = np.linspace(0, 1, 101)
true_adhd, true_sex = y_train['ADHD_Outcome'], y_train['Sex_F']

best_f1, best_t1, best_t2 = 0, 0.5, 0.5
for t1 in thresholds:
    # The ADHD decision depends on t1 only, so compute it once per outer step.
    preds_adhd = (oof_preds[:, 0] >= t1).astype(int)
    for t2 in thresholds:
        preds_sex = (oof_preds[:, 1] >= t2).astype(int)
        score = weighted_f1_score(true_adhd, preds_adhd, true_sex, preds_sex)
        if score > best_f1:
            best_f1 = score
            best_t1, best_t2 = t1, t2

print(f'Best thresholds: ADHD = {best_t1}, Sex_F = {best_t2}')


Step 8: Generate Submission

In [None]:
# Build the submission from df_test_merged: X_test was derived from this sorted
# frame, so its ids line up row-for-row with test_preds.
# Probabilities are turned into 0/1 labels with the tuned thresholds.
final = pd.DataFrame({
    'participant_id': df_test_merged['participant_id'],
    'ADHD_Outcome': (test_preds[:, 0] >= best_t1).astype(int),
    'Sex_F': (test_preds[:, 1] >= best_t2).astype(int)
})
final.to_csv('final_submission.csv', index=False)
