1. Imports & Setup

In [None]:
!pip install optuna -U -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Basic Imports
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.experimental import enable_iterative_imputer  # Enable IterativeImputer
from sklearn.impute import IterativeImputer,SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Modeling
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.multioutput import MultiOutputClassifier

# Evaluation and Splitting
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score, make_scorer

# Hyperparameter Optimization
import optuna

# Utility
from sklearn.decomposition import PCA
from sklearn.utils import resample
import joblib

2. Load & Merge Data

In [None]:
# Load Data
# Every input file is read from one directory; edit DATA_DIR to relocate them.
DATA_DIR = '/content'

# Training tables: categorical metadata, functional connectome matrices,
# quantitative metadata and the target labels.
train_cat = pd.read_excel(f'{DATA_DIR}/TRAIN_CATEGORICAL_METADATA.xlsx')
train_func = pd.read_csv(f'{DATA_DIR}/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv')
train_quant = pd.read_excel(f'{DATA_DIR}/TRAIN_QUANTITATIVE_METADATA.xlsx')
train_target = pd.read_excel(f'{DATA_DIR}/TRAINING_SOLUTIONS.xlsx')

# Test tables: the same three feature sources (no label file is loaded for test).
test_cat = pd.read_excel(f'{DATA_DIR}/TEST_CATEGORICAL.xlsx')
test_func = pd.read_csv(f'{DATA_DIR}/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv')
test_quant = pd.read_excel(f'{DATA_DIR}/TEST_QUANTITATIVE_METADATA.xlsx')

In [None]:
# Participant identifiers of each split as plain Python lists.
# Both categorical tables are expected to carry a 'participant_id' column.
train_participants = train_cat.loc[:, 'participant_id'].to_list()
test_participants = test_cat.loc[:, 'participant_id'].to_list()

In [None]:
# Remove the 'Basic_Demos_Study_Site' feature from both categorical tables.
# errors='ignore' makes this a no-op when the column is already absent, so the
# cell can be re-run safely.
train_cat = train_cat.drop(columns='Basic_Demos_Study_Site', errors='ignore')
test_cat = test_cat.drop(columns='Basic_Demos_Study_Site', errors='ignore')

In [None]:
# Variable dictionary; later cells read its 'var' and 'label' columns to group
# feature names by kind.
var_list = pd.read_csv('/content/VAR_LIST (1) (1).csv')
var_list.shape

(19928, 3)

In [None]:
# Group feature names by the kind recorded in the variable dictionary.
label_col = var_list['label']
is_study_site = var_list['var'] == 'Basic_Demos_Study_Site'

# Categorical features, excluding the study-site column.
categ_vars = var_list.loc[(label_col == 'categ') & ~is_study_site, 'var'].tolist()
# Quantitative metadata features.
quant_vars = var_list.loc[label_col == 'quant', 'var'].tolist()
# Functional-connectome features.
mri_vars = var_list.loc[label_col == 'connectome', 'var'].tolist()

# Identifier and label column names.
id_vars = ['participant_id']
label_vars = ['ADHD_Outcome', 'Sex_F', 'Combined_Outcome']

In [None]:
# Cast every categorical feature to float in both the training and the test
# categorical tables (one dtype mapping applied to each frame).
categ_float_dtypes = {c: 'float' for c in categ_vars}
train_cat = train_cat.astype(categ_float_dtypes)
test_cat = test_cat.astype(categ_float_dtypes)

In [None]:
# Inner-join the quantitative, connectome and categorical training tables on
# 'participant_id' (in that order, which fixes the column order of the result).
df_train_merged = (
    train_quant
    .merge(train_func, how='inner', on='participant_id')
    .merge(train_cat, how='inner', on='participant_id')
)

# Same three-way inner join for the test tables.
df_test_merged = (
    test_quant
    .merge(test_func, how='inner', on='participant_id')
    .merge(test_cat, how='inner', on='participant_id')
)

In [None]:
# Order both merged frames by participant_id and renumber the rows from 0.
df_train_merged = df_train_merged.sort_values('participant_id', ignore_index=True)
df_test_merged = df_test_merged.sort_values('participant_id', ignore_index=True)

In [None]:
# Order the labels by participant_id as well and renumber the rows from 0.
train_target = train_target.sort_values('participant_id', ignore_index=True)

In [None]:
# Ensure consistent participants across features and labels
df_train_merged = df_train_merged.sort_values('participant_id')
train_target = train_target.sort_values('participant_id')

# Keep only participants that appear in BOTH the feature table and the label table.
common_ids = set(df_train_merged['participant_id']) & set(train_target['participant_id'])
df_train_merged = df_train_merged[df_train_merged['participant_id'].isin(common_ids)].reset_index(drop=True)
train_target = train_target[train_target['participant_id'].isin(common_ids)].reset_index(drop=True)

# From here on features and labels are paired purely by row position, so verify
# that row i of each table really is the same participant. Duplicate ids in
# either table would otherwise mispair labels without any error.
assert df_train_merged['participant_id'].tolist() == train_target['participant_id'].tolist(), (
    "features and labels are not row-aligned "
    "(duplicate or mismatched participant_id values)"
)

# Recreate X_train and y_train
X_train = df_train_merged.drop(columns=['participant_id'])
y_train = train_target[['ADHD_Outcome', 'Sex_F']]

# Stratification key: the two targets concatenated as strings.
train_stratify = y_train['ADHD_Outcome'].astype(str) + y_train['Sex_F'].astype(str)

# Final check
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("train_stratify length:", len(train_stratify))


X_train shape: (1213, 19926)
y_train shape: (1213, 2)
train_stratify length: 1213


In [None]:
# Utility for Stratified K-Fold
# n_splits = 5
# skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=40)

In [None]:
# Shuffled stratified K-fold splitter with a fixed seed.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, random_state=40, shuffle=True)

# Stratification key: both targets rendered as strings and concatenated.
adhd_as_str = train_target['ADHD_Outcome'].astype('str')
sex_as_str = train_target['Sex_F'].astype('str')
train_stratify = adhd_as_str + sex_as_str

In [None]:
# Prepare the training and testing data by separating features (X) and labels (y).

# Labels: the two target columns, in the row order of train_target.
y_train = train_target[['ADHD_Outcome', 'Sex_F']]

# Stratification key derived from y_train so that it always has the same number
# of rows as the labels: both targets concatenated as strings.
train_stratify = y_train['ADHD_Outcome'].astype(str) + y_train['Sex_F'].astype(str)

# Features: drop the identifier BY NAME rather than by position. Slicing with
# iloc[:, 1:] only works while 'participant_id' happens to be the first column;
# if the merge order ever changes it would keep the id and discard a real feature.
X_train = df_train_merged.drop(columns=['participant_id']).reset_index(drop=True)
X_test = df_test_merged.drop(columns=['participant_id']).reset_index(drop=True)

In [None]:
# Zero-filled buffers for the predicted probabilities of each target:
# out-of-fold (one slot per training row) and test (one slot per test row).
n_train_rows, n_test_rows = X_train.shape[0], X_test.shape[0]

oof_preds_adhd, oof_preds_sex = np.zeros(n_train_rows), np.zeros(n_train_rows)
test_preds_adhd, test_preds_sex = np.zeros(n_test_rows), np.zeros(n_test_rows)

In [None]:
# Integer column positions (within X_train) of the categorical features and of
# all numeric features (quantitative followed by connectome).
column_position = X_train.columns.get_loc
categ_vars_inds = list(map(column_position, categ_vars))
all_num_vars_inds = list(map(column_position, quant_vars + mri_vars))


In [None]:
# --- Stage 1: imputation -----------------------------------------------------
# Constant used to fill missing values in each of the first seven categorical
# columns (position i of this list belongs to categ_vars_inds[i]).
categ_fill_values = [3, 10, 99, 99, 99, 99, 99]

# One constant-fill imputer per categorical column, named categ_imputer_01..07.
categ_imputers = [
    (f'categ_imputer_{pos + 1:02d}',
     SimpleImputer(strategy='constant', fill_value=fill),
     [categ_vars_inds[pos]])
    for pos, fill in enumerate(categ_fill_values)
]

preprocessor_imputer = ColumnTransformer(
    transformers=[
        # Numeric features: iterative (regression-based) imputation; only columns
        # that actually contain missing values are modelled (skip_complete=True).
        ('num_imputer', IterativeImputer(
            estimator=LinearRegression(),
            max_iter=20,
            n_nearest_features=500,
            initial_strategy='mean',
            random_state=123,
            skip_complete=True,
            tol=1e-2), all_num_vars_inds),
    ] + categ_imputers,
    remainder='passthrough')

# --- Stage 2: scaling + PCA for numeric, one-hot for categorical --------------
# NOTE(review): the training loop feeds the OUTPUT of preprocessor_imputer into
# this transformer (and into SMOTENC) while still using column positions computed
# on X_train. ColumnTransformer emits columns in transformer order (numeric block,
# then the categorical imputers, then the remainder), so this is only correct if
# X_train already has that layout — confirm.
preprocessor_pca_encoding = ColumnTransformer(transformers=[
    # Standardise, then keep enough principal components for 95% of the variance.
    ('pca', Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=0.95))]), all_num_vars_inds),
    # handle_unknown='ignore': a category present only in a validation/test row
    # is encoded as all zeros instead of raising during transform().
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore'), categ_vars_inds)
], remainder='passthrough')

In [None]:
# Single-step pipelines wrapping the two preprocessing stages.

# Stage 1: imputation.
preprocessor_pipeline_imputer = Pipeline(
    steps=[('preprocessor_imputer', preprocessor_imputer)]
)

# Stage 2: PCA on numeric features + one-hot encoding of categorical features.
preprocessor_pipeline_encoding = Pipeline(
    steps=[('preprocessor_pca_encoding', preprocessor_pca_encoding)]
)

In [None]:
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier  # only used for importance modeling
from xgboost import XGBClassifier # import XGBClassifier
from imblearn.over_sampling import SMOTENC

In [None]:
# Ensemble members. Each estimator is wrapped in MultiOutputClassifier so that
# one independent copy is fitted per target column.
# Hyper-parameters shared by the three boosting models.
boosting_params = dict(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=123)

models = [
    MultiOutputClassifier(LGBMClassifier(num_leaves=15, n_jobs=-1, **boosting_params)),
    MultiOutputClassifier(XGBClassifier(subsample=0.7, colsample_bytree=0.7, **boosting_params)),
    MultiOutputClassifier(GradientBoostingClassifier(**boosting_params)),
    # L2-penalised logistic regression.
    MultiOutputClassifier(LogisticRegression(penalty='l2', random_state=123)),
]

In [None]:
# Training loop with cross-validation
# The test accumulators are filled with `+=` below, so start them from zero in
# this cell; otherwise re-running the cell would stack a second set of fold
# averages on top of the first and push the summed "probabilities" above 1.
test_preds_adhd = np.zeros(X_test.shape[0])
test_preds_sex = np.zeros(X_test.shape[0])

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, train_stratify)):
    print(f'\nFold {fold + 1}')

    # Split train/val data
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Stage 1 — imputation only (no scaling here). Fitted on the training fold
    # and then applied unchanged to the validation fold and the test set.
    X_tr_processed = preprocessor_pipeline_imputer.fit_transform(X_tr)
    X_val_processed = preprocessor_pipeline_imputer.transform(X_val)
    X_test_processed = preprocessor_pipeline_imputer.transform(X_test)

    # Oversample with SMOTENC on the joint (ADHD, sex) label so that every
    # combination of the two targets ends up equally represented.
    tr_stratify = y_tr['ADHD_Outcome'].astype(str) + y_tr['Sex_F'].astype(str)
    smote_nc = SMOTENC(categorical_features=categ_vars_inds, random_state=123)
    X_tr_resampled, y_strat_combined = smote_nc.fit_resample(X_tr_processed, tr_stratify)

    # Split the joint label back into its two targets: character 0 is the ADHD
    # value and character 1 is the sex value.
    y_tr_resampled = pd.DataFrame({
        'ADHD_Outcome': y_strat_combined.str[0].astype(int),
        'Sex_F': y_strat_combined.str[1].astype(int)
    })

    # Stage 2 — scale + PCA the numeric block and one-hot encode the categorical
    # block; fitted on the resampled training fold only.
    X_tr_resampled = preprocessor_pipeline_encoding.fit_transform(X_tr_resampled)
    X_val_processed = preprocessor_pipeline_encoding.transform(X_val_processed)
    X_test_processed = preprocessor_pipeline_encoding.transform(X_test_processed)

    # Per-model predicted probabilities for this fold
    val_preds_all = []
    test_preds_all = []

    # Fit every ensemble member on the fold and collect its probabilities.
    # predict_proba of a MultiOutputClassifier returns one array per target.
    for model in models:
        model.fit(X_tr_resampled, y_tr_resampled)

        # Out-of-fold predictions
        val_preds_all.append(model.predict_proba(X_val_processed))

        # Test predictions
        test_preds_all.append(model.predict_proba(X_test_processed))

    # For each target i (0 = ADHD, 1 = sex) average column 1 of the probability
    # arrays (the second class) across all models.
    avg_val_preds = [np.mean([p[i][:, 1] for p in val_preds_all], axis=0) for i in range(2)]
    avg_test_preds = [np.mean([p[i][:, 1] for p in test_preds_all], axis=0) for i in range(2)]

    # Store the averaged predictions: OOF rows are written once (each row is in
    # exactly one validation fold); test predictions are averaged over folds.
    oof_preds_adhd[val_idx] = avg_val_preds[0]
    oof_preds_sex[val_idx] = avg_val_preds[1]
    test_preds_adhd += avg_test_preds[0] / skf.n_splits
    test_preds_sex += avg_test_preds[1] / skf.n_splits


Fold 1




[LightGBM] [Info] Number of positive: 928, number of negative: 928
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.023919 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 180124
[LightGBM] [Info] Number of data points in the train set: 1856, number of used features: 753
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 928, number of negative: 928
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034543 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 180124
[LightGBM] [Info] Number of data points in the train set: 1856, number of used features: 753
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000





Fold 2




[LightGBM] [Info] Number of positive: 930, number of negative: 930
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032436 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 179359
[LightGBM] [Info] Number of data points in the train set: 1860, number of used features: 750
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 930, number of negative: 930
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027491 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 179359
[LightGBM] [Info] Number of data points in the train set: 1860, number of used features: 750
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000





Fold 3




[LightGBM] [Info] Number of positive: 930, number of negative: 930
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023789 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 179357
[LightGBM] [Info] Number of data points in the train set: 1860, number of used features: 749
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 930, number of negative: 930
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033350 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 179357
[LightGBM] [Info] Number of data points in the train set: 1860, number of used features: 749
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000





Fold 4




[LightGBM] [Info] Number of positive: 930, number of negative: 930
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036291 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 179104
[LightGBM] [Info] Number of data points in the train set: 1860, number of used features: 749
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 930, number of negative: 930
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033493 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 179104
[LightGBM] [Info] Number of data points in the train set: 1860, number of used features: 749
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000





Fold 5




[LightGBM] [Info] Number of positive: 930, number of negative: 930
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036893 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 179357
[LightGBM] [Info] Number of data points in the train set: 1860, number of used features: 749
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




[LightGBM] [Info] Number of positive: 930, number of negative: 930
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032172 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 179357
[LightGBM] [Info] Number of data points in the train set: 1860, number of used features: 749
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




In [None]:
def weighted_f1_score(y_true_adhd, y_pred_adhd, y_true_sex, y_pred_sex):
    """Mean of the sample-weighted F1 scores of the ADHD and sex predictions.

    Samples whose true ADHD label and true sex label are both 1 count with
    weight 2; every other sample counts with weight 1. The same weights are
    used for both targets.

    All four inputs are converted to NumPy arrays and compared by POSITION, so
    lists, arrays and pandas Series (with any index) are handled identically.

    Parameters
    ----------
    y_true_adhd, y_true_sex : array-like of 0/1
        True labels of the two targets.
    y_pred_adhd, y_pred_sex : array-like of 0/1
        Predicted labels of the two targets, same length as the true labels.

    Returns
    -------
    float
        (F1_adhd + F1_sex) / 2. A target's F1 is 0.0 when it has no weighted
        predicted positives, no weighted actual positives, or no true positives.
    """
    true_adhd = np.asarray(y_true_adhd)
    pred_adhd = np.asarray(y_pred_adhd)
    true_sex = np.asarray(y_true_sex)
    pred_sex = np.asarray(y_pred_sex)

    # Weight 2 where both true labels are 1, else weight 1.
    weights = np.where((true_adhd == 1) & (true_sex == 1), 2, 1)

    def compute_f1(y_true, y_pred, weights):
        # Weighted confusion-matrix cells, computed with boolean masks.
        TP = weights[(y_true == 1) & (y_pred == 1)].sum()
        FP = weights[(y_true == 0) & (y_pred == 1)].sum()
        FN = weights[(y_true == 1) & (y_pred == 0)].sum()

        # Precision or recall undefined -> score 0.
        if TP + FP == 0 or TP + FN == 0:
            return 0.0

        precision = TP / (TP + FP)
        recall = TP / (TP + FN)
        if precision + recall == 0:
            return 0.0
        return float(2 * precision * recall / (precision + recall))

    f1_adhd = compute_f1(true_adhd, pred_adhd, weights)
    f1_sex = compute_f1(true_sex, pred_sex, weights)

    # Final F1 on the leaderboard
    return (f1_adhd + f1_sex) / 2

In [None]:
# Threshold search set-up: 101 evenly spaced candidate thresholds in [0, 1],
# zeroed "best so far" trackers, and empty lists for the grid-search results.
thresholds = np.linspace(0, 1, num=101)
best_score = best_t1 = best_t2 = 0
score_val = []
t1_val = []
t2_val = []

In [None]:
# True labels of each target, and the matching out-of-fold probabilities.
y_adhd, y_sex = y_train['ADHD_Outcome'], y_train['Sex_F']
probs_adhd, probs_sex = oof_preds_adhd, oof_preds_sex

In [None]:
# Exhaustive grid search over both decision thresholds on the out-of-fold
# probabilities. The result lists are (re)created here so that re-running this
# cell does not append a second copy of every grid point.
score_val, t1_val, t2_val = [], [], []

for t1 in thresholds:
  # ADHD predictions depend only on t1, so compute them once per outer step.
  preds_adhd = (probs_adhd >= t1).astype(int)
  for t2 in thresholds:
    preds_sex = (probs_sex >= t2).astype(int)

    weighted_f1 = weighted_f1_score(y_adhd, preds_adhd, y_sex, preds_sex)
    score_val.append(weighted_f1)
    t1_val.append(t1)
    t2_val.append(t2)

In [None]:
# One row per (t1, t2) grid point with its weighted F1 score.
df_scores = pd.DataFrame(dict(f1=score_val, t1=t1_val, t2=t2_val))

In [None]:
# Best-scoring threshold pairs first.
df_scores = df_scores.sort_values('f1', ascending=False)

In [None]:
# Inspect the 50 best threshold pairs.
df_scores.head(50)

Unnamed: 0,f1,t1,t2
1535,0.751632,0.15,0.2
1534,0.751576,0.15,0.19
1333,0.751524,0.13,0.2
222,0.751474,0.02,0.2
727,0.751474,0.07,0.2
626,0.751474,0.06,0.2
525,0.751474,0.05,0.2
121,0.751474,0.01,0.2
424,0.751474,0.04,0.2
323,0.751474,0.03,0.2


In [None]:
# ADHD threshold: median t1 over the 50 best-scoring grid points.
t_adhd = df_scores['t1'].head(50).median()

In [None]:
# Sex threshold: median t2 over the 50 best-scoring grid points.
t_sex = df_scores['t2'].head(50).median()

In [None]:
# Binarise the fold-averaged test probabilities with the chosen thresholds
# (1 when the probability reaches the threshold, otherwise 0).
preds_test_adhd = np.where(test_preds_adhd >= t_adhd, 1, 0)
preds_test_sex = np.where(test_preds_sex >= t_sex, 1, 0)

In [None]:
# Empty frame that the next cell fills with the submission columns.
final = pd.DataFrame()

In [None]:
# Submission columns, in order: participant id, predicted sex, predicted ADHD.
# Predictions are row-aligned with df_test_merged because X_test was built from it.
final = final.assign(
    participant_id=df_test_merged['participant_id'],
    Sex_F=preds_test_sex,
    ADHD_Outcome=preds_test_adhd,
)

In [None]:
# Write the submission to the current working directory, without the index column.
final.to_csv('EnsembleClassifier_30Apr.csv', index=False)