# Unified Model Training Notebook
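This notebook merges the training of the Random Forest (RF), Support Vector Classifier (SVC), and XGBoost (XGB) models into a single workflow. All preprocessing steps are shared across the models. Each model is evaluated on two tasks: predicting `ADHD_Outcome` from the sociodemographic data and predicting `Sex_F` from the functional connectome data.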

In [None]:
# Import Required Libraries
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

## Data Loading and Preprocessing

In [None]:
# Load training data: categorical and quantitative metadata, the functional
# connectome feature table, and the label table.
cat_metadata = pd.read_excel('widsdatathon2025/TRAIN_NEW/TRAIN_CATEGORICAL_METADATA_new.xlsx')
quant_metadata = pd.read_excel('widsdatathon2025/TRAIN_NEW/TRAIN_QUANTITATIVE_METADATA_new.xlsx')
fmri_connectome = pd.read_csv('widsdatathon2025/TRAIN_NEW/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv')
solutions = pd.read_excel('widsdatathon2025/TRAIN_NEW/TRAINING_SOLUTIONS.xlsx')

# One hot encoding for categorical data.
# Only integer-typed columns are cast to 'category'. get_dummies expands
# category/object/string columns and passes every other column through as-is.
for col in cat_metadata.select_dtypes(include='int').columns:
    cat_metadata[col] = cat_metadata[col].astype('category')
# Every column except the first is handed to the encoder.
# NOTE(review): assumes the first column is the 'participant_id' key — confirm.
columns_to_encode = cat_metadata.columns[1:].tolist()
# dtype=int makes get_dummies emit 0/1 indicator columns directly. This
# replaces the former element-wise bool -> int pass, which relied on
# DataFrame.applymap (deprecated since pandas 2.1).
cat_encoded = pd.get_dummies(cat_metadata[columns_to_encode], drop_first=True, dtype=int)
cat_final = pd.concat([cat_metadata.drop(columns=columns_to_encode), cat_encoded], axis=1)

# Merge categorical, quantitative, and solution data.
# pd.merge defaults to an inner join, so only participants present in every
# merged table are kept.
sociodemograph = pd.merge(cat_final, quant_metadata, on='participant_id')
sociodemograph = pd.merge(sociodemograph, solutions, on='participant_id')
connectome = pd.merge(fmri_connectome, solutions, on='participant_id')

In [None]:
# Fill missing values in categorical columns with mode.
# Names that are not present in `sociodemograph` are skipped silently by the
# membership check below.
# NOTE(review): presumably these are the categorical columns that were not
# expanded into dummy columns upstream — confirm the names still exist.
cat_cols = [
    'PreInt_Demos_Fam_Child_Ethnicity', 'PreInt_Demos_Fam_Child_Race',
    'MRI_Track_Scan_Location', 'Barratt_Barratt_P1_Edu', 'Barratt_Barratt_P1_Occ',
    'Barratt_Barratt_P2_Edu', 'Barratt_Barratt_P2_Occ'
]
for col in cat_cols:
    if col in sociodemograph.columns:
        col_modes = sociodemograph[col].mode()
        # mode() is empty when the column holds only missing values; there is
        # nothing to impute with in that case, so leave the column untouched
        # instead of failing on the [0] lookup.
        if not col_modes.empty:
            sociodemograph[col] = sociodemograph[col].fillna(col_modes.iloc[0])

# Fill missing values in quantitative columns with mean.
quant_cols = [
    'EHQ_EHQ_Total', 'ColorVision_CV_Score', 'APQ_P_APQ_P_CP', 'APQ_P_APQ_P_ID',
    'APQ_P_APQ_P_INV', 'APQ_P_APQ_P_OPD', 'APQ_P_APQ_P_PM', 'APQ_P_APQ_P_PP',
    'SDQ_SDQ_Conduct_Problems', 'SDQ_SDQ_Difficulties_Total', 'SDQ_SDQ_Emotional_Problems',
    'SDQ_SDQ_Externalizing', 'SDQ_SDQ_Generating_Impact', 'SDQ_SDQ_Hyperactivity',
    'SDQ_SDQ_Internalizing', 'SDQ_SDQ_Peer_Problems', 'SDQ_SDQ_Prosocial',
    'MRI_Track_Age_at_Scan'
]
for col in quant_cols:
    if col in sociodemograph.columns:
        # Assign the result back rather than calling fillna(inplace=True) on
        # the selected column: the chained in-place form emits a FutureWarning
        # on pandas 2.x and does not modify the frame under Copy-on-Write.
        sociodemograph[col] = sociodemograph[col].fillna(sociodemograph[col].mean())

## Feature Selection (Manual)

In [None]:
# Hand-picked columns to exclude from the sociodemographic table.
features_to_remove = [
    "Basic_Demos_Study_Site_2",
    "Basic_Demos_Study_Site_3",
    "Basic_Demos_Study_Site_4",
    "MRI_Track_Scan_Location",
]
# errors='ignore' makes the drop skip any listed name that is not a column
# of the frame instead of raising.
sociodemograph_manual = sociodemograph.drop(features_to_remove, axis=1, errors='ignore')

In [None]:
# Manual feature selection for connectivity data
import re

# Hand-picked index numbers, kept as strings because they are compared with
# the digit groups found in the connectome column names.
manual_conn_indices = [
    '76', '185', '75', '184', '80', '81', '82', '187', '188', '189', '77', '78', '79', '186', '191',
    '88', '89', '90', '91', '192', '87', '83', '84', '85', '86', '190', '92', '93', '94', '95', '193',
    '96', '194', '98', '196', '97', '195', '61', '167', '58', '59', '60', '165', '166', '63', '64',
    '65', '168', '169', '62', '66', '170', '57', '68', '173', '174', '175', '176', '180', '69', '70',
    '71', '177', '178', '179', '67', '171', '172', '74', '183', '72', '73', '181', '182', '31', '132',
    '32', '33', '34', '133', '134', '135', '136', '29', '30', '131', '39', '141', '35', '36', '37',
    '38', '137', '138', '139', '140']

# The id column and both label columns are always carried along.
# NOTE(review): 'ADHD_Outcome' therefore stays in this table next to 'Sex_F';
# confirm downstream code really wants the other task's label as a column.
conn_manual_cols = ['participant_id', 'Sex_F', 'ADHD_Outcome']
conn_feature_cols = [col for col in connectome.columns if col not in ['participant_id', 'Sex_F', 'ADHD_Outcome']]

# A column is selected when any complete number in its name (a maximal run of
# digits) is one of the chosen indices. Tokenising the whole name checks every
# number in it; the previous substring search only inspected the first
# occurrence of each index and so could miss a standalone index that appeared
# after an embedded one (e.g. '76' in a name that also contains '176').
wanted_indices = set(manual_conn_indices)
digit_run = re.compile(r'\d+')
conn_manual_cols.extend(
    col for col in conn_feature_cols
    if any(token in wanted_indices for token in digit_run.findall(col))
)
# De-duplicate while preserving order. list(set(...)) would reorder the
# columns according to string hashing, which varies between interpreter runs
# and would make the feature order (and thus model fits) non-reproducible.
conn_manual_cols = list(dict.fromkeys(conn_manual_cols))
conn_manual_single = connectome[conn_manual_cols]

## Model Training Functions

In [None]:
# Task 1 - sociodemographic table: the target is ADHD_Outcome (cast to int);
# the features are every remaining column except the participant id.
y_soc = sociodemograph_manual['ADHD_Outcome'].astype(int)
X_soc = sociodemograph_manual.drop(columns=['participant_id', 'ADHD_Outcome'])

# Task 2 - selected connectome columns: the target is Sex_F; the features are
# every remaining column except the participant id.
y_conn = conn_manual_single['Sex_F']
X_conn = conn_manual_single.drop(columns=['participant_id', 'Sex_F'])
# NOTE(review): only the task's own label is dropped from each feature matrix,
# so X_conn still contains 'ADHD_Outcome' and X_soc still contains any 'Sex_F'
# column merged in from the solutions table — confirm this is intended.

# One estimator per model family, all seeded with random_state=42.
models = dict(
    RF=RandomForestClassifier(random_state=42, class_weight='balanced'),
    SVC=SVC(random_state=42, kernel='linear', class_weight='balanced'),
    XGB=XGBClassifier(
        random_state=42,
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False,
    ),
)

# NOTE(review): `cross_val_model` is not defined in the visible part of this
# notebook. From its use here it is called as cross_val_model(model, X, y) and
# its result unpacks into two values, the second being a collection of F1
# scores — confirm where it is defined before running this cell.
results = {}
for name, model in models.items():
    # ADHD task; the F1 scores are stored under the key used when saving.
    print(f'\n=== {name} on sociodemographic data (ADHD_Outcome) ===')
    acc, f1 = cross_val_model(model, X_soc, y_soc)
    f1_mean, f1_std = np.mean(f1), np.std(f1)
    print(f'F1-score mean: {f1_mean:.4f}, std: {f1_std:.4f}')
    results[f'sociodemograph_manual_single_{name}_adhd'] = f1

    # Sex task, using the same estimator object.
    print(f'\n=== {name} on connectivity data (Sex_F) ===')
    acc, f1 = cross_val_model(model, X_conn, y_conn)
    f1_mean, f1_std = np.mean(f1), np.std(f1)
    print(f'F1-score mean: {f1_mean:.4f}, std: {f1_std:.4f}')
    results[f'conn_manual_single_{name}_sex'] = f1

## Train and Evaluate All Models

In [None]:
# NOTE(review): `train_final_demo_selected`, `train_final_connett_selected` and
# `cross_val_model` are not defined anywhere in the visible notebook, so this
# cell raises NameError as written; it looks like a leftover from a pre-merge
# notebook — confirm whether it should be removed or rewired.
# It also rebinds `X_conn`, `y_conn`, `models` and `results`, and its result
# keys ('<model>_ADHD' / '<model>_Sex') do not match the key patterns checked
# when the results are saved, so they would be written with the generic
# 'F1_score' column name.

# Demographic data: ADHD_Outcome
X_demo = train_final_demo_selected.drop(columns=['participant_id', 'ADHD_Outcome'])
y_demo = train_final_demo_selected['ADHD_Outcome'].astype(int)

# Connectivity data: Sex_F
X_conn = train_final_connett_selected.drop(columns=['participant_id', 'Sex_F'])
y_conn = train_final_connett_selected['Sex_F']

# Models
models = {
    'RandomForest': RandomForestClassifier(class_weight='balanced', random_state=42),
    'SVC': SVC(kernel='linear', class_weight='balanced', random_state=42),
    'XGBoost': XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=42)
}

results = {}
for name, model in models.items():
    # The banner strings were split across physical lines, which is a syntax
    # error for a single-quoted f-string; the leading line break is restored
    # as an explicit '\n' escape.
    print(f'\n=== {name} on Demographic Data (ADHD_Outcome) ===')
    acc, f1 = cross_val_model(model, X_demo, y_demo)
    print(f'F1-score mean: {np.mean(f1):.4f}, std: {np.std(f1):.4f}')
    results[f'{name}_ADHD'] = f1
    print(f'\n=== {name} on Connectivity Data (Sex_F) ===')
    acc, f1 = cross_val_model(model, X_conn, y_conn)
    print(f'F1-score mean: {np.mean(f1):.4f}, std: {np.std(f1):.4f}')
    results[f'{name}_Sex'] = f1

## Save Results

In [None]:
# Write one CSV per entry of `results` into results_paper/, with the score
# column named after the task so the files follow the naming convention of the
# figures notebook.
import os

os.makedirs('results_paper', exist_ok=True)

# Model markers that must appear in a key for a task-specific column name.
model_tags = ('_SVC_', '_RF_', '_XGB_')

for key, f1_list in results.items():
    has_model_tag = any(tag in key for tag in model_tags)
    if has_model_tag and 'conn_manual_single' in key:
        colname = 'F1_sex_score'
    elif has_model_tag and 'sociodemograph_manual_single' in key:
        colname = 'F1_adhd_score'
    else:
        # Any key that does not match the patterns above gets a generic name.
        colname = 'F1_score'
    out_path = f'results_paper/{key}.csv'
    df = pd.DataFrame({colname: f1_list})
    df.to_csv(out_path, index=False)
    print(f'Saved: {out_path}')