# Import packages

In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from hyperopt import fmin, tpe, hp
from sklearn.metrics import accuracy_score
from hyperopt import STATUS_OK
from google.colab import drive

# Get data

In [22]:
# Mount Google Drive so the dataset files under /content/drive are reachable
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
# List the contents of the project folder on the mounted Drive (shell command)
!ls "/content/drive/My Drive/dreamAD"

'001_feature_selection copy2.ipynb'   final_merged_data_MTG.csv
'001_feature_selection copy.ipynb'    first_feat_sel
 filtered_links_A9.csv		      hiperparametros_optimizados_a9.csv
 filtered_links_MTG.csv		      hiperparametros_optimizados_mtg.csv
 final_datasets			      literature_feature_selection
 final_merged_data_A9.csv	      modified_metadata.csv
 final_merged_data.csv


In [46]:
# Load the dataset CSV from the mounted Drive into a DataFrame
file_path = '/content/drive/My Drive/dreamAD/final_datasets/dataset_mtg.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [25]:
# Preview the first five rows of the loaded table
data.head(n=5)

Unnamed: 0,AC020704.1_AC104689.2_L5-IT,AC008415.1_AC104689.2_L5-IT,AC019211.1_AC091078.1_L5-IT,AC008415.1_AC019211.1_L5-IT,AC019211.1_AC104689.2_L5-IT,AC008415.1_AL033539.2_L5-IT,AC019211.1_AC020704.1_L5-IT,AC008415.1_AC020704.1_L5-IT,AC024901.1_KCNIP4_L2-3-IT,CALCRL_KCNIP4_L2-3-IT,...,LATE_2,LATE_3,LATE_4,Lewy_0,Lewy_1,Lewy_2,Lewy_3,Lewy_4,Lewy_5,Lewy_6
0,,0.027397,,0.02146,0.014194,,0.005928,0.010641,,,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,,0.014107,0.003692,0.008395,0.009133,,0.009623,0.02252,0.004335,0.017046,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,,0.028872,0.022504,0.057016,0.052568,,0.010126,,0.003821,0.002972,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,,,,0.025583,0.013749,,0.019403,,,0.002686,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,,,,,,,,,,0.009475,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [33]:
# Target columns, assembled from two groups:
#   1. four columns addressed directly by name
#   2. six columns that follow the pattern 'percent <marker> positive area'
_named_targets = ['Thal', 'Braak', 'CERAD', 'ADNC']
_area_markers = ['6e10', 'AT8', 'NeuN', 'GFAP', 'aSyn', 'pTDP43']
target_cols = _named_targets + [
    f'percent {marker} positive area' for marker in _area_markers
]

In [34]:
# Report the dtype and the number of non-null entries of every target column,
# to spot inconsistencies such as NaNs before modelling
print("Information about the target columns:\n")
for col in target_cols:
    target_series = data[col]
    report_lines = (
        f"Column: '{col}'",
        f"Data type: {target_series.dtype}",
        # trailing newline leaves one blank line between columns
        f"# of non-nulls: {target_series.count()}\n",
    )
    print(*report_lines, sep="\n")


Information about the target columns:

Column: 'Thal'
Data type: float64
# of non-nulls: 84

Column: 'Braak'
Data type: float64
# of non-nulls: 84

Column: 'CERAD'
Data type: float64
# of non-nulls: 84

Column: 'ADNC'
Data type: float64
# of non-nulls: 84

Column: 'percent 6e10 positive area'
Data type: float64
# of non-nulls: 84

Column: 'percent AT8 positive area'
Data type: float64
# of non-nulls: 84

Column: 'percent NeuN positive area'
Data type: float64
# of non-nulls: 84

Column: 'percent GFAP positive area'
Data type: float64
# of non-nulls: 84

Column: 'percent aSyn positive area'
Data type: float64
# of non-nulls: 84

Column: 'percent pTDP43 positive area'
Data type: float64
# of non-nulls: 84



In [35]:
# Feature table = every column of `data` except the targets and the 'Donor ID' column
columns_to_drop = [*target_cols, 'Donor ID']
# errors='ignore' skips any listed column that is absent from the DataFrame instead of raising
X_features = data.drop(labels=columns_to_drop, axis='columns', errors='ignore')

In [36]:
# Hyperparameter search space: 3 values for each of 5 parameters (3**5 = 243 combinations)
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
)

In [37]:
# Accumulator for the per-target tuning results (one dict per target variable)
results = list()

In [38]:
# Base regressor (squared-error objective) and the exhaustive grid search wrapped around it.
# The search uses 7-fold cross-validation on all available cores; no `scoring`
# is passed, so the estimator's own default score is used to rank candidates.
model = xgb.XGBRegressor(objective='reg:squarederror')
grid_search = GridSearchCV(model, param_grid, cv=7, n_jobs=-1, verbose=1)

In [39]:
#Loop for each target variable
# Tune the XGBoost regressor separately for every target column and collect
# the best parameters / best cross-validation score per target in `hyper_df`.

# Start from an empty list every time this cell runs: otherwise re-executing
# the cell would append a second copy of every target's row to `results`
# and `hyper_df` would contain duplicates.
results = []

for target_name in target_cols:
    print(f"--- Hyperparameterisation for the target variable: {target_name} ---")

    # Target column for this iteration
    y = data[target_name]

    # Keep only the rows where this target is present
    valid_indices = y.dropna().index

    if len(valid_indices) < 2:
        print(f"There is insufficient data for {target_name}. Skipping this variable.")
        continue

    X_cleaned = X_features.loc[valid_indices]
    y_cleaned = y.loc[valid_indices]

    try:
        # 80/20 split with a fixed seed. Only the training part is searched;
        # NOTE(review): X_test / y_test are held out but never evaluated in this cell.
        X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=42)
    except ValueError as e:
        print(f"Error en train_test_split for {target_name}: {e}. Skipping.")
        continue

    # Exhaustive search over `param_grid` using the shared `grid_search` object
    grid_search.fit(X_train, y_train)

    # best_score_ is the mean cross-validated score of the winning candidate
    # (computed on the training folds only, not on the held-out split)
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # One flat record per target: target name, score, then one key per tuned parameter
    row = {'target': target_name, 'best_score': best_score}
    row.update(best_params)
    results.append(row)

    print(f"Best parameters for  {target_name}: {best_params}")
    print(f"Best cross-validation score for {target_name}: {best_score:.4f}\n")

# Build the summary table once, from the list of records
hyper_df = pd.DataFrame(results)
print("Hyperparameterisation results saved in object 'hyper_df'")

--- Hyperparameterisation for the target variable: Thal ---
Fitting 7 folds for each of 243 candidates, totalling 1701 fits
Best parameters for  Thal: {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.8}
Best cross-validation score for Thal: 0.1468

--- Hyperparameterisation for the target variable: Braak ---
Fitting 7 folds for each of 243 candidates, totalling 1701 fits
Best parameters for  Braak: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'subsample': 0.8}
Best cross-validation score for Braak: 0.1645

--- Hyperparameterisation for the target variable: CERAD ---
Fitting 7 folds for each of 243 candidates, totalling 1701 fits
Best parameters for  CERAD: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.7}
Best cross-validation score for CERAD: 0.2050

--- Hyperparameterisation for the target variable: ADNC ---
Fitting 7 folds for each of 243 

In [40]:
# Display the per-target tuning summary (one row per target variable)
hyper_df

Unnamed: 0,target,best_score,colsample_bytree,learning_rate,max_depth,n_estimators,subsample
0,Thal,0.1468302,0.7,0.05,7,100,0.8
1,Braak,0.1644666,0.9,0.1,7,300,0.8
2,CERAD,0.2050046,0.8,0.05,7,100,0.7
3,ADNC,0.1829452,0.9,0.1,3,100,0.7
4,percent 6e10 positive area,-0.1753088,0.8,0.01,3,100,0.8
5,percent AT8 positive area,-0.1938103,0.7,0.01,3,100,0.7
6,percent NeuN positive area,0.07683567,0.7,0.05,7,100,0.7
7,percent GFAP positive area,-0.2399026,0.8,0.01,3,100,0.8
8,percent aSyn positive area,0.1647521,0.9,0.01,5,200,0.9
9,percent pTDP43 positive area,-11442050.0,0.9,0.01,3,200,0.9


In [41]:
import os

# Show the Drive path of the hyperparameter CSV (the value is only displayed, not stored)
os.path.join(
    "/content/drive/My Drive/dreamAD",
    "hiperparametros_optimizados_mtg.csv",
)

'/content/drive/My Drive/dreamAD/hiperparametros_optimizados_mtg.csv'

In [42]:
# Write the tuning summary to Drive without the index column.
# The results DataFrame in this notebook is `hyper_df` (built from `results`);
# `results_df` is never defined and raised NameError on a fresh run.
hyper_df.to_csv("/content/drive/My Drive/dreamAD/hiperparametros_optimizados_mtg.csv", index=False)