In [112]:
import os
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneGroupOut

In [None]:
#savedir = '/neurospin/dico/data/deep_folding/current/models/Champollion_V0_trained_on_UKB40/'
#savedir = '/neurospin/dico/data/deep_folding/current/models/Champollion_V1_after_ablation_latent_256/'
savedir = '/neurospin/dico/data/deep_folding/current/models/Champollion_V1_after_ablation/'
#dfs_dir = '/neurospin/dico/data/deep_folding/current/models/Champollion_V0_trained_on_UKB40/embeddings/ukb40_epoch80_embeddings'
#dfs_dir = '/neurospin/dico/data/deep_folding/current/models/Champollion_V1_after_ablation_latent_256/embeddings/ukb40_epoch80_embeddings'
n_dims = 32
#nb_subs = None
max_iter = 100
#metric = 'roc_auc_ovr_weighted'
metric = 'roc_auc'
n_components = 0.999

regions_to_treat = ['SOr_left',
 'SOr_right',
 'FColl-SRh_left',
 'SFmedian-SFpoltr-SFsup_left',
 'SFinf-BROCA-SPeCinf_left',
 'SPoC_left',
 'fronto-parietal_medial_face_left',
 'FIP_left',
 'CINGULATE_left',
 'SC-SPoC_left',
 'SFinter-SFsup_left',
 'FCMpost-SpC_left',
 'SsP-SPaint_left',
 'SOr-SOlf_left',
 'FPO-SCu-ScCal_left',
 'LARGE_CINGULATE_left',
 'SFmarginal-SFinfant_left',
 'SFint-FCMant_left',
 'STi-STs-STpol_left',
 'SFint-SR_left',
 'Lobule_parietal_sup_left',
 'STi-SOTlat_left',
 'SPeC_left',
 'STsbr_left',
 'ScCal-SLi_left',
 'STs_left',
 'FCLp-subsc-FCLa-INSULA_left',
 'SC-sylv_left',
 'SC-SPeC_left',
 'OCCIPITAL_left',
 'FColl-SRh_right',
 'SFmedian-SFpoltr-SFsup_right',
 'SFinf-BROCA-SPeCinf_right',
 'SPoC_right',
 'fronto-parietal_medial_face_right',
 'FIP_right',
 'CINGULATE_right',
 'SC-SPoC_right',
 'SFinter-SFsup_right',
 'FCMpost-SpC_right',
 'SsP-SPaint_right',
 'SOr-SOlf_right',
 'FPO-SCu-ScCal_right',
 'LARGE_CINGULATE_right',
 'SFmarginal-SFinfant_right',
 'SFint-FCMant_right',
 'STi-STs-STpol_right',
 'SFint-SR_right',
 'Lobule_parietal_sup_right',
 'STi-SOTlat_right',
 'SPeC_right',
 'STsbr_right',
 'ScCal-SLi_right',
 'STs_right',
 'FCLp-subsc-FCLa-INSULA_right',
 'SC-sylv_right',
 'SC-SPeC_right',
 'OCCIPITAL_right']

# Load UKB embeds to perform PCA

In [83]:
# fit PCA and standard scaler on UKB
dataset = 'ukb40'
keywords = ['_left', '_right']
all_matches = []
for var in keywords:
    pattern = f"{savedir}*{var}/*/{dataset}_random_embeddings/train_val_embeddings.csv" ## TODO : Make sure only one model per region, take highest epoch, and print
    matches = glob.glob(pattern)
    all_matches.extend(matches)
# Optional: remove duplicates
dfs_dirs = list(set(all_matches))
dfs_dirs.sort()

regions_treated = [elem.split(savedir)[1].split('/')[0] for elem in dfs_dirs]
print(regions_treated)
print(f'Number of regions treated : {len(regions_treated)}')
print(f'Missing regions : {set(regions_to_treat) - set(regions_treated)}')

embd_list = []
print('Loading the embeddings...')
for i, directory in enumerate(tqdm(dfs_dirs)):
    embd=pd.read_csv(directory)
    embd.columns = ['ID'] + [f'dim_{i}_{j}' for j in range(n_dims)]
    embd_list.append(embd)
## merge all the dfs
print('Merging the embeddings...')
embd = embd_list[0]
for i in tqdm(range(1, len(embd_list))):
    embd = pd.merge(embd, embd_list[i], on='ID', how='outer')
# drop all 'ID' columns but the first
embd = embd.loc[:, ~embd.columns.duplicated()]

['CINGULATE_left', 'CINGULATE_right', 'FCLp-subsc-FCLa-INSULA_left', 'FCLp-subsc-FCLa-INSULA_right', 'FCMpost-SpC_left', 'FCMpost-SpC_right', 'FColl-SRh_left', 'FColl-SRh_right', 'FIP_left', 'FIP_right', 'FPO-SCu-ScCal_left', 'FPO-SCu-ScCal_right', 'LARGE_CINGULATE_left', 'LARGE_CINGULATE_right', 'Lobule_parietal_sup_left', 'Lobule_parietal_sup_right', 'OCCIPITAL_left', 'OCCIPITAL_right', 'SC-SPeC_left', 'SC-SPeC_right', 'SC-SPoC_left', 'SC-SPoC_right', 'SC-sylv_left', 'SC-sylv_right', 'SFinf-BROCA-SPeCinf_left', 'SFinf-BROCA-SPeCinf_right', 'SFint-FCMant_left', 'SFint-FCMant_right', 'SFint-SR_left', 'SFint-SR_right', 'SFinter-SFsup_left', 'SFinter-SFsup_right', 'SFmarginal-SFinfant_left', 'SFmarginal-SFinfant_right', 'SFmedian-SFpoltr-SFsup_left', 'SFmedian-SFpoltr-SFsup_right', 'SOr-SOlf_left', 'SOr-SOlf_right', 'SOr_left', 'SOr_right', 'SPeC_left', 'SPeC_right', 'SPoC_left', 'SPoC_right', 'STi-SOTlat_left', 'STi-SOTlat_right', 'STi-STs-STpol_left', 'STi-STs-STpol_right', 'STs_left',

100%|██████████| 58/58 [00:11<00:00,  5.13it/s]


Merging the embeddings...


100%|██████████| 57/57 [00:22<00:00,  2.54it/s]


In [84]:
embd

Unnamed: 0,ID,dim_0_0,dim_0_1,dim_0_2,dim_0_3,dim_0_4,dim_0_5,dim_0_6,dim_0_7,dim_0_8,...,dim_57_22,dim_57_23,dim_57_24,dim_57_25,dim_57_26,dim_57_27,dim_57_28,dim_57_29,dim_57_30,dim_57_31
0,sub-1000021,11.229898,29.822453,-4.026312,3.358199,0.231886,1.422046,-18.252083,12.020048,39.133064,...,-110.485040,-11.918763,-19.310192,-60.771812,44.121420,44.044420,6.857598,1.856153,-89.517800,-75.988945
1,sub-1000325,-31.725147,-0.395030,-0.939265,1.984952,47.066414,-37.844696,11.580789,8.266258,-1.891401,...,-3.287565,-64.797554,-16.065058,42.844530,97.249886,29.268694,-25.276054,-71.845215,-18.877314,21.536497
2,sub-1000458,-14.268427,13.239408,-7.325253,-18.616476,29.325350,3.314644,-7.072170,48.857030,-46.308220,...,93.527120,3.516595,-30.533266,-18.535421,28.811523,8.226424,4.597275,18.266330,-83.983820,35.153378
3,sub-1000575,-13.318561,42.696262,-25.401484,0.247414,-8.130686,0.259071,15.793086,2.559356,-76.482475,...,-48.908123,4.483787,31.965273,-38.034103,39.104008,56.355297,17.254475,-39.391186,-26.785755,29.222292
4,sub-1000606,-11.812742,-19.045920,-39.806570,-1.982969,-14.401817,24.759258,27.615164,21.279202,-44.039230,...,-84.282455,-24.559574,4.021320,-38.716858,-12.204517,7.535121,-39.761997,-13.399411,-131.010500,-39.199780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42428,sub-6023847,33.613890,-23.557701,7.175593,9.799522,-9.613125,-4.278757,-30.092983,8.539750,-43.421410,...,-57.007263,18.685423,48.098194,-22.259700,-4.153656,5.956959,-89.351170,-48.549694,-72.186935,25.574633
42429,sub-6024038,11.936286,-44.121895,-28.201280,-13.278025,9.748560,-11.524530,-6.065997,-41.201775,-65.835770,...,48.694958,22.324165,-62.345936,9.976994,38.015446,28.031918,44.732548,-4.998083,9.547419,48.156914
42430,sub-6024150,-26.452390,43.172460,-21.045660,-17.208462,-18.573729,-5.742548,1.993332,14.503933,-43.894146,...,-19.296130,-20.586903,-9.267251,-37.462880,-73.314026,-3.572524,49.133590,-23.293330,-20.440924,54.866966
42431,sub-6024379,17.589401,19.859537,-16.085007,1.269508,13.019363,-8.146880,-43.884666,17.929020,-54.596516,...,-57.203580,12.706079,-7.453095,22.556630,-9.304359,21.675316,18.480112,17.824127,-65.650390,43.339450


In [85]:
# fit standard scaler on embd
std = StandardScaler()
embd_matrix = embd.loc[:, embd.columns.str.startswith('dim')]
std.fit(embd_matrix)

In [86]:
# fit PCA on embd
pca = PCA(n_components=n_components)
pca.fit(std.transform(embd_matrix))

In [87]:
reduced_matrix = pca.transform((std.transform(embd_matrix)))

In [88]:
reduced_matrix.shape

(42433, 1290)

# Load data and labels

In [89]:
dataset = 'agg_abide'

keywords = ['_left', '_right']
all_matches = []
for var in keywords:
    pattern = f"{savedir}*{var}/*/{dataset}_random_embeddings/full_embeddings.csv" ## TODO : Make sure only one model per region, take highest epoch, and print
    matches = glob.glob(pattern)
    all_matches.extend(matches)
# Optional: remove duplicates
dfs_dirs = list(set(all_matches))
dfs_dirs.sort()

regions_treated = [elem.split(savedir)[1].split('/')[0] for elem in dfs_dirs]
print(regions_treated)
print(f'Number of regions treated : {len(regions_treated)}')
print(f'Missing regions : {set(regions_to_treat) - set(regions_treated)}')

# load labels
labels1 = pd.read_csv('/neurospin/dico/data/deep_folding/current/datasets/abide1/20231108_participants.tsv', usecols=['participant_id', 'diagnosis'], sep='\t')
labels2 = pd.read_csv('/neurospin/dico/data/deep_folding/current/datasets/abide2/20231108_participants.tsv', usecols=['participant_id', 'diagnosis'], sep='\t')
labels = pd.concat([labels1, labels2], axis=0)
labels.columns = ['ID'] + labels.columns[1:].tolist()
label = 'diagnosis'

## load and rename the columns of each df
embd_list = []
print('Loading the embeddings...')
for i, directory in enumerate(tqdm(dfs_dirs)):
    embd=pd.read_csv(directory)
    embd.columns = ['ID'] + [f'dim_{i}_{j}' for j in range(n_dims)]
    embd_list.append(embd)
## merge all the dfs
print('Merging the embeddings...')
embd = embd_list[0]
for i in tqdm(range(1, len(embd_list))):
    embd = pd.merge(embd, embd_list[i], on='ID', how='outer')
# drop all 'ID' columns but the first
embd = embd.loc[:, ~embd.columns.duplicated()]
embd['ID'] = embd['ID'].astype(str)

['CINGULATE_left', 'CINGULATE_right', 'FCLp-subsc-FCLa-INSULA_left', 'FCLp-subsc-FCLa-INSULA_right', 'FCMpost-SpC_left', 'FCMpost-SpC_right', 'FColl-SRh_left', 'FColl-SRh_right', 'FIP_left', 'FIP_right', 'FPO-SCu-ScCal_left', 'FPO-SCu-ScCal_right', 'LARGE_CINGULATE_left', 'LARGE_CINGULATE_right', 'Lobule_parietal_sup_left', 'Lobule_parietal_sup_right', 'OCCIPITAL_left', 'OCCIPITAL_right', 'SC-SPeC_left', 'SC-SPeC_right', 'SC-SPoC_left', 'SC-SPoC_right', 'SC-sylv_left', 'SC-sylv_right', 'SFinf-BROCA-SPeCinf_left', 'SFinf-BROCA-SPeCinf_right', 'SFint-FCMant_left', 'SFint-FCMant_right', 'SFint-SR_left', 'SFint-SR_right', 'SFinter-SFsup_left', 'SFinter-SFsup_right', 'SFmarginal-SFinfant_left', 'SFmarginal-SFinfant_right', 'SFmedian-SFpoltr-SFsup_left', 'SFmedian-SFpoltr-SFsup_right', 'SOr-SOlf_left', 'SOr-SOlf_right', 'SOr_left', 'SOr_right', 'SPeC_left', 'SPeC_right', 'SPoC_left', 'SPoC_right', 'STi-SOTlat_left', 'STi-SOTlat_right', 'STi-STs-STpol_left', 'STi-STs-STpol_right', 'STs_left',

100%|██████████| 58/58 [00:01<00:00, 56.09it/s]


Merging the embeddings...


100%|██████████| 57/57 [00:00<00:00, 90.72it/s] 


In [90]:
embd.shape

(1887, 1857)

In [91]:
## restrict embeddings to one single run
# remove the 'run' information
embd['ID'] = embd['ID'].str.split('_ses').str[0]
# remove rows with duplicate ID
embd = embd.drop_duplicates(subset=['ID'], keep='first')

In [92]:
embd.shape

(1828, 1857)

In [93]:
# select the train val subjects
train_val_subjects_dirs = glob.glob('/neurospin/dico/data/deep_folding/current/datasets/aggregate_autism/splits/train_val_*')
# load each split subject file and create cv splits from them
train_val_subjects = []
for i, directory in enumerate(train_val_subjects_dirs):
    train_val_subjects.append(pd.read_csv(directory, sep='\t', header=None))
train_val_subjects = pd.concat(train_val_subjects, axis=0)
train_val_subjects.columns = ['ID']
train_val_subjects['ID'] = train_val_subjects['ID'].astype(str)
print(len(train_val_subjects))

1462


In [94]:
# select the test subjects
test_subjects = pd.read_csv('/neurospin/dico/data/deep_folding/current/datasets/aggregate_autism/splits/internal_test.csv', header=None)
test_subjects.columns = ['ID']
test_subjects['ID'] = test_subjects['ID'].astype(str)
# select the test extra
test_extra_subjects = pd.read_csv('/neurospin/dico/data/deep_folding/current/datasets/aggregate_autism/splits/external_test.csv', header=None)
test_extra_subjects.columns = ['ID']
test_extra_subjects['ID'] = test_extra_subjects['ID'].astype(str)

In [95]:
# add a train val label
embd["train_val"] = embd['ID'].apply(lambda x: 1 if any((i in x) for i in train_val_subjects['ID'].tolist()) else 0)
# add a inter set label
embd["test"] = embd['ID'].apply(lambda x: 1 if any((i in x) for i in test_subjects['ID'].tolist()) else 0)
# add an extra set label
embd["test_extra"] = embd['ID'].apply(lambda x: 1 if any((i in x) for i in test_extra_subjects['ID'].tolist()) else 0)

In [96]:
# add label
embd = pd.merge(embd, labels, on='ID', how='left')

# Step 1 : cross validation on train_val to get the best params

In [97]:
embd_train_val = embd.loc[embd['train_val']==1]

In [98]:
# get the custom cv
splits_basedir = '/neurospin/dico/data/deep_folding/current/datasets/aggregate_autism/splits/train_val_split_'
root_dir = '/'.join(splits_basedir.split('/')[:-1])
basedir = splits_basedir.split('/')[-1]
splits_dirs = [os.path.join(root_dir,f) for f in os.listdir(root_dir) if f.startswith(basedir) and '.csv' in f]
splits_subs = [pd.read_csv(file, header=None) for file in splits_dirs]
labels = np.concatenate([[i] * len(K) for i, K in enumerate(splits_subs)])
splits_subs_and_labels = pd.concat(splits_subs)
splits_subs_and_labels.columns=['ID']
splits_subs_and_labels['ID'] = splits_subs_and_labels['ID'].astype(str)
splits_subs_and_labels['ID'] = ['sub-'+elem for elem in splits_subs_and_labels['ID']]
splits_subs_and_labels['splits'] = labels

# merge with embd
embd_train_val = pd.merge(embd_train_val, splits_subs_and_labels, on='ID', how='left')

In [99]:
embd_train_val

Unnamed: 0,ID,dim_0_0,dim_0_1,dim_0_2,dim_0_3,dim_0_4,dim_0_5,dim_0_6,dim_0_7,dim_0_8,...,dim_57_27,dim_57_28,dim_57_29,dim_57_30,dim_57_31,train_val,test,test_extra,diagnosis,splits
0,sub-28683,15.015651,6.900095,-46.592373,42.837640,33.119728,26.453585,17.152172,3.248585,-87.239040,...,-31.725200,17.846579,2.011991,-28.992413,70.237590,1,0,0,asd,8
1,sub-28683,15.015651,6.900095,-46.592373,42.837640,33.119728,26.453585,17.152172,3.248585,-87.239040,...,-31.725200,17.846579,2.011991,-28.992413,70.237590,1,0,0,asd,9
2,sub-28684,9.377574,-25.403397,-14.165288,1.883093,-6.337694,7.797254,-48.640865,-6.658917,-17.555489,...,-53.485596,11.596569,7.277382,-78.541690,-46.233390,1,0,0,asd,4
3,sub-28684,9.377574,-25.403397,-14.165288,1.883093,-6.337694,7.797254,-48.640865,-6.658917,-17.555489,...,-53.485596,11.596569,7.277382,-78.541690,-46.233390,1,0,0,asd,4
4,sub-28684,9.377574,-25.403397,-14.165288,1.883093,-6.337694,7.797254,-48.640865,-6.658917,-17.555489,...,-53.485596,11.596569,7.277382,-78.541690,-46.233390,1,0,0,asd,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,sub-51582,44.981950,13.728046,10.769737,-8.572695,-5.516324,25.242517,-41.176792,27.095789,2.380854,...,-34.776165,1.354430,6.161531,-41.733500,18.881784,1,0,0,asd,7
1455,sub-51583,-1.595289,-17.458889,-16.158890,12.156231,-27.836834,3.856123,38.375572,2.055460,-58.089980,...,50.637436,-40.214367,-8.594928,-39.673730,57.720566,1,0,0,asd,0
1456,sub-51585,3.705244,10.538881,-0.136732,20.532404,53.430218,8.831498,6.921094,-5.469935,-5.424624,...,-41.935833,-41.980442,-44.014507,42.738617,161.361150,1,0,0,asd,7
1457,sub-51606,7.465191,11.607732,9.680292,6.612815,-11.444448,-13.424266,0.185027,13.540228,4.432408,...,30.490343,-23.019007,-0.105496,-84.031720,66.739940,1,0,0,asd,2


In [100]:
X = embd_train_val.loc[:, embd_train_val.columns.str.startswith('dim')]
Y = embd_train_val.loc[:, label]

In [101]:
# apply std and pca to X
X = std.transform(X)
X = pca.transform(X)

In [102]:
X.shape

(1459, 1290)

In [103]:
groups = embd_train_val.loc[:, 'splits']
logo = LeaveOneGroupOut()
cv = [*(logo.split(X, Y, groups=groups))]

In [104]:
model = LogisticRegression(solver='saga', penalty='elasticnet', max_iter=max_iter)
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'l1_ratio': [0., 0.2, 0.4, 0.6, 0.8, 1.],
}

clf = GridSearchCV(model, param_grid, cv=cv, refit=False, scoring=metric, n_jobs=-1)
clf.fit(X, Y)
best_params = clf.best_params_
print(f'Best parameters: {best_params}')
print(f'Best score: {clf.best_score_}')



Best parameters: {'C': 0.01, 'l1_ratio': 0.0}
Best score: 0.5955758292595122




# Step 2 : use best params, train on train_val, and eval on test inter

In [105]:
model = LogisticRegression(solver='saga', penalty='elasticnet', max_iter=max_iter, C=clf.best_params_['C'], l1_ratio=clf.best_params_['l1_ratio'])
model.fit(X, Y)

In [109]:
## eval overfitting on train val set
Y_pred_proba = model.predict_proba(X)
roc_auc = roc_auc_score(Y, Y_pred_proba[:, 1])
print(f'ROC AUC on train val set: {roc_auc}')

ROC AUC on train val set: 0.9197291558034263


In [110]:
## eval on test set
embd_test = embd.loc[embd['test']==1]
X_test = embd_test.loc[:, embd_test.columns.str.startswith('dim')]
# apply std and pca to X
X_test = std.transform(X_test)
X_test = pca.transform(X_test)

Y_test = embd_test.loc[:, label]
Y_test_pred_proba = model.predict_proba(X_test)
# compute roc_auc
roc_auc = roc_auc_score(Y_test, Y_test_pred_proba[:, 1])
print(f'ROC AUC on test set: {roc_auc}')

ROC AUC on test set: 0.59344314222363


In [111]:
## eval on test extra set
embd_test_extra = embd.loc[embd['test_extra']==1]
X_test_extra = embd_test_extra.loc[:, embd_test_extra.columns.str.startswith('dim')]
# apply std and pca to X
X_test_extra = std.transform(X_test_extra)
X_test_extra = pca.transform(X_test_extra)

Y_test_extra = embd_test_extra.loc[:, label]
Y_test_extra_pred_proba = model.predict_proba(X_test_extra)
# compute roc_auc
roc_auc_extra = roc_auc_score(Y_test_extra, Y_test_extra_pred_proba[:, 1])
print(f'ROC AUC on test extra set: {roc_auc_extra}')

ROC AUC on test extra set: 0.5429687499999999


# MLP

In [128]:
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200, random_state=42)
param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 10, 100, 1000],
    'hidden_layer_sizes': [(10,), (20,), (30,), (60,), (100,)],}
clf = GridSearchCV(model, param_grid, cv=cv, refit=False, scoring=metric, n_jobs=-1)
clf.fit(X, Y)

best_params = clf.best_params_
print(f'Best parameters: {best_params}')
print(f'Best score: {clf.best_score_}')



Best parameters: {'alpha': 10, 'hidden_layer_sizes': (30,)}
Best score: 0.617146355974


In [133]:
model = MLPClassifier(max_iter=200, random_state=42, alpha=clf.best_params_['alpha'], hidden_layer_sizes=clf.best_params_['hidden_layer_sizes'])
model.fit(X, Y)

In [134]:
## eval overfitting on train val set
Y_pred_proba = model.predict_proba(X)
roc_auc = roc_auc_score(Y, Y_pred_proba[:, 1])
print(f'ROC AUC on train val set: {roc_auc}')

ROC AUC on train val set: 1.0


In [135]:
## eval on test set
embd_test = embd.loc[embd['test']==1]
X_test = embd_test.loc[:, embd_test.columns.str.startswith('dim')]
# apply std and pca to X
X_test = std.transform(X_test)
X_test = pca.transform(X_test)

Y_test = embd_test.loc[:, label]
Y_test_pred_proba = model.predict_proba(X_test)
# compute roc_auc
roc_auc = roc_auc_score(Y_test, Y_test_pred_proba[:, 1])
print(f'ROC AUC on test set: {roc_auc}')

ROC AUC on test set: 0.6069052898321191


In [136]:
## eval on test extra set
embd_test_extra = embd.loc[embd['test_extra']==1]
X_test_extra = embd_test_extra.loc[:, embd_test_extra.columns.str.startswith('dim')]
# apply std and pca to X
X_test_extra = std.transform(X_test_extra)
X_test_extra = pca.transform(X_test_extra)

Y_test_extra = embd_test_extra.loc[:, label]
Y_test_extra_pred_proba = model.predict_proba(X_test_extra)
# compute roc_auc
roc_auc_extra = roc_auc_score(Y_test_extra, Y_test_extra_pred_proba[:, 1])
print(f'ROC AUC on test extra set: {roc_auc_extra}')

ROC AUC on test extra set: 0.5661057692307693
