## This notebook allows to assess the effect of a given site for a classifcation task.

### 1 : Classify the site 

In [75]:
import os
import csv
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
    cross_val_score
)
from sklearn.metrics import roc_auc_score

# This notebook presents three methods to evaluate the influence of a parameter on a classfication task : site classification, cross-val on the site value and residualisation. 

## General settings 

In [76]:
# CONFIG
labels_path = "/neurospin/dico/rmenasria/Runs/03_main/Input/ABCD/all_labels_clean_abcd.csv"
base_path   = "/neurospin/dico/data/deep_folding/current/models/Champollion_V1_after_ablation/embeddings/ABCD_embeddings/"
output_csv  = ""
site = ["site_id_l","demo_sex_v2","interview_age"]
region = "FCMpost-SpC_right"

In [77]:
labels_df = pd.read_csv(labels_path, index_col=0)
print(labels_df["interview_age"].isna().sum())

1


  labels_df = pd.read_csv(labels_path, index_col=0)


In [78]:
thresholds  = ["<27","27-32","32-37"]
threshold = thresholds [1]

## Preprocess data 

In [79]:
labels_df = pd.get_dummies(labels_df, columns=['site_id_l'], prefix='', prefix_sep='')

print(labels_df.columns)

    

Index(['demoi_p_select_language___1', 'demo_prim', 'demo_brthdat_v2',
       'demo_ed_v2', 'demo_adopt_agex_v2', 'demo_adopt_agex_v2_bl_dk',
       'demo_sex_v2', 'demo_gender_id_v2', 'demo_race_a_p___10',
       'demo_race_a_p___11',
       ...
       'site13', 'site14', 'site15', 'site16', 'site17', 'site18', 'site19',
       'site20', 'site21', 'site22'],
      dtype='object', length=576)


In [80]:
def define_sex_class_mapping(sex_series):


    unique_sex = sorted(sex_series.unique())

    print("unique sites :", unique_sex)

    mapping = {1.0: 0, 2.0:1, 3.0 : 1}
    
    def encoder_fn(site):
        return mapping[site]
    
    return mapping, encoder_fn


mapping, encode_site = define_sex_class_mapping(labels_df['demo_sex_v2'])
labels_df['sex'] = labels_df['demo_sex_v2'].apply(encode_site)

print(labels_df["sex"].value_counts())   

unique sites : [1.0, 2.0, 3.0]
sex
0    5248
1    4737
Name: count, dtype: int64


In [81]:
labels_df ["interview_age"].fillna(115,inplace =True)

def define_scan_age_mapping(scan_ages_series): 


    unique_scan_ages = sorted(scan_ages_series.unique())



    scaler = StandardScaler()
    scan_ages =np.array(scan_ages_series).reshape(-1 ,1)
    scaler.fit(scan_ages)
    print("unique_scan_ages :",unique_scan_ages)

    def encoder_fn(scan_age):
        return scaler.transform(np.array(scan_age).reshape(-1,1))
    
    return scaler, encoder_fn

scaler, encode_scan_age = define_scan_age_mapping(labels_df["interview_age"])

labels_df['scan_age']= labels_df['interview_age'].apply(encode_scan_age)





The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  labels_df ["interview_age"].fillna(115,inplace =True)


unique_scan_ages : [107.0, 108.0, 109.0, 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0, 128.0, 129.0, 130.0, 131.0, 132.0, 133.0]


In [82]:
def setup_task(region, site): 

    embedding_file = None
    for file in os.listdir(base_path):
        if file.startswith(region) and file.endswith(".csv"):
            embedding_file = file
            break
    if embedding_file is None:
        raise FileNotFoundError(f"No embedding file found for region: {region}")
    
    emb_path = os.path.join(base_path, embedding_file)
    emb_df = pd.read_csv(emb_path)
    emb_df['ID_clean'] = (
        emb_df['ID'].astype(str)
        .str.replace(r"^sub-", "", regex=True)
        .str.replace("_", "", regex=False)
    )

    df = emb_df.merge(
        labels_df,
        left_on='ID_clean', right_on='src_subject_id_clean', how='inner'
    )
    df["scan_age"] = df["scan_age"].apply(lambda x: float(x[0][0]) if isinstance(x, list) else x)

    if site is None :
        df["prematurity_class"]=df["prem_class"].str.replace("_sem","")
        df= df[df['prematurity_class'].isin([threshold, ">=37"])].copy()
        df['y'] = (df['prematurity_class'] == threshold).astype(int)

    else :     
        df['y'] = df[site].astype(int)
        
    X = df.filter(regex=r'^dim').values
    y = df['y'].values

    return X,y,df




In [83]:
def classify(region, site, n_jobs_inner=-1):
    print(f"Début traitement : region={region}")
    
    X, y, _ = setup_task(region,site)
    print(y.shape, X.shape)

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC(kernel='linear', probability=True, class_weight='balanced'))
    ])
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid = GridSearchCV(
        pipe, {'svc__C': [0.01, 0.1, 1, 10]},
        cv=cv, scoring='roc_auc', n_jobs=n_jobs_inner
    )
    grid.fit(X, y)
    best_C = grid.best_params_['svc__C']
    cv_scores = cross_val_score(grid.best_estimator_, X, y,
                                cv=cv, scoring='roc_auc', n_jobs=n_jobs_inner)


    print(f"{region} : AUC {cv_scores.mean():.3f}")

    return {
        'region': region,
        'best_C': best_C,
        'cv_auc_mean': cv_scores.mean(),
        'cv_auc_std': cv_scores.std(),
    }

In [27]:
classify(region = "STi-STs-STpol_right", site="sex",  n_jobs_inner=-1)

Début traitement : region=STi-STs-STpol_right
(9985,) (9985, 32)
STi-STs-STpol_right : AUC 0.676


{'region': 'STi-STs-STpol_right',
 'best_C': 1,
 'cv_auc_mean': 0.6763462649354572,
 'cv_auc_std': 0.014449629050447953}

## Second approach : Folds dedicated

In [84]:
def setup_task_per_site(region, site): 

    embedding_file = None
    for file in os.listdir(base_path):
        if file.startswith(region) and file.endswith(".csv"):
            embedding_file = file
            break
    if embedding_file is None:
        raise FileNotFoundError(f"No embedding file found for region: {region}")
    
    emb_path = os.path.join(base_path, embedding_file)
    emb_df = pd.read_csv(emb_path)
    emb_df['ID_clean'] = (
        emb_df['ID'].astype(str)
        .str.replace(r"^sub-", "", regex=True)
        .str.replace("_", "", regex=False)
    )

    df = emb_df.merge(
        labels_df,
        left_on='ID_clean', right_on='src_subject_id_clean', how='inner'
    )

    df["scan_age"] = df["scan_age"].apply(lambda x: float(x[0][0]) if isinstance(x, list) else x)
    df["prematurity_class"]=df["prem_class"].str.replace("_sem","")
    df = df[df['prematurity_class'].isin([threshold, ">=37"])].copy()
    df['y'] = (df['prematurity_class'] == threshold).astype(int)
    print(df["prem_class"].head())
    print(df["prematurity_class"].head())


    split_data = {}

    if site == "site_id_l":
        site_cols = [f"site{str(k).zfill(2)}" for k in range(1, 22)]
        for col in site_cols :
            subset = df[df[col] == 1]
            X = subset.filter(regex=r'^dim').values
            y = subset['y'].values
            split_data[col] = (X, y)

    else : 
        # Split by unique site values
        site_values = df[site].unique()
        print(f"Unique values for {site}:", site_values)

        split_data = {}
        for val in site_values:
            subset = df[df[site] == val]
            if len(subset) < 10:
                print(f"Skipping {site} = {val} (not enough samples)")
                continue
            X = subset.filter(regex=r'^dim').values
            y = subset['y'].values
            split_data[val] = (X, y)

    return split_data
           



In [85]:
def classify_per_site(region, site, n_jobs_inner=-1):
    split_data = setup_task_per_site(region, site)
    results = {}

    for val, (X, y) in split_data.items():
        print(f"Training for {site} = {val} (n={len(y)})")
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('svc', SVC(kernel='linear', probability=True, class_weight='balanced'))
        ])
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        grid = GridSearchCV(
            pipe, {'svc__C': [0.01, 0.1, 1, 10]},
            cv=cv, scoring='roc_auc', n_jobs=n_jobs_inner
        )
        grid.fit(X, y)
        best_C = grid.best_params_['svc__C']
        cv_scores = cross_val_score(grid.best_estimator_, X, y,
                                    cv=cv, scoring='roc_auc', n_jobs=n_jobs_inner)
        results[val] = {
            'auc_mean': cv_scores.mean(),
            'auc_std': cv_scores.std(),
            'best_C': best_C
        }
        print(f"{val} | AUC = {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")
    
    return results


In [61]:
classify_per_site(region = "STs_right", site="site_id_l",  n_jobs_inner=-1)

0    >=37_sem
1    >=37_sem
2    >=37_sem
3    >=37_sem
4    >=37_sem
Name: prem_class, dtype: object
0    >=37
1    >=37
2    >=37
3    >=37
4    >=37
Name: prematurity_class, dtype: object
0     [[1.6086177881800783]]
1     [[0.9394940112798978]]
2    [[-0.9340525640406078]]
3     [[0.6718445005198256]]
4      [[-1.20170207480068]]
Name: scan_age, dtype: object
Training for site_id_l = site01 (n=313)
site01 | AUC = 0.733 ± 0.086
Training for site_id_l = site02 (n=337)
site02 | AUC = 0.592 ± 0.090
Training for site_id_l = site03 (n=453)
site03 | AUC = 0.731 ± 0.029
Training for site_id_l = site04 (n=567)
site04 | AUC = 0.770 ± 0.104
Training for site_id_l = site05 (n=297)
site05 | AUC = 0.854 ± 0.116
Training for site_id_l = site06 (n=451)
site06 | AUC = 0.798 ± 0.109
Training for site_id_l = site07 (n=265)
site07 | AUC = 0.628 ± 0.207
Training for site_id_l = site08 (n=227)
site08 | AUC = 0.245 ± 0.205
Training for site_id_l = site09 (n=357)
site09 | AUC = 0.529 ± 0.156
Training for 

{'site01': {'auc_mean': 0.7326593806921677,
  'auc_std': 0.08613678687566997,
  'best_C': 10},
 'site02': {'auc_mean': 0.592339299973383,
  'auc_std': 0.09038897604000617,
  'best_C': 10},
 'site03': {'auc_mean': 0.7309101844426623,
  'auc_std': 0.028724258641989655,
  'best_C': 0.01},
 'site04': {'auc_mean': 0.7695277095277095,
  'auc_std': 0.1043702150541,
  'best_C': 0.01},
 'site05': {'auc_mean': 0.8542649727767696,
  'auc_std': 0.11623019460316525,
  'best_C': 0.01},
 'site06': {'auc_mean': 0.7983585858585859,
  'auc_std': 0.10940346796357349,
  'best_C': 0.01},
 'site07': {'auc_mean': 0.6277526395173454,
  'auc_std': 0.20691848074267516,
  'best_C': 1},
 'site08': {'auc_mean': 0.24500000000000002,
  'auc_std': 0.20484019169155113,
  'best_C': 0.1},
 'site09': {'auc_mean': 0.5293549303779483,
  'auc_std': 0.15589415283133948,
  'best_C': 0.01},
 'site10': {'auc_mean': 0.37489419965148113,
  'auc_std': 0.043528282245644236,
  'best_C': 10},
 'site11': {'auc_mean': 0.490103519668737

## Third approach residualisation regarding the target

In [86]:
from sklearn.linear_model import LinearRegression

def residualize_features(X_train, C_train, X_test, C_test):
    """
    Régressions linéaires univariées (par colonne) des colonnes de X_train 
    sur C_train, et calcul des résidus pour X_train et X_test.
    
    """
    lr = LinearRegression()
    n_feat = X_train.shape[1]
    R_train = np.zeros_like(X_train)
    R_test  = np.zeros_like(X_test)
    
    # Adjust a regression model for each feature in X_train
    for j in range(n_feat):
        lr.fit(C_train, X_train[:, j])
        r2 = lr.score(C_train, X_train[:, j])
        print(f"Dimension {j} : R2 = {r2:.3f}")
        R_train[:, j] = X_train[:, j] - lr.predict(C_train)
        R_test[:,  j] = X_test[:,  j] - lr.predict(C_test)
    
    return R_train, R_test


In [87]:
from sklearn.linear_model import LinearRegression

def classify_with_residualization(region, target_label, confound_cols, n_jobs_inner=-1):
    # Prepare X, y and confounders
    X_full, y_full, df = setup_task(region, target_label) 

    print(df.columns)

    C_full = df[confound_cols].values              # ex: site one-hot, sex, scan_age
    
    # Manual cross-val to residualize in  each fold 
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []
    
    for train_idx, test_idx in cv.split(X_full, y_full):
        print("Processing folds : train_idx:", train_idx, "test_idx:",test_idx)
        X_tr, X_te = X_full[train_idx], X_full[test_idx]
        y_tr, y_te = y_full[train_idx], y_full[test_idx]
        C_tr, C_te = C_full[train_idx],  C_full[test_idx]
        
        # residualize features
        X_tr_res, X_te_res = residualize_features(X_tr, C_tr, X_te, C_te)
        
        # Standardisation
        scaler = StandardScaler().fit(X_tr_res)
        X_tr_res = scaler.transform(X_tr_res)
        X_te_res = scaler.transform(X_te_res)

        pipe = SVC(kernel='linear', class_weight='balanced', probability=True)
        grid = GridSearchCV(
            estimator=pipe,
            param_grid={'C': [0.01, 0.1, 1, 10]},
            cv=5,
            scoring='roc_auc',
            n_jobs=n_jobs_inner
        )
        grid.fit(X_tr_res, y_tr)
        best_model = grid.best_estimator_

        # Eval on test set
        proba = best_model.predict_proba(X_te_res)[:, 1]
        aucs.append(roc_auc_score(y_te, proba))

    # Final results
    mean_auc = np.mean(aucs)
    std_auc  = np.std(aucs)
    print(f"{region} | target={target_label} | AUC résidulé = {mean_auc:.3f} ± {std_auc:.3f}")
    return aucs


In [88]:
C_cols_site = [f"site{str(k).zfill(2)}" for k in range(1, 23)]
C_cols = ["scan_age"]
print(C_cols)

['scan_age']


In [66]:
aucs = classify_with_residualization(
    region="STs_right",
    target_label=None,
    confound_cols=C_cols
)


Index(['ID', 'dim1', 'dim2', 'dim3', 'dim4', 'dim5', 'dim6', 'dim7', 'dim8',
       'dim9',
       ...
       'site17', 'site18', 'site19', 'site20', 'site21', 'site22', 'sex',
       'scan_age', 'prematurity_class', 'y'],
      dtype='object', length=614)
Processing folds : train_idx: [   3    4    5 ... 8733 8735 8736] test_idx: [   0    1    2 ... 8717 8726 8734]
Dimension 0 : R2 = 0.000
Dimension 1 : R2 = 0.000
Dimension 2 : R2 = 0.000
Dimension 3 : R2 = 0.000
Dimension 4 : R2 = 0.000
Dimension 5 : R2 = 0.001
Dimension 6 : R2 = 0.000
Dimension 7 : R2 = 0.000
Dimension 8 : R2 = 0.000
Dimension 9 : R2 = 0.001
Dimension 10 : R2 = 0.000
Dimension 11 : R2 = 0.000
Dimension 12 : R2 = 0.001
Dimension 13 : R2 = 0.000
Dimension 14 : R2 = 0.000
Dimension 15 : R2 = 0.000
Dimension 16 : R2 = 0.000
Dimension 17 : R2 = 0.000
Dimension 18 : R2 = 0.000
Dimension 19 : R2 = 0.000
Dimension 20 : R2 = 0.000
Dimension 21 : R2 = 0.000
Dimension 22 : R2 = 0.000
Dimension 23 : R2 = 0.000
Dimension 24 : R2

In [90]:
def compute_all_scores(region_list, output_csv="all_scores.csv"):
    records = []

    for region in region_list:
        print(f"Processing region: {region}")

        # # AUC per site
        res_site = classify_per_site(region=region, site="site_id_l", n_jobs_inner=-1)
        auc_mean_site = np.mean([v['auc_mean'] for v in res_site.values()])
        auc_std_site = np.std([v['auc_mean'] for v in res_site.values()])
        

        # AUC per sex
        res_sex = classify_per_site(region=region, site="sex", n_jobs_inner=-1)
        print(res_sex)
        auc_prema_sex0 = res_sex[0]["auc_mean"]
        auc_prema_sex1 = res_sex[1]["auc_mean"]

        # Classification sex
        res_clf_sex = classify(region=region, site="sex", n_jobs_inner=-1)
        print(res_clf_sex)
        auc_clf_sex = res_clf_sex['cv_auc_mean']
        std_clf_sex = res_clf_sex['cv_auc_std']

        # residualisation on sex
        aucs_res_sex = classify_with_residualization(
            region=region,
            target_label=None,
            confound_cols=["sex"]
        )
        auc_res_sex = np.mean(aucs_res_sex)
        std_res_sex = np.std(aucs_res_sex)

        # Residualisation on scan_age
        aucs_res_age = classify_with_residualization(
            region=region,
            target_label=None,
            confound_cols=["scan_age"]
        )
        auc_res_age = np.mean(aucs_res_age)
        std_res_age = np.std(aucs_res_age)

        # Residualisation on site
        site_cols = [f"site{str(k).zfill(2)}" for k in range(1, 23)]
        aucs_res_site = classify_with_residualization(
            region=region,
            target_label=None,
            confound_cols=site_cols
        )
        auc_res_site = np.mean(aucs_res_site)
        std_res_site = np.std(aucs_res_site)

        records.append({
            "region": region,
            "auc_site_sex1": auc_mean_site,
            "auc_site_other": auc_std_site,
            "auc_prema_sex0": auc_prema_sex0,
            "auc_prema_sex1": auc_prema_sex1,
            "auc_clf_sex": auc_clf_sex,
            "std_clf_sex": std_clf_sex,
            "auc_res_sex": auc_res_sex,
            "std_res_sex": std_res_sex,
            "auc_res_age": auc_res_age,
            "std_res_age": std_res_age,
            "auc_res_site": auc_res_site,
            "std_res_site": std_res_site,
        })

    df_scores = pd.DataFrame.from_records(records)
    df_scores.to_csv(output_csv, index=False)
    print(f"Saved all scores to {output_csv}")
    return df_scores

regions = ["STs_right", "STi-STs-STpol_right", "FCMpost-SpC_right"]
df_results = compute_all_scores(regions, output_csv="results_AUCs.csv")
print(df_results)


Processing region: STs_right
0    >=37_sem
1    >=37_sem
2    >=37_sem
3    >=37_sem
4    >=37_sem
Name: prem_class, dtype: object
0    >=37
1    >=37
2    >=37
3    >=37
4    >=37
Name: prematurity_class, dtype: object
Training for site_id_l = site01 (n=313)
site01 | AUC = 0.733 ± 0.086
Training for site_id_l = site02 (n=337)
site02 | AUC = 0.592 ± 0.090
Training for site_id_l = site03 (n=453)
site03 | AUC = 0.731 ± 0.029
Training for site_id_l = site04 (n=567)
site04 | AUC = 0.770 ± 0.104
Training for site_id_l = site05 (n=297)
site05 | AUC = 0.854 ± 0.116
Training for site_id_l = site06 (n=451)
site06 | AUC = 0.798 ± 0.109
Training for site_id_l = site07 (n=265)
site07 | AUC = 0.628 ± 0.207
Training for site_id_l = site08 (n=227)
site08 | AUC = 0.245 ± 0.205
Training for site_id_l = site09 (n=357)
site09 | AUC = 0.529 ± 0.156
Training for site_id_l = site10 (n=533)
site10 | AUC = 0.375 ± 0.044
Training for site_id_l = site11 (n=361)
site11 | AUC = 0.490 ± 0.178
Training for site_id_