In [1]:
import os
import csv
import numpy as np
import pandas as pd
import multiprocessing
from time import perf_counter
from tqdm.auto import tqdm
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
    cross_val_score,
    permutation_test_score,
)
from concurrent.futures import ProcessPoolExecutor, as_completed

In [11]:
# CONFIG
# Label table (one row per subject, read by the label-loading cell).
labels_path = (
    "/neurospin/dico/rmenasria/Runs/03_main/Input/ABCD/"
    "prematurity_labels_true_classes.csv"
)

# Directory holding one embedding CSV per region.
base_path = (
    "/neurospin/dico/data/deep_folding/current/models/"
    "Champollion_V1_after_ablation/embeddings/ABCD_embeddings/"
)

# Embeddings produced by the fine-tuning runs of 2025-07-23.
embeddings_finetuning_STs = (
    "/neurospin/dico/rmenasria/Runs/04_finetune/Output/2025-07-23/"
    "STs_right/13-33-30_40/"
    "finetune_STs_R_random_embeddings_best_model/full_embeddings.csv"
)
embeddings_finetuning_FCM = (
    "/neurospin/dico/rmenasria/Runs/04_finetune/Output/2025-07-23/"
    "FCM_post_SpC_right/15-53-19_135/"
    "finetune_FCMpost-SpC_R_random_embeddings_best_model/full_embeddings.csv"
)

# Embeddings produced by the fine-tuning runs of 2025-07-28.
embeddings_finetuning_FCM_00001 = (
    "/neurospin/dico/rmenasria/Runs/04_finetune/Output/2025-07-28/"
    "FCM_post_SpC_right/11-16-13_95/"
    "finetune_FCMpost-SpC_R_random_embeddings_best_model/full_embeddings.csv"
)
embeddings_finetuning_STs_00001 = (
    "/neurospin/dico/rmenasria/Runs/04_finetune/Output/2025-07-28/"
    "STs_right/data/"
    "finetune_STs_R_random_embeddings_best_model/full_embeddings.csv"
)

# CSV receiving one row per (region, threshold) result.
output_csv = (
    "/neurospin/dico/rmenasria/Runs/03_main/Program/"
    "2025_rmenasria_prematurity/notebooks/racim/"
    "ABCD_prematurity_results_2008.csv"
)

# `prem_class` values each treated in turn as the positive class.
thresholds = ["28-32", "32-37"]

In [4]:
# Read the label table and strip underscores from the subject IDs so they can
# be matched against the cleaned embedding IDs.
labels_df = pd.read_csv(labels_path, low_memory=False).assign(
    src_subject_id=lambda table: table['src_subject_id'].str.replace("_","")
)

In [7]:
def classify(region, threshold, n_jobs_inner, finetuned=False):
    """Score how well one region's embeddings separate a prematurity class from ">=37".

    The region's embedding CSV is merged with the module-level ``labels_df`` on
    the cleaned subject ID, rows are restricted to ``prem_class`` in
    ``{threshold, ">=37"}``, and a StandardScaler + linear SVC pipeline is
    scored by ROC AUC: a grid search over ``svc__C`` (5-fold) is itself
    evaluated on a shuffled, seeded 5-fold stratified split.

    Parameters
    ----------
    region : str
        Region name. When ``finetuned`` is False, the embedding file is the
        first (alphabetically) ``.csv`` in ``base_path`` whose name starts
        with this string.
    threshold : str
        ``prem_class`` value used as the positive class (label 1); rows with
        ``prem_class == ">=37"`` form the negative class (label 0).
    n_jobs_inner : int
        ``n_jobs`` passed to both ``GridSearchCV`` and ``cross_val_score``.
    finetuned : bool, default False
        If True, read ``embeddings_finetuning_STs_00001`` when
        ``region == "STs_right"`` and ``embeddings_finetuning_FCM_00001`` for
        any other region.

    Returns
    -------
    dict
        Keys ``region``, ``threshold``, ``cv_auc_mean``, ``cv_auc_std`` and
        ``duration_min`` (wall-clock minutes spent in this call).

    Raises
    ------
    FileNotFoundError
        If no embedding file matches ``region`` in ``base_path``.
    """
    print(f"Début traitement : region={region}, seuil={threshold}")
    t0 = perf_counter()

    # Resolve the embedding file for this region.
    embedding_file = None
    if finetuned:
        # NOTE(review): every region other than "STs_right" falls back to the
        # FCM fine-tuned embeddings — confirm callers only pass the two
        # fine-tuned regions.
        if region == "STs_right":
            embedding_file = embeddings_finetuning_STs_00001
        else:
            embedding_file = embeddings_finetuning_FCM_00001
    else:
        # os.listdir returns entries in arbitrary order; sort the matches so
        # the same file is picked on every run if several share the prefix.
        matches = sorted(
            name for name in os.listdir(base_path)
            if name.startswith(region) and name.endswith(".csv")
        )
        if matches:
            embedding_file = matches[0]

    if embedding_file is None:
        raise FileNotFoundError(f"No embedding file found for region: {region}")

    print(f"Using embedding file: {embedding_file}")

    # The fine-tuned paths are absolute, so os.path.join returns them unchanged.
    emb_path = os.path.join(base_path, embedding_file)
    emb_df = pd.read_csv(emb_path)
    # Normalise IDs the same way as labels_df: drop the "sub-" prefix and underscores.
    emb_df['ID_clean'] = (
        emb_df['ID'].astype(str)
        .str.replace(r"^sub-", "", regex=True)
        .str.replace("_", "", regex=False)
    )

    df = emb_df.merge(
        labels_df,
        left_on='ID_clean', right_on='src_subject_id', how='inner'
    )
    # Keep only the tested class and the ">=37" reference class.
    df = df[df['prem_class'].isin([threshold, ">=37"])].copy()
    df['y'] = (df['prem_class'] == threshold).astype(int)
    X = df.filter(regex=r'^dim').values
    y = df['y'].values

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC(kernel='linear', probability=True, class_weight='balanced'))
    ])
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid = GridSearchCV(
        pipe, {'svc__C': [0.01, 0.1, 1, 10]},
        cv=5, scoring='roc_auc', n_jobs=n_jobs_inner
    )
    # NOTE(review): n_jobs_inner is given to both the grid search and the outer
    # cross-validation — confirm this nesting does not oversubscribe the CPUs.
    cv_scores = cross_val_score(grid, X, y,
                                cv=outer_cv, scoring='roc_auc', n_jobs=n_jobs_inner)
    duration_min = (perf_counter() - t0) / 60

    print(f"{region}–{threshold} → AUC {cv_scores.mean():.3f}")

    # best_C, ci95_null and perm_pvalue are not computed here.
    return {
        'region': region,
        'threshold': threshold,
        'cv_auc_mean': cv_scores.mean(),
        'cv_auc_std': cv_scores.std(),
        'duration_min': duration_min,
    }


In [8]:
# Sanity check: evaluate a single (region, threshold) pair
classify(
    region="FCLp-subsc-FCLa-INSULA_right",
    threshold="28-32",
    n_jobs_inner=-1,
    finetuned=False,
)



Début traitement : region=FCLp-subsc-FCLa-INSULA_right, seuil=28-32
Using embedding file: FCLp-subsc-FCLa-INSULA_right_name17-47-16--166_embeddings.csv
FCLp-subsc-FCLa-INSULA_right–28-32 → AUC 0.757
Best_score: 0.7568941114938716


{'region': 'FCLp-subsc-FCLa-INSULA_right',
 'threshold': '28-32',
 'best_C': 0.01,
 'cv_auc_mean': 0.7568941114938716,
 'cv_auc_std': 0.022886283068468266}

In [9]:
def get_region_list(model_path):
    """Return the sorted names of the sub-directories of `model_path` kept as regions.

    An entry is skipped when it is not a directory, when its name starts with
    'all_models', 'hcp' or 'ukb', or when its name ends with '.csv', '.sh' or
    'embeddings'.
    """
    skipped_prefixes = ('all_models', 'hcp', 'ukb')
    skipped_suffixes = ('.csv', '.sh', 'embeddings')

    regions = []
    for entry in os.listdir(model_path):
        if not os.path.isdir(os.path.join(model_path, entry)):
            continue
        if entry.startswith(skipped_prefixes) or entry.endswith(skipped_suffixes):
            continue
        regions.append(entry)

    regions.sort()
    return regions

In [10]:
model_path = "/neurospin/dico/data/deep_folding/current/models/Champollion_V1_after_ablation"

#region_test = ["STs_right","FCMpost-SpC_right"]
#region_test = ["STs_right"]

# Sub-directories of model_path that are not regions. Filtering (rather than
# list.remove) keeps the cell from raising ValueError when one of them is absent.
non_region_dirs = {"analysis", "region_list", "params_OLS"}
region_list = [
    name for name in get_region_list(model_path)
    if name not in non_region_dirs
]
print(f"Regions found: {region_list}")
print(f"Number of regions: {len(region_list)}")

# missing=[r for r in region_list if r not in emb_cache]
# print(missing)

Regions found: ['CINGULATE_left', 'CINGULATE_right', 'FCLp-subsc-FCLa-INSULA_left', 'FCLp-subsc-FCLa-INSULA_right', 'FCMpost-SpC_left', 'FCMpost-SpC_right', 'FColl-SRh_left', 'FColl-SRh_right', 'FIP_left', 'FIP_right', 'FPO-SCu-ScCal_left', 'FPO-SCu-ScCal_right', 'LARGE_CINGULATE_left', 'LARGE_CINGULATE_right', 'Lobule_parietal_sup_left', 'Lobule_parietal_sup_right', 'OCCIPITAL_left', 'OCCIPITAL_right', 'SC-SPeC_left', 'SC-SPeC_right', 'SC-SPoC_left', 'SC-SPoC_right', 'SC-sylv_left', 'SC-sylv_right', 'SFinf-BROCA-SPeCinf_left', 'SFinf-BROCA-SPeCinf_right', 'SFint-FCMant_left', 'SFint-FCMant_right', 'SFint-SR_left', 'SFint-SR_right', 'SFinter-SFsup_left', 'SFinter-SFsup_right', 'SFmarginal-SFinfant_left', 'SFmarginal-SFinfant_right', 'SFmedian-SFpoltr-SFsup_left', 'SFmedian-SFpoltr-SFsup_right', 'SOr-SOlf_left', 'SOr-SOlf_right', 'SOr_left', 'SOr_right', 'SPeC_left', 'SPeC_right', 'SPoC_left', 'SPoC_right', 'STi-SOTlat_left', 'STi-SOTlat_right', 'STi-STs-STpol_left', 'STi-STs-STpol_righ

In [12]:
# Combinations to evaluate
combos = [(r, t) for r in region_list for t in thresholds]

# Columns of the checkpoint CSV; keys missing from a result are written as blanks.
result_fields = [
    'region','threshold','best_C','cv_auc_mean','cv_auc_std',
    'ci95_null','perm_pvalue','duration_min'
]

# --- Checkpoint handling ---
# An existing but empty file has no header and cannot be parsed by
# pd.read_csv, so it is treated like a missing checkpoint.
file_exists = os.path.exists(output_csv) and os.path.getsize(output_csv) > 0
if file_exists:
    done_df = pd.read_csv(output_csv)
    done_set = set(zip(done_df['region'], done_df['threshold']))
else:
    done_set = set()

# Keep only the combinations that still have to be computed
pending_combos = [(r, t) for r, t in combos if (r, t) not in done_set]

print(f"Total combos: {len(combos)}, déjà faits: {len(done_set)}, à calculer: {len(pending_combos)}")

# Parallelisation settings
total_cpus   = multiprocessing.cpu_count()
n_jobs_outer = 3
n_jobs_inner = 29

# Append to the checkpoint; write the header only when starting a new file
with open(output_csv, 'a', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=result_fields)
    if not file_exists:
        writer.writeheader()
        f.flush()

    with ProcessPoolExecutor(max_workers=n_jobs_outer) as exe:
        futures = {
            exe.submit(classify, r, t, n_jobs_inner, False): (r, t)
            for r, t in pending_combos
        }
        for future in tqdm(as_completed(futures), total=len(futures), desc="Blocs traités"):
            res = future.result()
            writer.writerow(res)
            # Flush after every row so finished results survive an interrupted run.
            f.flush()

print("Terminé : résultats checkpoint dans", output_csv)


Total combos: 116, déjà faits: 0, à calculer: 116
Début traitement : region=CINGULATE_right, seuil=28-32Début traitement : region=CINGULATE_left, seuil=32-37Début traitement : region=CINGULATE_left, seuil=28-32


Using embedding file: CINGULATE_left_name17-24-32--191_embeddings.csv

Blocs traités:   0%|          | 0/116 [00:00<?, ?it/s]

Using embedding file: CINGULATE_right_name17-24-32--237_embeddings.csvUsing embedding file: CINGULATE_left_name17-24-32--191_embeddings.csv


CINGULATE_left–28-32 → AUC 0.562
Best_score: 0.5720534660394
Début traitement : region=CINGULATE_right, seuil=32-37

Using embedding file: CINGULATE_right_name17-24-32--237_embeddings.csvCINGULATE_left–32-37 → AUC 0.543
Best_score: 0.5456837538735926
Début traitement : region=FCLp-subsc-FCLa-INSULA_left, seuil=28-32
Using embedding file: FCLp-subsc-FCLa-INSULA_left_name17-43-58--232_embeddings.csv
CINGULATE_right–28-32 → AUC 0.555
Best_score: 0.5693785134594741
Début traitement : region=FCLp-subsc-FCLa-INSULA_left, seuil=32-37
Using embedding file: FCLp-subsc-FCLa-INSULA_left_name17-43-58--232_embeddings.csv
FCLp-subsc-FCLa-INSULA_left–28-32 → AUC 0.716
Best_score: 0.7287711392623792
Début traitement : region=FCLp-subsc-FCLa-INSULA_right, seuil=28-32
Using embedding file: FCLp-subsc-FCLa-INSULA_right_name17-47-16--166_embeddings.csv
CINGULATE_rig



FPO-SCu-ScCal_right–32-37 → AUC 0.595
Best_score: 0.5958167952529579
Début traitement : region=LARGE_CINGULATE_right, seuil=28-32
Using embedding file: LARGE_CINGULATE_right_name07-22-35--179_embeddings.csv




LARGE_CINGULATE_left–28-32 → AUC 0.591
Best_score: 0.5956406425963862
Début traitement : region=LARGE_CINGULATE_right, seuil=32-37
Using embedding file: LARGE_CINGULATE_right_name07-22-35--179_embeddings.csv
LARGE_CINGULATE_right–28-32 → AUC 0.586
Best_score: 0.5998094251312812
Début traitement : region=Lobule_parietal_sup_left, seuil=28-32
Using embedding file: Lobule_parietal_sup_left_name07-23-04--36_embeddings.csv
LARGE_CINGULATE_left–32-37 → AUC 0.565
Best_score: 0.565807622713278
Début traitement : region=Lobule_parietal_sup_left, seuil=32-37
Using embedding file: Lobule_parietal_sup_left_name07-23-04--36_embeddings.csv
LARGE_CINGULATE_right–32-37 → AUC 0.570
Best_score: 0.570839610485353
Début traitement : region=Lobule_parietal_sup_right, seuil=28-32
Using embedding file: Lobule_parietal_sup_right_name07-24-01--193_embeddings.csv
Lobule_parietal_sup_left–28-32 → AUC 0.538
Best_score: 0.552190398489008
Début traitement : region=Lobule_parietal_sup_right, seuil=32-37
Using embedd



SC-SPoC_right–28-32 → AUC 0.529
Best_score: 0.5424270735339294
Début traitement : region=SC-sylv_left, seuil=32-37
Using embedding file: SC-sylv_left_name07-58-00--111_embeddings.csv
SC-SPoC_right–32-37 → AUC 0.538
Best_score: 0.5397311051233912
Début traitement : region=SC-sylv_right, seuil=28-32
Using embedding file: SC-sylv_right_name06-17-02--84_embeddings.csv
SC-sylv_left–28-32 → AUC 0.537
Best_score: 0.5556174567055219
Début traitement : region=SC-sylv_right, seuil=32-37
Using embedding file: SC-sylv_right_name06-17-02--84_embeddings.csv
SC-sylv_left–32-37 → AUC 0.558
Best_score: 0.5598619074020487
Début traitement : region=SFinf-BROCA-SPeCinf_left, seuil=28-32
Using embedding file: SFinf-BROCA-SPeCinf_left_name08-00-45--128_embeddings.csv
SFinf-BROCA-SPeCinf_left–28-32 → AUC 0.650
Best_score: 0.6518203406580078
Début traitement : region=SFinf-BROCA-SPeCinf_left, seuil=32-37
Using embedding file: SFinf-BROCA-SPeCinf_left_name08-00-45--128_embeddings.csv
SC-sylv_right–28-32 → AUC 



SFint-SR_left–32-37 → AUC 0.545
Best_score: 0.5455656537277738
Début traitement : region=SFinter-SFsup_left, seuil=28-32
Using embedding file: SFinter-SFsup_left_name08-06-01--220_embeddings.csv
SFint-SR_right–28-32 → AUC 0.611
Best_score: 0.6112419965685056
Début traitement : region=SFinter-SFsup_left, seuil=32-37
Using embedding file: SFinter-SFsup_left_name08-06-01--220_embeddings.csv
SFinter-SFsup_left–28-32 → AUC 0.537
Best_score: 0.5504237647099903
Début traitement : region=SFinter-SFsup_right, seuil=28-32
Using embedding file: SFinter-SFsup_right_name08-08-42--126_embeddings.csv
SFinter-SFsup_left–32-37 → AUC 0.552
Best_score: 0.5549725842583226
Début traitement : region=SFinter-SFsup_right, seuil=32-37
Using embedding file: SFinter-SFsup_right_name08-08-42--126_embeddings.csv




SFint-SR_right–32-37 → AUC 0.559
Best_score: 0.5605150149003038
Début traitement : region=SFmarginal-SFinfant_left, seuil=28-32
Using embedding file: SFmarginal-SFinfant_left_name08-15-17--25_embeddings.csv
SFinter-SFsup_right–32-37 → AUC 0.553
Best_score: 0.5531710617496371
Début traitement : region=SFmarginal-SFinfant_left, seuil=32-37
Using embedding file: SFmarginal-SFinfant_left_name08-15-17--25_embeddings.csv
SFmarginal-SFinfant_left–28-32 → AUC 0.570
Best_score: 0.5878826286546538
Début traitement : region=SFmarginal-SFinfant_right, seuil=28-32
Using embedding file: SFmarginal-SFinfant_right_name08-17-15--135_embeddings.csv
SFinter-SFsup_right–28-32 → AUC 0.620
Best_score: 0.6204928209065721
Début traitement : region=SFmarginal-SFinfant_right, seuil=32-37
Using embedding file: SFmarginal-SFinfant_right_name08-17-15--135_embeddings.csv
SFmarginal-SFinfant_right–28-32 → AUC 0.632
Best_score: 0.6323534666883688
Début traitement : region=SFmedian-SFpoltr-SFsup_left, seuil=28-32
Usin



SOr-SOlf_left–32-37 → AUC 0.634
Best_score: 0.634523011358356
Début traitement : region=SOr_left, seuil=28-32
Using embedding file: SOr_left_name14-12-56--162_embeddings.csv
SOr-SOlf_right–28-32 → AUC 0.657
Best_score: 0.6626339906208909
Début traitement : region=SOr_left, seuil=32-37
Using embedding file: SOr_left_name14-12-56--162_embeddings.csv
SOr-SOlf_right–32-37 → AUC 0.638
Best_score: 0.638720285775013
Début traitement : region=SOr_right, seuil=28-32
Using embedding file: SOr_right_name14-12-56--58_embeddings.csv
SOr_left–28-32 → AUC 0.668
Best_score: 0.6772536560277931
Début traitement : region=SOr_right, seuil=32-37
Using embedding file: SOr_right_name14-12-56--58_embeddings.csv
SOr_left–32-37 → AUC 0.635
Best_score: 0.6350766854502108
Début traitement : region=SPeC_left, seuil=28-32
Using embedding file: SPeC_left_name08-24-23--109_embeddings.csv
SOr_right–28-32 → AUC 0.634
Best_score: 0.6547175412878092
Début traitement : region=SPeC_left, seuil=32-37
Using embedding file: S

In [8]:
# Combinations to evaluate
combos = [(r, t) for r in region_list for t in thresholds]

# Initialise the checkpoint CSV with its header row.
# WARNING: mode 'w' truncates any results already stored at output_csv.
# newline='' is what the csv module expects for files it writes to.
with open(output_csv, 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=[
        'region','threshold','best_C','cv_auc_mean','cv_auc_std',
        'ci95_null','perm_pvalue','duration_min'
    ])
    writer.writeheader()

# Parallelism settings
total_cpus   = multiprocessing.cpu_count()
n_jobs_outer = 3
n_jobs_inner = 29
print(f"Parallelisation : outer={n_jobs_outer}, inner={n_jobs_inner}")

Parallelisation : outer=4, inner=24


In [None]:
# Same columns, in the same order, as the header row of the checkpoint CSV.
# Using the keys of each result instead would write fewer columns than the
# header has and shift values under the wrong column names.
result_fields = [
    'region','threshold','best_C','cv_auc_mean','cv_auc_std',
    'ci95_null','perm_pvalue','duration_min'
]

with ProcessPoolExecutor(max_workers=n_jobs_outer) as exe:
    futures = {
        exe.submit(classify, r, t, n_jobs_inner,False): (r, t)
        for r, t in combos
    }
    for future in tqdm(as_completed(futures), total=len(futures), desc="Blocs traités"):
        res = future.result()
        # Re-open per result so each row is on disk as soon as it is written;
        # keys absent from `res` are left blank.
        with open(output_csv, 'a', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=result_fields)
            writer.writerow(res)

print("Terminé : résultats checkpoint dans", output_csv)
        

Début traitement : region=CINGULATE_right, seuil=28-32Début traitement : region=CINGULATE_right, seuil=32-37Début traitement : region=CINGULATE_left, seuil=28-32

Blocs traités:   0%|          | 0/116 [00:00<?, ?it/s]

Début traitement : region=CINGULATE_left, seuil=32-37



Using embedding file: CINGULATE_left_name17-24-32--191_embeddings.csvUsing embedding file: CINGULATE_right_name17-24-32--237_embeddings.csvUsing embedding file: CINGULATE_right_name17-24-32--237_embeddings.csvUsing embedding file: CINGULATE_left_name17-24-32--191_embeddings.csv



CINGULATE_right–32-37 → AUC 0.544
Best_score: 0.5441659630524858
Début traitement : region=FCLp-subsc-FCLa-INSULA_left, seuil=28-32
Using embedding file: FCLp-subsc-FCLa-INSULA_left_name17-43-58--232_embeddings.csv
CINGULATE_left–32-37 → AUC 0.546
Best_score: 0.5456837538735926
Début traitement : region=FCLp-subsc-FCLa-INSULA_left, seuil=32-37
Using embedding file: FCLp-subsc-FCLa-INSULA_left_name17-43-58--232_embeddings.csv
CINGULATE_left–28-32 → AUC 0.572
Best_score: 0.5720534660394
Début traitement : region=FCLp-subsc-FCLa-INSULA_right, seuil=28-32
Using embedding file: FCLp-subsc-FCLa-INSULA_right_name17-47-16--166_embeddings.csv
FCLp-subsc-FCLa-INS

In [9]:
# Load the source results CSV
df = pd.read_csv('/neurospin/dico/rmenasria/Runs/03_main/Output/csv/ABCD_prematurity_results_0708.csv')

# Set cv_auc_mean to 0 for rows whose permutation p-value is >= 0.005.
# Rows with a missing perm_pvalue compare False and keep their AUC.
# NOTE(review): the cut-off is 0.005, not the conventional 0.05 — confirm this is intended.
df.loc[df['perm_pvalue'] >= 0.005, 'cv_auc_mean'] = 0.0

# Pivot to one AUC column per prematurity class
df_pivot = df.pivot(index='region', columns='threshold', values='cv_auc_mean')

# Rename the columns
df_pivot = df_pivot.rename(columns={
    '28-32': 'AUC_28_32',
    '32-37': 'AUC_37'
}).reset_index()

# Export the result. A dedicated variable is used so the `output_csv` setting
# read by the classification cells is not overwritten by this cell.
thresholded_pivot_csv = '/neurospin/dico/rmenasria/Runs/03_main/Program/2025_rmenasria_prematurity/notebooks/racim/prematurity_AUC_by_region_ABCD_0708_thresholded.csv'
df_pivot.to_csv(thresholded_pivot_csv, index=False)
print(f"Fichier généré : {thresholded_pivot_csv}")

Fichier généré : /neurospin/dico/rmenasria/Runs/03_main/Program/2025_rmenasria_prematurity/notebooks/racim/prematurity_AUC_by_region_ABCD_0708_thresholded.csv


In [10]:
# Load the per-(region, threshold) results
df = pd.read_csv("/neurospin/dico/rmenasria/Runs/03_main/Program/2025_rmenasria_prematurity/notebooks/racim/ABCD_prematurity_results.csv")

# Pivot to one AUC column per prematurity class
df_pivot = df.pivot(index='region', columns='threshold', values='cv_auc_mean')

# Rename the columns for clarity; keys that are not present in the pivoted
# frame are ignored by rename.
# NOTE(review): these keys assume the thresholds stored in this results file
# are '<27', '27-32' and '32-37' — confirm against the file.
df_pivot = df_pivot.rename(columns={
    '<27': 'AUC_27',
    '27-32': 'AUC_27_32',
    '32-37': 'AUC_32_37'
}).reset_index()

# Save the new CSV. A dedicated variable is used so the `output_csv` setting
# read by the classification cells is not overwritten by this cell.
pivot_csv = '/neurospin/dico/rmenasria/Runs/03_main/Program/2025_rmenasria_prematurity/notebooks/racim/prematurity_AUC_by_region_ABCD_2406.csv'
df_pivot.to_csv(pivot_csv, index=False)
print(f"Fichier généré : {pivot_csv}")



Fichier généré : /neurospin/dico/rmenasria/Runs/03_main/Program/2025_rmenasria_prematurity/notebooks/racim/prematurity_AUC_by_region_ABCD_2406.csv
