In [1]:
import os
import csv
import numpy as np
import pandas as pd
import multiprocessing
from time import perf_counter
from tqdm.auto import tqdm
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    StratifiedKFold,
    GridSearchCV,
    cross_val_score,
    permutation_test_score,
)
from concurrent.futures import ProcessPoolExecutor, as_completed

In [2]:
# Configuration: input files, output file and the prematurity classes to test.
# Long paths are split with implicit string concatenation; the resulting
# values are unchanged.

# CSV holding one row per subject with its prematurity class.
labels_path = (
    "/neurospin/dico/rmenasria/Runs/03_main/Input/ABCD/"
    "ABCD_preterm_labels_light.csv"
)

# Directory scanned for the per-region embedding CSV files.
base_path = (
    "/neurospin/dico/data/deep_folding/current/models/"
    "Champollion_V1_after_ablation/embeddings/ABCD_embeddings/"
)

# CSV file receiving one result row per (region, threshold) combination.
output_csv = (
    "/neurospin/dico/rmenasria/Runs/03_main/Program/"
    "2025_rmenasria_prematurity/notebooks/racim/"
    "ABCD_prematurity_results.csv"
)

# Values of the 'prem_class' label column to evaluate, one at a time.
thresholds = [
    "<27",
    "27-32",
    "32-37",
]

In [3]:
# Load the subject labels and strip underscores from the subject identifiers;
# `classify` merges on this column against embedding IDs cleaned the same way.
labels_df = (
    pd.read_csv(labels_path, low_memory=False)
    .assign(
        src_subject_id=lambda frame: frame['src_subject_id'].str.replace("_","")
    )
)

In [4]:
def classify(region, threshold, n_jobs_inner, n_permutations=1):
    """Evaluate a linear SVM separating one prematurity class from the '>=37' class.

    Relies on the module-level ``base_path`` (directory of embedding CSVs) and
    ``labels_df`` (labels with 'src_subject_id' and 'prem_class' columns).

    The embedding CSV of ``region`` is joined to the labels on the cleaned
    subject ID, restricted to subjects whose 'prem_class' is ``threshold`` or
    '>=37', and the columns whose name starts with 'dim' are used as features.
    ``C`` is selected by grid search (ROC-AUC, 5-fold stratified CV), then the
    selected pipeline is re-scored with the same folds and submitted to a
    label-permutation test.

    Parameters
    ----------
    region : str
        Prefix of the embedding CSV file name inside ``base_path``.
    threshold : str
        Value of 'prem_class' treated as the positive class.
    n_jobs_inner : int
        ``n_jobs`` forwarded to the scikit-learn grid search, cross-validation
        and permutation test.
    n_permutations : int, default=1
        Number of label permutations. The default keeps the historical value,
        which only suits a smoke test: the p-value is computed as
        (C + 1) / (n_permutations + 1), so with a single permutation it can
        only be 0.5 or 1.0 and 'ci95_null' is just that one score.

    Returns
    -------
    dict
        Keys: 'region', 'threshold', 'best_C', 'cv_auc_mean', 'cv_auc_std',
        'ci95_null' (95th percentile of the permutation scores),
        'perm_pvalue', 'duration_min'.

    Raises
    ------
    FileNotFoundError
        If no file in ``base_path`` starts with ``region`` and ends with '.csv'.
    """
    print(f"Début traitement : region={region}, seuil={threshold}")
    t0 = perf_counter()

    # Locate the embedding file. os.listdir returns entries in arbitrary order,
    # so the listing is sorted to make the choice reproducible when several
    # files share the same prefix.
    embedding_file = next(
        (
            name for name in sorted(os.listdir(base_path))
            if name.startswith(region) and name.endswith(".csv")
        ),
        None,
    )
    if embedding_file is None:
        raise FileNotFoundError(f"No embedding file found for region: {region}")

    emb_df = pd.read_csv(os.path.join(base_path, embedding_file))
    # Normalise IDs the same way as labels_df['src_subject_id']:
    # drop the leading "sub-" and every underscore.
    emb_df['ID_clean'] = (
        emb_df['ID'].astype(str)
        .str.replace(r"^sub-", "", regex=True)
        .str.replace("_", "", regex=False)
    )

    df = emb_df.merge(
        labels_df,
        left_on='ID_clean', right_on='src_subject_id', how='inner'
    )
    # Binary task: subjects of the requested class (1) versus '>=37' (0).
    df = df[df['prem_class'].isin([threshold, ">=37"])].copy()
    df['y'] = (df['prem_class'] == threshold).astype(int)
    X = df.filter(regex=r'^dim').values
    y = df['y'].values

    # probability=True is deliberately not set: ROC-AUC scoring ranks samples
    # with decision_function, and enabling probabilities only adds an internal
    # 5-fold calibration to every fit.
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC(kernel='linear', class_weight='balanced'))
    ])
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    grid = GridSearchCV(
        pipe, {'svc__C': [0.01, 0.1, 1, 10]},
        cv=cv, scoring='roc_auc', n_jobs=n_jobs_inner
    )
    grid.fit(X, y)
    best_C = grid.best_params_['svc__C']

    # NOTE(review): C is selected on the same folds that are used to report the
    # AUC below (non-nested CV), so the reported AUC may be optimistic.
    cv_scores = cross_val_score(grid.best_estimator_, X, y,
                                cv=cv, scoring='roc_auc', n_jobs=n_jobs_inner)
    # The score on the unpermuted labels is not needed here; only the null
    # distribution and the p-value are kept.
    _, perm_scores, pval = permutation_test_score(
        grid.best_estimator_, X, y,
        scoring='roc_auc', cv=cv,
        n_permutations=n_permutations, n_jobs=n_jobs_inner, random_state=0
    )
    ci95 = np.percentile(perm_scores, 95)
    t1 = perf_counter()
    print(f"{region}–{threshold} → AUC {cv_scores.mean():.3f}")

    return {
        'region': region,
        'threshold': threshold,
        'best_C': best_C,
        'cv_auc_mean': cv_scores.mean(),
        'cv_auc_std': cv_scores.std(),
        'ci95_null': ci95,
        'perm_pvalue': pval,
        'duration_min': (t1 - t0) / 60
    }


In [6]:
def get_region_list(model_path):
    """Return the sorted names of the candidate region directories in ``model_path``.

    An entry is kept when it is a directory, its name does not start with
    'all_models', 'hcp' or 'ukb', and its name does not end with '.csv',
    '.sh' or 'embeddings'.
    """
    skipped_prefixes = ('all_models', 'hcp', 'ukb')
    skipped_suffixes = ('.csv', '.sh', 'embeddings')

    regions = []
    for entry in os.listdir(model_path):
        # Plain files are never regions.
        if not os.path.isdir(os.path.join(model_path, entry)):
            continue
        # str.startswith / str.endswith accept a tuple of alternatives.
        if entry.startswith(skipped_prefixes) or entry.endswith(skipped_suffixes):
            continue
        regions.append(entry)

    regions.sort()
    return regions

In [7]:
# Directory whose sub-directories are listed as candidate regions.
model_path="/neurospin/dico/data/deep_folding/current/models/Champollion_V1_after_ablation"

# Sub-directories that pass the get_region_list filters but must not be
# processed as regions.
non_region_dirs = {"analysis", "region_list", "params_OLS"}

# Filter instead of calling list.remove(): remove() raises ValueError as soon
# as one of these directories is absent from model_path.
region_list = [
    region for region in get_region_list(model_path)
    if region not in non_region_dirs
]
print(f"Regions found: {region_list}")
print(f"Number of regions: {len(region_list)}")

# missing=[r for r in region_list if r not in emb_cache]
# print(missing)

Regions found: ['CINGULATE_left', 'CINGULATE_right', 'FCLp-subsc-FCLa-INSULA_left', 'FCLp-subsc-FCLa-INSULA_right', 'FCMpost-SpC_left', 'FCMpost-SpC_right', 'FColl-SRh_left', 'FColl-SRh_right', 'FIP_left', 'FIP_right', 'FPO-SCu-ScCal_left', 'FPO-SCu-ScCal_right', 'LARGE_CINGULATE_left', 'LARGE_CINGULATE_right', 'Lobule_parietal_sup_left', 'Lobule_parietal_sup_right', 'OCCIPITAL_left', 'OCCIPITAL_right', 'SC-SPeC_left', 'SC-SPeC_right', 'SC-SPoC_left', 'SC-SPoC_right', 'SC-sylv_left', 'SC-sylv_right', 'SFinf-BROCA-SPeCinf_left', 'SFinf-BROCA-SPeCinf_right', 'SFint-FCMant_left', 'SFint-FCMant_right', 'SFint-SR_left', 'SFint-SR_right', 'SFinter-SFsup_left', 'SFinter-SFsup_right', 'SFmarginal-SFinfant_left', 'SFmarginal-SFinfant_right', 'SFmedian-SFpoltr-SFsup_left', 'SFmedian-SFpoltr-SFsup_right', 'SOr-SOlf_left', 'SOr-SOlf_right', 'SOr_left', 'SOr_right', 'SPeC_left', 'SPeC_right', 'SPoC_left', 'SPoC_right', 'STi-SOTlat_left', 'STi-SOTlat_right', 'STi-STs-STpol_left', 'STi-STs-STpol_righ

In [8]:
# Combinations to evaluate: every region crossed with every prematurity class.
combos = [(r, t) for r in region_list for t in thresholds]

# Initialise the checkpoint CSV with its header row. Mode 'w' overwrites any
# existing results file. newline='' is what the csv module requires for files
# handed to a writer, so that line endings are not translated a second time.
with open(output_csv, 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=[
        'region','threshold','best_C','cv_auc_mean','cv_auc_std',
        'ci95_null','perm_pvalue','duration_min'
    ])
    writer.writeheader()

# Parallelism budget: one worker process per 4 CPUs (at least one), and each
# worker is given total_cpus // n_jobs_outer jobs for its scikit-learn calls.
total_cpus   = multiprocessing.cpu_count()
n_jobs_outer = max(1, total_cpus // 4)
n_jobs_inner = max(1, total_cpus // n_jobs_outer)
print(f"Parallelisation : outer={n_jobs_outer}, inner={n_jobs_inner}")

Parallelisation : outer=24, inner=4


In [9]:
# Run every (region, threshold) combination in worker processes and append
# each result to the checkpoint CSV as soon as it is available.
failures = []  # (region, threshold, exception) for combinations that raised

# The results file is opened once in append mode (newline='' as required by
# the csv module) and flushed after every row so finished work reaches the
# file even if the run is interrupted later.
with ProcessPoolExecutor(max_workers=n_jobs_outer) as exe, \
        open(output_csv, 'a', newline='') as f:
    futures = {
        exe.submit(classify, r, t, n_jobs_inner): (r, t)
        for r, t in combos
    }
    writer = None
    for future in tqdm(as_completed(futures), total=len(futures), desc="Blocs traités"):
        region, threshold = futures[future]
        try:
            res = future.result()
        except Exception as exc:
            # A single failing combination must not abort the loop: the
            # executor would still run the remaining jobs while shutting
            # down, but their results would never be written.
            failures.append((region, threshold, exc))
            print(f"FAILED: region={region}, threshold={threshold}: {exc!r}")
            continue
        if writer is None:
            # Column order follows the keys of the result dict, as before.
            writer = csv.DictWriter(f, fieldnames=list(res.keys()))
        writer.writerow(res)
        f.flush()

print("Terminé : résultats checkpoint dans", output_csv)
if failures:
    failed_combos = [(region, threshold) for region, threshold, _ in failures]
    print(f"{len(failures)} combination(s) failed: {failed_combos}")

        
    

Blocs traités:   0%|          | 0/1 [00:00<?, ?it/s]

Début traitement : region=LARGE_CINGULATE_left, seuil=27-32
LARGE_CINGULATE_left–27-32 → AUC 0.592
Terminé : résultats checkpoint dans /neurospin/dico/rmenasria/Runs/03_main/Program/2025_rmenasria_prematurity/notebooks/racim/ABCD_prematurity_results.csv


