In [10]:
import os
import csv
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_predict
)

In [11]:
# CONFIG
labels_path = "/neurospin/dico/data/deep_folding/current/datasets/dHCP_374_subjects/participants.csv"
base_path = "/neurospin/dico/data/deep_folding/current/models/Champollion_V1_after_ablation"

# The bracket masks below are built from the birth ages, so the column is
# loaded here: this cell then runs on a fresh kernel instead of relying on a
# `labels` variable left over from a previous execution.
labels = pd.read_csv(labels_path)["birth_age"]

# Stratification and brackets: each entry is a boolean mask over the rows of
# the participants table, selecting births inside the half-open interval
# [low, high) (presumably gestational age in weeks -- TODO confirm the unit).
tranches = {
    '<28': {'mask': (labels < 28)},
    '28-32': {'mask': (labels >= 28) & (labels < 32)},
    '32-37': {'mask': (labels >= 32) & (labels < 37)}
}

In [12]:
# Participants table; `birth_age` is the column the age brackets are built from.
labels_df = pd.read_csv(labels_path)
labels = labels_df.loc[:, "birth_age"]

# Quick sanity check on the first 15 values.
print("labels", labels.iloc[:15])

labels 0     40.428571
1     40.000000
2     40.428571
3     39.857143
4     40.714286
5     39.714286
6     38.571429
7     39.285714
8     36.571429
9     39.428571
10    40.571429
11    35.714286
12    40.285714
13    41.285714
14    39.571429
Name: birth_age, dtype: float64


In [15]:
def compute_confidences_fixedC(region, threshold, base_path, labels_df,
                               best_C, n_splits=5, n_jobs=1):
    """Cross-validated "premature" probabilities for one region and age bracket.

    Loads the embeddings of ``region``, keeps the subjects whose birth age is
    either inside the requested bracket (positive class, label 1) or at/after
    the term cutoff of 37 (negative class, label 0), and returns out-of-fold
    probabilities of the positive class from a linear SVM with a fixed ``C``.

    Parameters
    ----------
    region : str
        Name of the region sub-directory under ``base_path``.
    threshold : str
        Bracket name: one of ``'<28'``, ``'28-32'``, ``'32-37'``.
    base_path : str
        Directory containing one sub-directory per region.
    labels_df : pandas.DataFrame
        Participants table with at least the columns ``Subject`` and
        ``birth_age``.
    best_C : float
        Regularisation strength passed to ``SVC``.
    n_splits : int, default 5
        Number of stratified folds.
    n_jobs : int, default 1
        Parallelism passed to ``cross_val_predict``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Subject``, ``proba_premature``, ``region``, ``threshold``,
        sorted by decreasing ``proba_premature``.

    Raises
    ------
    KeyError
        If ``threshold`` is not a known bracket name.
    RuntimeError
        If the region directory does not contain exactly one sub-directory.
    ValueError
        If one of the two classes has no subject.
    """
    # Brackets as half-open intervals [low, high); same bounds as the
    # notebook-level `tranches` config, but evaluated on `labels_df` so the
    # function does not depend on kernel globals.
    age_brackets = {
        '<28': (-np.inf, 28),
        '28-32': (28, 32),
        '32-37': (32, 37),
    }
    term_cutoff = 37
    low, high = age_brackets[threshold]

    region_path = os.path.join(base_path, region)

    # Automatically find the single model folder of the region.
    subdirs = [d for d in os.listdir(region_path) if os.path.isdir(os.path.join(region_path, d))]
    if len(subdirs) != 1:
        raise RuntimeError(f"Il y a {len(subdirs)} sous-dossiers dans {region_path}, impossible de choisir automatiquement car il y a plusieurs modèles enregistrés par région.")
    model_folder = subdirs[0]

    # Build the full path and load the embeddings (index = subject id).
    embedding_path = os.path.join(region_path, model_folder, "dHCP_random_embeddings", "full_embeddings.csv")
    embeddings = pd.read_csv(embedding_path, index_col=0)
    embeddings = embeddings.loc[embeddings.index.isin(labels_df['Subject'])]
    print("embeddings shape", embeddings.shape)
    print("embeddings columns", embeddings.columns)

    # Align birth ages to the embedding rows BY SUBJECT. A positional mask
    # built in participants-table order is only valid if both files list the
    # same subjects in the same order, which nothing guarantees.
    birth_age = (
        labels_df.drop_duplicates(subset='Subject')
        .set_index('Subject')['birth_age']
        .reindex(embeddings.index)
    )

    # Comparisons with NaN are False, so subjects without a birth age fall in
    # neither class and are dropped.
    is_preterm = ((birth_age >= low) & (birth_age < high)).to_numpy()
    is_term = (birth_age >= term_cutoff).to_numpy()
    keep = is_preterm | is_term

    X = embeddings.loc[keep]
    y = is_preterm[keep].astype(int)

    n_pos = int(y.sum())
    n_neg = int(len(y) - n_pos)
    if n_pos == 0 or n_neg == 0:
        raise ValueError(
            f"Cannot fit a classifier for region {region!r}, bracket {threshold!r}: "
            f"{n_pos} premature and {n_neg} term subjects after matching embeddings to labels."
        )

    # Create the pipeline with the best C (known from previous tuning).
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('svc',   SVC(kernel='linear', C=best_C, probability=True, class_weight='balanced'))
    ])

    # Out-of-fold probabilities: each subject is scored by a model that did
    # not see it during training.
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    proba_pos = cross_val_predict(
        pipe, X, y,
        cv=cv,
        method='predict_proba',
        n_jobs=n_jobs
    )[:, 1]

    # Prepare the output DataFrame.
    out_df = pd.DataFrame({
        'Subject':  embeddings.index[keep],
        'proba_premature': proba_pos
    }).sort_values('proba_premature', ascending=False)
    out_df['region']    = region
    out_df['threshold'] = threshold
    return out_df


In [17]:
# Region and age bracket to score.
region, threshold = "FCMpost-SpC_right", "28-32"

# Cross-validated confidences with the C value fixed to 0.1, using all cores.
df_conf = compute_confidences_fixedC(
    region, threshold, base_path, labels_df,
    best_C=0.1,
    n_jobs=-1,
)

# Save one CSV per (region, bracket) pair.
output_csv = f"/neurospin/dico/rmenasria/Runs/03_main/Program/2025_rmenasria_prematurity/notebooks/racim/dHCP_{region}_{threshold}_confidences.csv"
df_conf.to_csv(output_csv, index=False)
print("Done:", output_csv)


embeddings shape (374, 32)
embeddings columns Index(['dim1', 'dim2', 'dim3', 'dim4', 'dim5', 'dim6', 'dim7', 'dim8', 'dim9',
       'dim10', 'dim11', 'dim12', 'dim13', 'dim14', 'dim15', 'dim16', 'dim17',
       'dim18', 'dim19', 'dim20', 'dim21', 'dim22', 'dim23', 'dim24', 'dim25',
       'dim26', 'dim27', 'dim28', 'dim29', 'dim30', 'dim31', 'dim32'],
      dtype='object')


Done: /neurospin/dico/rmenasria/Runs/03_main/Program/2025_rmenasria_prematurity/notebooks/racim/dHCP_STs_right_<28_confidences.csv
