In [5]:
import os
import csv
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_predict
)

In [6]:
# CONFIG
# CSV providing `src_subject_id` and `prem_class` for every subject.
labels_path = (
    "/neurospin/dico/rmenasria/Runs/03_main/Input/ABCD/"
    "prematurity_labels_true_classes.csv"
)
# Directory holding one embedding CSV per region (file names start with the
# region name).
base_path = (
    "/neurospin/dico/data/deep_folding/current/models/"
    "Champollion_V1_after_ablation/embeddings/ABCD_embeddings/"
)
# Destination CSV for the p-value results (going by its file name).
output_csv = (
    "/neurospin/dico/rmenasria/Runs/03_main/Program/"
    "2025_rmenasria_prematurity/notebooks/racim/"
    "ABCD_prematurity_results_pvalues.csv"
)
# `prem_class` values that can be contrasted with the ">=37" class
# (presumably gestational age in weeks -- confirm against the labels file).
thresholds = [
    "<28",
    "28-32",
    "32-37",
]

In [7]:
# Load the prematurity labels (needs at least `src_subject_id` and
# `prem_class`) and strip underscores from the subject IDs so that they use
# the same format as the cleaned embedding IDs they are merged with.
labels_df = pd.read_csv(labels_path, low_memory=False)
# regex=False: "_" is a plain literal. Being explicit keeps the result
# independent of the pandas version (the default of `regex` changed over time
# and single-character patterns trigger a FutureWarning on pandas 1.x).
labels_df['src_subject_id'] = labels_df['src_subject_id'].str.replace("_", "", regex=False)

In [8]:
def compute_confidences_fixedC(region, threshold, base_path, labels_df,
                               best_C, n_splits=5, n_jobs=1, random_state=42):
    """Return out-of-fold probabilities of belonging to a prematurity class.

    The embeddings of ``region`` are joined with ``labels_df``, restricted to
    subjects whose ``prem_class`` is either ``threshold`` (positive class,
    ``y = 1``) or ``">=37"`` (negative class, ``y = 0``), and a linear SVM
    with a fixed ``C`` is evaluated with stratified k-fold cross-validation.

    Parameters
    ----------
    region : str
        Prefix of the embedding file name to load from ``base_path``.
    threshold : str
        Value of ``prem_class`` treated as the positive class.
    base_path : str
        Directory containing the embedding CSV files. Each file must have an
        ``ID`` column and feature columns whose names start with ``dim``.
    labels_df : pandas.DataFrame
        Labels with ``src_subject_id`` (already stripped of underscores) and
        ``prem_class`` columns.
    best_C : float
        Regularisation strength of the linear SVM (known from earlier tuning).
    n_splits : int, default 5
        Number of stratified folds.
    n_jobs : int, default 1
        Passed to ``cross_val_predict``.
    random_state : int, default 42
        Seed used for the fold shuffling and for the internal probability
        calibration of the SVC, so repeated runs give identical results.

    Returns
    -------
    pandas.DataFrame
        Columns ``src_subject_id``, ``proba_premature``, ``region`` and
        ``threshold``, sorted by decreasing ``proba_premature``.

    Raises
    ------
    FileNotFoundError
        If no ``.csv`` file in ``base_path`` starts with ``region``.
    ValueError
        If the selected subjects do not contain both classes.
    """
    # os.listdir returns entries in arbitrary order; sort so that the same
    # file is always chosen when several files share the prefix.
    candidates = sorted(f for f in os.listdir(base_path)
                        if f.startswith(region) and f.endswith(".csv"))
    if not candidates:
        raise FileNotFoundError(
            f"No embedding CSV starting with {region!r} found in {base_path!r}"
        )
    emb_df = pd.read_csv(os.path.join(base_path, candidates[0]))

    # Normalise the embedding IDs (drop the "sub-" prefix and underscores) so
    # they match the label IDs.
    emb_df['ID_clean'] = (emb_df['ID']
                          .str.replace(r"^sub-", "", regex=True)
                          .str.replace("_", "", regex=False))
    # NOTE(review): duplicated subject IDs in either table would duplicate
    # rows here and could put the same subject in several folds -- verify
    # uniqueness upstream.
    df = emb_df.merge(labels_df, left_on='ID_clean', right_on='src_subject_id', how='inner')
    df = df[df['prem_class'].isin([threshold, ">=37"])].copy()
    df['y'] = (df['prem_class'] == threshold).astype(int)

    # Fail early with a readable message instead of an obscure error raised
    # from inside scikit-learn.
    if df['y'].nunique() < 2:
        raise ValueError(
            f"Need subjects from both {threshold!r} and '>=37' for region "
            f"{region!r}; class counts: {df['y'].value_counts().to_dict()}"
        )

    X   = df.filter(regex=r'^dim').values
    y   = df['y'].values
    ids = df['src_subject_id'].values

    # Scaling is part of the pipeline so it is re-fitted on every training
    # fold. random_state seeds the data shuffling that SVC performs for its
    # probability estimates.
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('svc',   SVC(kernel='linear', C=best_C, probability=True,
                      class_weight='balanced', random_state=random_state))
    ])

    # Out-of-fold probability of the positive class (column 1) for every subject
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    proba_pos = cross_val_predict(
        pipe, X, y,
        cv=cv,
        method='predict_proba',
        n_jobs=n_jobs
    )[:, 1]

    # Stable sort keeps tied probabilities in a reproducible order
    out_df = pd.DataFrame({
        'src_subject_id':  ids,
        'proba_premature': proba_pos
    }).sort_values('proba_premature', ascending=False, kind='mergesort')
    out_df['region']    = region
    out_df['threshold'] = threshold
    return out_df



In [None]:
# Region analysed in this run. Alternative region names kept for quick switching:
#   "FCLp-subsc-FCLa-INSULA_right", "FCMpost-SpC_right", "STi-STs-STpol_right",
#   "STs_right", "SFinf-BROCA-SPeCinf_left", "FColl-SRh_right"
region = "STi-SOTlat_right"

# Positive class: second entry of `thresholds`
threshold = thresholds[1]

# Out-of-fold probabilities with a fixed C; n_jobs=-1 uses all available cores
df_conf = compute_confidences_fixedC(
    region, threshold, base_path, labels_df,
    best_C=10,
    n_jobs=-1,
)

# One confidences file per (region, threshold) pair
output_csv = (
    "/neurospin/dico/rmenasria/Runs/03_main/Program/"
    "2025_rmenasria_prematurity/notebooks/racim/"
    f"{region}_{threshold}_new_confidences.csv"
)
df_conf.to_csv(output_csv, index=False)
print("Done:", output_csv)


Done: /neurospin/dico/rmenasria/Runs/03_main/Program/2025_rmenasria_prematurity/notebooks/racim/STi-SOTlat_right_28-32_new_confidences.csv


: 