In [9]:
import os
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
from sklearn.model_selection import (
    KFold,
    GridSearchCV,
    cross_val_score,
    cross_val_predict
)

In [10]:
def get_region_list(base_path):
    """Return the sorted names of the region sub-directories of ``base_path``.

    An entry is kept only if it is a directory and its name neither starts
    with one of the excluded prefixes nor ends with one of the excluded
    suffixes listed below.
    """
    excluded_prefixes = ('all_models', 'hcp', 'ukb')
    excluded_suffixes = ('.csv', '.sh', 'embeddings')

    region_dirs = []
    for entry in os.listdir(base_path):
        if not os.path.isdir(os.path.join(base_path, entry)):
            continue
        # str.startswith / str.endswith accept a tuple of alternatives.
        if entry.startswith(excluded_prefixes) or entry.endswith(excluded_suffixes):
            continue
        region_dirs.append(entry)

    region_dirs.sort()
    return region_dirs


In [12]:
# CONFIG
# Input locations: cognition labels (NIH Toolbox CSV) and the directory that
# holds one embeddings CSV per region.
labels_path = "/neurospin/dico/rmenasria/Runs/03_main/Input/cognition/nc_y_nihtb.csv"
base_path   = "/neurospin/dico/data/deep_folding/current/models/Champollion_V1_after_ablation/embeddings/ABCD_embeddings/"

# Cognition score columns to regress on.
scores = [
 'nihtbx_flanker_agecorrected',
 'nihtbx_cardsort_agecorrected',
 'nihtbx_list_agecorrected',
 'nihtbx_pattern_agecorrected',
 'nihtbx_picvocab_agecorrected',
 'nihtbx_reading_agecorrected',
 'nihtbx_fluidcomp_agecorrected',
 'nihtbx_cryst_agecorrected',
 'nihtbx_totalcomp_agecorrected'
]

labels_df = pd.read_csv(labels_path)
print(labels_df.shape)
# Keep only the baseline visit rows.
labels_df = labels_df[labels_df["eventname"] == "baseline_year_1_arm_1"]
print(labels_df.shape)

# Restrict to the subject id plus the score columns actually present in the file.
existing_scores = [s for s in scores if s in labels_df.columns]
cols = ["src_subject_id"] + existing_scores
# Explicit copy: this frame is derived from a filtered frame and is modified
# below; assigning into such a derived frame can raise SettingWithCopyWarning.
labels_df = labels_df[cols].copy()
print(labels_df.index)

# Region names are the sub-directories two levels above base_path, minus the
# helper directories that are not regions. Filtering (instead of list.remove)
# does not fail when one of these directories is absent.
non_region_dirs = {"params_OLS", "region_list", "analysis"}
regions = [
    r for r in get_region_list(os.path.join(base_path,"../.."))
    if r not in non_region_dirs
]
# For a quick run, restrict to a handful of regions, e.g.:
# regions = ["STs_right","FCLp-subsc-FCLa-INSULA_right","FCMpost-SpC_right","STi-STs-STpol_right"]

# Normalise subject ids: drop a leading "sub-" and every underscore, the same
# cleaning that is applied to the embedding ids.
labels_df['src_subject_id'] = (
    labels_df['src_subject_id'].astype(str)
    .str.replace(r"^sub-", "", regex=True)
    .str.replace("_", "", regex=False)
)

# Per-subject prematurity classes (merged on 'src_subject_id' downstream).
# NOTE(review): ids in this file are not normalised here — presumably already clean; confirm.
df_ages = pd.read_csv("/neurospin/dico/rmenasria/Runs/03_main/Input/ABCD/prematurity_labels_true_classes.csv")

  labels_df = pd.read_csv(labels_path)


(27028, 120)
(11727, 120)
Index([    0,     1,     3,     5,     7,    10,    13,    15,    18,    20,
       ...
       27001, 27003, 27006, 27008, 27010, 27012, 27015, 27018, 27021, 27024],
      dtype='int64', length=11727)


In [13]:
def load_embeddings(region):
    """Load the embeddings CSV of ``region`` from the global ``base_path``.

    The file is the first entry (in sorted order) of ``base_path`` whose name
    starts with ``region`` and ends with ``.csv``.

    Parameters
    ----------
    region : str
        Prefix of the embeddings file name.

    Returns
    -------
    pandas.DataFrame
        The CSV content plus an ``ID_clean`` column: the ``ID`` column as
        string with a leading ``sub-`` and all underscores removed.

    Raises
    ------
    FileNotFoundError
        If no file in ``base_path`` matches the region.
    """
    # Default of None makes the "not found" branch reachable (the variable was
    # previously unbound when nothing matched); sorting makes the choice
    # deterministic because os.listdir returns entries in arbitrary order.
    embedding_file = next(
        (
            name for name in sorted(os.listdir(base_path))
            if name.startswith(region) and name.endswith(".csv")
        ),
        None,
    )

    if embedding_file is None:
        raise FileNotFoundError(f"No embedding file found for region: {region}")

    print(f"Using embedding file: {embedding_file}")

    emb_path = os.path.join(base_path, embedding_file)
    emb_df = pd.read_csv(emb_path)
    # Same id normalisation as applied to the labels' subject ids.
    emb_df['ID_clean'] = (
        emb_df['ID'].astype(str)
        .str.replace(r"^sub-", "", regex=True)
        .str.replace("_", "", regex=False)
    )
    return emb_df

In [14]:
def preprocess_data(df,prem_class,score):
    """Build the design matrix and target for one score and prematurity group.

    Joins the embeddings with the global ``labels_df`` (on ``ID_clean`` /
    ``src_subject_id``) and the global ``df_ages`` (on ``src_subject_id``),
    keeps the rows whose ``prem_class`` is in ``prem_class`` and drops rows
    where ``score`` is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Embeddings with an ``ID_clean`` column and feature columns whose names
        start with ``dim``.
    prem_class : list
        Values of the ``prem_class`` column to keep.
    score : str
        Name of the target column.

    Returns
    -------
    X : numpy.ndarray
        Values of the columns whose name starts with ``dim``.
    y : numpy.ndarray
        Values of the ``score`` column for the same rows.
    """
    merged = df.merge(
        labels_df,
        left_on='ID_clean', right_on='src_subject_id', how='inner'
    )
    merged = merged.merge(df_ages, on='src_subject_id', how='inner')
    print(df.columns)

    in_class = merged[merged["prem_class"].isin(prem_class)]
    print("test",in_class.shape)

    # dropna returns a new frame, so nothing is written into a filtered slice
    # (avoids chained assignment / in-place mutation of the selection).
    complete = in_class.dropna(subset=[score])

    X = complete.filter(regex=r'^dim').values
    y = complete[score].values
    print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")

    return X, y

In [21]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from scipy.stats import pearsonr

def regress_on_cognition_bootstrap(score, region, prem_class, n_boot=750, random_state=42):
    """Estimate out-of-bag R² of a ridge regression from embeddings to a score.

    For each of ``n_boot`` iterations, a bootstrap sample (drawn with
    replacement) is used to fit a scaler + Ridge pipeline whose ``alpha`` is
    selected by a 5-fold grid search; the fitted model is then scored with R²
    on the samples left out of that bootstrap draw.

    Parameters
    ----------
    score : str
        Target column name, forwarded to ``preprocess_data``.
    region : str
        Region name, forwarded to ``load_embeddings``.
    prem_class : list
        Prematurity classes to keep, forwarded to ``preprocess_data``.
    n_boot : int, default 750
        Number of bootstrap iterations.
    random_state : int, default 42
        Seed of the NumPy generator used for the bootstrap draws.

    Returns
    -------
    dict
        Keys ``region``, ``score``, ``n`` (number of samples), ``r2_mean``
        (NaN-ignoring mean of the out-of-bag R²) and ``r2_ci`` (2.5th and
        97.5th NaN-ignoring percentiles).

    Raises
    ------
    ValueError
        If fewer samples are available than cross-validation folds.
    """
    rng = np.random.default_rng(random_state)

    emb_df = load_embeddings(region)
    X, y = preprocess_data(emb_df, prem_class, score)
    n = len(y)

    n_splits = 5
    if n < n_splits:
        # The grid search could not split the data anyway; fail early with an
        # explicit message instead of an error from deep inside the fit.
        raise ValueError(
            f"Not enough samples (n={n}) for {n_splits}-fold cross-validation "
            f"(region={region}, score={score}, prem_class={prem_class})"
        )

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', Ridge())
    ])

    grid = GridSearchCV(
        pipe, {'regressor__alpha': [0.01, 0.1, 1, 10]},
        cv=n_splits, scoring='r2', n_jobs=-1
    )

    r2_list = []

    for _ in range(n_boot):
        # Bootstrap draw with replacement.
        idx_boot = rng.integers(0, n, size=n)
        X_boot, y_boot = X[idx_boot], y[idx_boot]

        # Out-of-bag indices: samples never drawn in this iteration.
        oob_mask = np.ones(n, dtype=bool)
        oob_mask[idx_boot] = False
        oob_idx = np.flatnonzero(oob_mask)

        # Fit the model (alpha chosen by the grid search).
        grid.fit(X_boot, y_boot)

        if oob_idx.size >= 2:
            # Evaluate on the out-of-bag samples.
            y_oob_pred = grid.predict(X[oob_idx])
            r2 = r2_score(y[oob_idx], y_oob_pred)
        else:
            # R² is undefined with fewer than two samples: record NaN, which
            # the NaN-aware aggregates below ignore.
            r2 = np.nan

        r2_list.append(r2)

    r2_arr = np.array(r2_list)

    results = {
        "region": region,
        "score": score,
        "n": n,
        "r2_mean": np.nanmean(r2_arr),
        "r2_ci": np.nanpercentile(r2_arr, [2.5, 97.5]),
    }

    print(f"Bootstrap results for {region}, {score}:")
    print("R² mean:", results["r2_mean"], "95% CI:", results["r2_ci"])

    return results


In [25]:
print(len(regions))


def _bootstrap_all_regions(prem_class, out_path):
    """Run the bootstrap regression for every (region, score) pair of one group.

    Writes the collected result dicts to ``out_path`` as CSV (without index)
    and returns ``(records, frame)``: the list of result dicts and the
    DataFrame built from it.
    """
    # existing_scores: only the score columns actually present in the labels
    # file, so a missing column cannot break a long run half-way through.
    records = [
        regress_on_cognition_bootstrap(score, region, prem_class)
        for region in regions
        for score in existing_scores
    ]
    frame = pd.DataFrame(records)
    frame.to_csv(out_path, index=False)
    return records, frame


# Preterm group (classes "28-32" and "32-37").
out_path = "r2boostrap_prema_all_regions.csv"
results, results_df = _bootstrap_all_regions(["28-32","32-37"], out_path)

# Term group (class ">=37").
out_path = "r2boostrap_terms_all_regions.csv"
results_terms, results_df_terms = _bootstrap_all_regions([">=37"], out_path)

# Extremely preterm group (class "<28").
out_path = "r2boostrap_extrprema_all_regions.csv"
results_extr_prem, results_df_extr_prem = _bootstrap_all_regions(["<28"], out_path)



Using embedding file: STs_right_name08-32-58--52_embeddings.csv
Index(['ID', 'dim1', 'dim2', 'dim3', 'dim4', 'dim5', 'dim6', 'dim7', 'dim8',
       'dim9', 'dim10', 'dim11', 'dim12', 'dim13', 'dim14', 'dim15', 'dim16',
       'dim17', 'dim18', 'dim19', 'dim20', 'dim21', 'dim22', 'dim23', 'dim24',
       'dim25', 'dim26', 'dim27', 'dim28', 'dim29', 'dim30', 'dim31', 'dim32',
       'ID_clean'],
      dtype='object')
test (8532, 48)
Shape of X: (8521, 32), Shape of y: (8521,)
Bootstrap results for STs_right, nihtbx_flanker_agecorrected:
R² mean: -0.0009526571670887441 95% CI: [-0.00852363  0.00521795]
{'region': 'STs_right', 'score': 'nihtbx_flanker_agecorrected', 'n': 8521, 'r2_mean': -0.0009526571670887441, 'r2_ci': array([-0.00852363,  0.00521795])}
Using embedding file: STs_right_name08-32-58--52_embeddings.csv
Index(['ID', 'dim1', 'dim2', 'dim3', 'dim4', 'dim5', 'dim6', 'dim7', 'dim8',
       'dim9', 'dim10', 'dim11', 'dim12', 'dim13', 'dim14', 'dim15', 'dim16',
       'dim17', 'dim