In [55]:
import os
import csv
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
from sklearn.model_selection import (
    KFold,
    GridSearchCV,
    cross_val_score,
    cross_val_predict
)

In [56]:
def get_region_list(base_path):
    """Return the sorted names of the region sub-directories of ``base_path``.

    Only directory entries are kept. Entries are discarded when their name
    starts with ``all_models``, ``hcp`` or ``ukb``, or ends with ``.csv``,
    ``.sh`` or ``embeddings``.
    """
    skipped_prefixes = ('all_models', 'hcp', 'ukb')
    skipped_suffixes = ('.csv', '.sh', 'embeddings')

    region_names = []
    for entry in os.listdir(base_path):
        # Plain files are never regions.
        if not os.path.isdir(os.path.join(base_path, entry)):
            continue
        # str.startswith / str.endswith accept a tuple of alternatives.
        if entry.startswith(skipped_prefixes) or entry.endswith(skipped_suffixes):
            continue
        region_names.append(entry)

    region_names.sort()
    return region_names

In [62]:
# CONFIG
labels_path = "/neurospin/dico/rmenasria/Runs/03_main/Input/csv_cognition_abcd/nc_y_nihtb.csv"
base_path   = "/neurospin/dico/data/deep_folding/current/models/Champollion_V1_after_ablation/embeddings/ABCD_embeddings/"

# Cognition score columns looked up in the labels table.
scores = [
 'nihtbx_flanker_agecorrected',
 'nihtbx_cardsort_agecorrected',
 'nihtbx_list_agecorrected',
 'nihtbx_pattern_agecorrected',
 'nihtbx_picvocab_agecorrected',
 'nihtbx_reading_agecorrected',
 'nihtbx_fluidcomp_agecorrected',
 'nihtbx_cryst_agecorrected',
 'nihtbx_totalcomp_agecorrected'
]

labels_df = pd.read_csv(labels_path)
print(labels_df.shape)
# Keep only the rows of the baseline event.
labels_df = labels_df[labels_df["eventname"] == "baseline_year_1_arm_1"]
print(labels_df.shape)

# Only the score columns actually present in the file are kept.
existing_scores = [s for s in scores if s in labels_df.columns]
cols = ["src_subject_id"] + existing_scores
# Explicit copy: the subject-ID column is rewritten below, and assigning into a
# selection of another frame is the pattern behind pandas' SettingWithCopyWarning.
labels_df = labels_df[cols].copy()
print(labels_df.index)

regions = get_region_list(os.path.join(base_path,"../.."))
# Folders that sit next to the region folders but are not regions themselves.
# Filtering (instead of list.remove) does not raise when one of them is absent.
non_region_dirs = {"params_OLS", "region_list", "analysis"}
regions = [r for r in regions if r not in non_region_dirs]
#regions = ["STs_right","FCLp-subsc-FCLa-INSULA_right","FCMpost-SpC_right","STi-STs-STpol_right"]
#region = regions[0]

# Normalise subject IDs: strip a leading "sub-" and remove every underscore.
labels_df['src_subject_id'] = (
    labels_df['src_subject_id'].astype(str)
    .str.replace(r"^sub-", "", regex=True)
    .str.replace("_", "", regex=False)
)

df_ages = pd.read_csv("/neurospin/dico/rmenasria/Runs/03_main/Input/ABCD/prematurity_labels_true_classes.csv")
# Keep only subjects whose prematurity class is one of the four known bins.
df_ages = df_ages[df_ages["prem_class"].isin(["<28","28-32", "32-37", ">=37"])]


(27028, 120)
(11727, 120)
Index([    0,     1,     3,     5,     7,    10,    13,    15,    18,    20,
       ...
       27001, 27003, 27006, 27008, 27010, 27012, 27015, 27018, 27021, 27024],
      dtype='int64', length=11727)


  labels_df = pd.read_csv(labels_path)


In [63]:
def preprocess_data(df, labels=None):
    """Attach the cognition scores to a per-subject table.

    Performs an inner join on ``src_subject_id``, so subjects that are missing
    from either table are dropped from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-subject table; must contain a ``src_subject_id`` column.
    labels : pandas.DataFrame, optional
        Table holding ``src_subject_id`` and the score columns. When omitted,
        the module-level ``labels_df`` is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` followed by the non-key columns of ``labels``.
    """
    if labels is None:
        # Fall back on the notebook-level table so existing calls keep working.
        labels = labels_df

    # NOTE(review): the join assumes both tables use the same ID format
    # (no "sub-" prefix, no underscores) - confirm for the caller's table.
    return df.merge(labels, on='src_subject_id', how='inner')

# Join the prematurity table with the cognition scores and save the result
# (without the index) as a CSV in the current working directory.
df = preprocess_data(df_ages)
df.to_csv("df_with_cognition_and_ages.csv", index=False)

In [None]:
from scipy.stats import pearsonr, ttest_ind

# CONFIG
data_csv = "/neurospin/dico/rmenasria/Runs/03_main/Program/2025_rmenasria_prematurity/notebooks/racim/df_with_cognition_and_ages.csv"
ga_col = "gest_age"
prem_binary_col = "devhx_12a_p"  # 1 = preterm, 0 = term
out_csv = "prematurity_stats_simple.csv"

# Load the merged table and cast the prematurity flag to integers in one chain.
df = pd.read_csv(data_csv).astype({prem_binary_col: int})
print("Loaded:", df.shape)

#Cohen's d
def cohen_d(x, y):
    """Cohen's d of ``x`` versus ``y`` using the pooled sample standard deviation.

    Returns ``np.nan`` when either group has fewer than two observations, or
    when the pooled standard deviation is zero or NaN. A positive value means
    the mean of ``x`` is larger than the mean of ``y``.
    """
    n_x = len(x)
    n_y = len(y)
    # A sample variance (ddof=1) needs at least two observations per group.
    if min(n_x, n_y) < 2:
        return np.nan

    # Past the guard above, the degrees of freedom are always >= 2.
    dof = n_x + n_y - 2
    weighted_var = (n_x - 1) * np.var(x, ddof=1) + (n_y - 1) * np.var(y, ddof=1)
    pooled_sd = np.sqrt(weighted_var / dof)

    if np.isnan(pooled_sd) or pooled_sd == 0:
        return np.nan
    return (np.mean(x) - np.mean(y)) / pooled_sd

# Minimum sample sizes below which a statistic is reported as NaN.
MIN_N_CORR = 4   # paired observations needed for a Pearson correlation
MIN_N_GROUP = 3  # observations needed in each group for the t-test / Cohen's d


def _ga_score_correlation(frame, prem_classes, score_col, ga_column):
    """Pearson correlation between gestational age and one score within some prematurity classes.

    Rows of ``frame`` whose ``prem_class`` is in ``prem_classes`` are selected
    and rows with a missing score or gestational age are dropped.

    Returns
    -------
    tuple
        ``(r, p, n)`` where ``n`` is the number of rows used; ``r`` and ``p``
        are ``np.nan`` when ``n`` is below ``MIN_N_CORR``.
    """
    in_class = frame["prem_class"].isin(prem_classes)
    subset = frame.loc[in_class, [score_col, ga_column]].dropna()
    n_obs = len(subset)
    if n_obs < MIN_N_CORR:
        return np.nan, np.nan, n_obs
    r_val, p_val = pearsonr(subset[ga_column], subset[score_col])
    return r_val, p_val, n_obs


# The loaded table only carries the score columns that existed in the labels
# file, so indexing a missing one would raise KeyError: report and skip them.
missing_scores = [s for s in scores if s not in df.columns]
if missing_scores:
    print("Skipping scores absent from the data:", missing_scores)

rows = []
for score in scores:
    if score not in df.columns:
        continue
    r_row = {'score': score}

    # Pearson correlation gestational age vs score, "28-32" class
    r_pre, p_pre, n_pre_corr = _ga_score_correlation(df, ["28-32"], score, ga_col)
    r_row['pearson_r_ga_pre'] = r_pre
    r_row['pearson_p_ga_pre'] = p_pre
    r_row['n_pre_corr'] = n_pre_corr

    # Same correlation on the "32-37" class (not the full-term subjects)
    r_small, p_small, n_small_corr = _ga_score_correlation(df, ["32-37"], score, ga_col)
    r_row['pearson_r_ga_smallpre'] = r_small
    r_row['pearson_p_ga_smallpre'] = p_small
    r_row['n_smallpre_corr'] = n_small_corr

    # Welch t-test (unequal variances): prem_binary_col == 1 vs == 0.
    # Any other value of the flag is left out of both groups.
    sub_t = df[[score, prem_binary_col]].dropna()
    grp_pre  = sub_t.loc[sub_t[prem_binary_col] == 1, score].to_numpy()
    grp_term = sub_t.loc[sub_t[prem_binary_col] == 0, score].to_numpy()
    r_row['n_pre']  = len(grp_pre)
    r_row['n_term'] = len(grp_term)

    if len(grp_pre) >= MIN_N_GROUP and len(grp_term) >= MIN_N_GROUP:
        t_stat, p_t = ttest_ind(grp_pre, grp_term, equal_var=False, nan_policy='omit')
        d = cohen_d(grp_pre, grp_term)
    else:
        t_stat, p_t, d = np.nan, np.nan, np.nan

    r_row['t_stat_pre_vs_term'] = t_stat
    r_row['p_t_pre_vs_term'] = p_t
    r_row['cohen_d_pre_vs_term'] = d

    rows.append(r_row)

# One row per analysed score; written without the index.
res_df = pd.DataFrame(rows)
res_df.to_csv(out_csv, index=False)
print("Saved summary to", out_csv)


Loaded: (9856, 14)
Saved summary to prematurity_stats_simple.csv
