In [1]:
import os
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from scipy.stats import pearsonr
from sklearn.model_selection import (
    KFold,
    GridSearchCV,
    cross_val_score,
    cross_val_predict
)

In [3]:
def get_region_list(base_path):
    """Return the sorted names of the sub-directories of ``base_path`` kept as regions.

    An entry is kept when it is a directory and its name neither starts with
    'all_models', 'hcp' or 'ukb' nor ends with '.csv', '.sh' or 'embeddings'.
    """
    excluded_prefixes = ('all_models', 'hcp', 'ukb')
    excluded_suffixes = ('.csv', '.sh', 'embeddings')

    region_names = []
    for entry in os.listdir(base_path):
        if not os.path.isdir(os.path.join(base_path, entry)):
            continue
        # str.startswith / str.endswith accept a tuple of candidates.
        if entry.startswith(excluded_prefixes) or entry.endswith(excluded_suffixes):
            continue
        region_names.append(entry)

    region_names.sort()
    return region_names

In [5]:
# CONFIG
labels_path = "/neurospin/dico/rmenasria/Runs/03_main/Input/cognition/nc_y_nihtb.csv"
base_path   = "/neurospin/dico/data/deep_folding/current/models/Champollion_V1_after_ablation/embeddings/ABCD_embeddings/"

# Candidate cognitive score columns; only those present in the labels file are kept below.
scores = [
 'nihtbx_flanker_agecorrected',
 'nihtbx_cardsort_agecorrected',
 'nihtbx_list_agecorrected',
 'nihtbx_pattern_agecorrected',
 'nihtbx_picvocab_agecorrected',
 'nihtbx_reading_agecorrected',
 'nihtbx_fluidcomp_agecorrected',
 'nihtbx_cryst_agecorrected',
 'nihtbx_totalcomp_agecorrected'
]

# low_memory=False lets pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so columns do not end up with mixed types
# (the default chunked parsing warns about this on wide files).
labels_df = pd.read_csv(labels_path, low_memory=False)
print(labels_df.shape)
labels_df = labels_df[labels_df["eventname"] == "baseline_year_1_arm_1"]
print(labels_df.shape)

existing_scores = [s for s in scores if s in labels_df.columns]
cols = ["src_subject_id"] + existing_scores
# Explicit copy: the ID column is rewritten below, and assigning into a slice
# of the filtered frame would be a chained assignment.
labels_df = labels_df[cols].copy()
print(labels_df.index)

regions = get_region_list(os.path.join(base_path,"../.."))
# Entries excluded from the region list. Filtering (instead of list.remove)
# does not raise ValueError when one of them is absent from the directory.
non_region_dirs = {"params_OLS", "region_list", "analysis"}
regions = [r for r in regions if r not in non_region_dirs]
#regions = ["STs_right","FCLp-subsc-FCLa-INSULA_right","FCMpost-SpC_right","STi-STs-STpol_right"]
#region = regions[0]

# Normalise subject IDs the same way load_embeddings builds 'ID_clean':
# strip a leading "sub-" and remove every underscore.
labels_df['src_subject_id'] = (
    labels_df['src_subject_id'].astype(str)
    .str.replace(r"^sub-", "", regex=True)
    .str.replace("_", "", regex=False)
)

df_ages = pd.read_csv("/neurospin/dico/rmenasria/Runs/03_main/Input/ABCD/prematurity_labels_true_classes.csv")

(27028, 120)
(11727, 120)
Index([    0,     1,     3,     5,     7,    10,    13,    15,    18,    20,
       ...
       27001, 27003, 27006, 27008, 27010, 27012, 27015, 27018, 27021, 27024],
      dtype='int64', length=11727)


  labels_df = pd.read_csv(labels_path)


In [6]:
def load_embeddings(region, embeddings_dir=None):
    """Load the embedding CSV of one region and add a normalised subject ID column.

    Parameters
    ----------
    region : str
        Prefix of the embedding file name to look for.
    embeddings_dir : str, optional
        Directory to search. Defaults to the notebook-level ``base_path``.

    Returns
    -------
    pandas.DataFrame
        Content of the CSV plus an 'ID_clean' column: 'ID' cast to string with
        a leading "sub-" stripped and all underscores removed.

    Raises
    ------
    FileNotFoundError
        If no file in the directory starts with ``region`` and ends with ".csv".
    """
    if embeddings_dir is None:
        embeddings_dir = base_path

    # Default of None makes the "not found" branch reachable (the name used to
    # be unbound when nothing matched). sorted() makes the pick deterministic
    # when several files match, since os.listdir order is arbitrary.
    embedding_file = next(
        (
            name for name in sorted(os.listdir(embeddings_dir))
            if name.startswith(region) and name.endswith(".csv")
        ),
        None,
    )
    if embedding_file is None:
        raise FileNotFoundError(f"No embedding file found for region: {region}")

    print(f"Using embedding file: {embedding_file}")

    emb_df = pd.read_csv(os.path.join(embeddings_dir, embedding_file))
    emb_df['ID_clean'] = (
        emb_df['ID'].astype(str)
        .str.replace(r"^sub-", "", regex=True)
        .str.replace("_", "", regex=False)
    )
    return emb_df

In [24]:
def preprocess_data(df, prem_class, score, labels=None, ages=None):
    """Join embeddings with cognition scores and prematurity classes, then build X and y.

    Parameters
    ----------
    df : pandas.DataFrame
        Embeddings with an 'ID_clean' column and feature columns whose names
        start with 'dim'.
    prem_class : str or list of str
        Value(s) of the 'prem_class' column to keep.
    score : str
        Name of the score column used as regression target.
    labels : pandas.DataFrame, optional
        Score table keyed by 'src_subject_id'. Defaults to the notebook-level
        ``labels_df``.
    ages : pandas.DataFrame, optional
        Table keyed by 'src_subject_id' with a 'prem_class' column. Defaults to
        the notebook-level ``df_ages``.

    Returns
    -------
    X : numpy.ndarray
        Values of the 'dim*' columns, one row per retained subject.
    y : numpy.ndarray
        Values of ``score`` for the same rows.
    merged : pandas.DataFrame
        The joined frame restricted to the retained rows, with an extra 'y'
        column holding the target.
    """
    if labels is None:
        labels = labels_df
    if ages is None:
        ages = df_ages
    # A bare string would make Series.isin raise; treat it as a single class.
    if isinstance(prem_class, str):
        prem_class = [prem_class]

    merged = (
        df.merge(labels, left_on='ID_clean', right_on='src_subject_id', how='inner')
          .merge(ages, on='src_subject_id', how='inner')
    )
    print(df.columns)
    merged = merged[merged["prem_class"].isin(prem_class)]
    print("test", merged.shape)

    # Build the final frame without mutating the filtered slice: the previous
    # column assignment + inplace dropna acted on a view-like slice of the
    # merged frame (chained assignment).
    merged = merged.dropna(subset=[score]).assign(y=lambda frame: frame[score])

    X = merged.filter(regex=r'^dim').values
    y = merged['y'].values
    print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")

    return X, y, merged
    


In [25]:
def regress_on_cognition(score,region,prem_class):
    """Ridge-regress a cognitive score on a region's embeddings and rank subjects.

    Parameters
    ----------
    score : str
        Name of the score column used as regression target.
    region : str
        Region name passed to ``load_embeddings``.
    prem_class : list of str
        Prematurity classes to keep, passed to ``preprocess_data``.

    Returns
    -------
    pandas.DataFrame
        One row per subject with 'subject_id', 'y_true', 'y_pred' (held-out
        prediction), sorted by decreasing 'y_pred', plus the constant columns
        'region', 'score', 'best_alpha' and 'n'.
    """
    emb_df = load_embeddings(region)
    X, y, df_aligned = preprocess_data(emb_df, prem_class, score)

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', Ridge())
    ])

    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    grid = GridSearchCV(
        pipe, {'regressor__alpha': [0.01, 0.1, 1, 10]},
        cv=cv, scoring='r2', n_jobs=-1
    )

    # Fit on all samples only to report the alpha selected on the full data set.
    grid.fit(X, y)
    best_alpha = grid.best_params_['regressor__alpha']

    # Nested CV: the grid search is re-run inside every outer training fold.
    cv_scores = cross_val_score(grid, X, y,
                                cv=cv, scoring='r2', n_jobs=-1)
    print(f"Results for region {region} and score {score}:")
    print(cv_scores.mean(), cv_scores.std())

    # cross-validated predictions for pearson
    # Predict with the grid itself rather than grid.best_estimator_: the latter
    # carries an alpha tuned on every sample, including the ones being
    # predicted. With the grid, alpha is tuned on each training fold only, in
    # line with the nested R² printed above. Consequently 'best_alpha' below is
    # the full-data choice and may differ from the alpha used in a given fold.
    y_pred = cross_val_predict(grid, X, y, cv=cv, n_jobs=-1)

    # keep the subject IDs
    ranked = pd.DataFrame({
        "subject_id": df_aligned["ID_clean"].values,
        "y_true": y,
        "y_pred": y_pred
    }).sort_values("y_pred", ascending=False)

    # add general info
    ranked["region"] = region
    ranked["score"] = score
    ranked["best_alpha"] = best_alpha
    ranked["n"] = len(y)

    return ranked


In [31]:
# Score / region pair to analyse; alternatives are kept commented out.
score = "nihtbx_picvocab_agecorrected"
region = "FColl-SRh_right"
#region = "FPO-SCu-ScCal_right"
#score = "nihtbx_pattern_agecorrected"

# Stored under its own name: assigning the result to `scores` would replace the
# list of score column names defined in the config cell with a DataFrame.
ranked_subjects = regress_on_cognition(score,region,[">=37"])
ranked_subjects.to_csv(f"/neurospin/dico/rmenasria/Runs/03_main/Output/csv/cognitive/direction/direction_fullterms_{score}_{region}.csv")

Using embedding file: FColl-SRh_right_name06-56-15--113_embeddings.csv
Index(['ID', 'dim1', 'dim2', 'dim3', 'dim4', 'dim5', 'dim6', 'dim7', 'dim8',
       'dim9', 'dim10', 'dim11', 'dim12', 'dim13', 'dim14', 'dim15', 'dim16',
       'dim17', 'dim18', 'dim19', 'dim20', 'dim21', 'dim22', 'dim23', 'dim24',
       'dim25', 'dim26', 'dim27', 'dim28', 'dim29', 'dim30', 'dim31', 'dim32',
       'ID_clean'],
      dtype='object')
test (8532, 48)
Shape of X: (8528, 32), Shape of y: (8528,)
Results for region FColl-SRh_right and score nihtbx_picvocab_agecorrected:
0.03692950523899778 0.010620843436806119


: 