In [1]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm

# === Metadata setup ===
metadata_df = pd.read_csv("/neurospin/dico/data/deep_folding/current/datasets/UkBioBank40/log_thickness_surface_Age.csv")
metadata_df = metadata_df.set_index("participant_id")
covariates = ['log_surface_total', 'log_surface_hull', 'log_thickness_freesurfer']

# === Paths ===
root_dir = "/neurospin/dico/babdelghani/Runs/02_champollion_v1/Output/SIMCLR/OCCIPITAL_left_32_16"
resid_dir = os.path.join(root_dir, "residualized")
os.makedirs(resid_dir, exist_ok=True)

# === Loop through embeddings ===
for hp_config in sorted(os.listdir(root_dir)):
    hp_path = os.path.join(root_dir, hp_config)
    if not os.path.isdir(hp_path) or hp_config == "residualized":
        continue

    for date_dir in os.listdir(hp_path):
        date_path = os.path.join(hp_path, date_dir)
        if not os.path.isdir(date_path):
            continue

        for run_dir in os.listdir(date_path):
            run_path = os.path.join(date_path, run_dir)
            embeddings_path = os.path.join(run_path, "ukb40_FCLp_no_classiffier_random_embeddings")
            csv_file = os.path.join(embeddings_path, "full_embeddings.csv")

            if not os.path.isfile(csv_file):
                print(f"⚠️  File not found: {csv_file}")
                continue

            try:
                print(f"\n➡️  Processing: {csv_file}")
                embeddings = pd.read_csv(csv_file)
                embeddings = embeddings.set_index("ID")

                # Find common subjects
                common_ids = embeddings.index.intersection(metadata_df.index)
                metadata_clean = metadata_df.loc[common_ids, covariates].dropna()

                embeddings = embeddings.loc[metadata_clean.index]
                metadata_clean = metadata_clean.loc[embeddings.index]

                X = sm.add_constant(metadata_clean.values)

                # Residualization
                residuals_list = []
                for col in embeddings.columns:
                    y = embeddings[col].values
                    model = sm.OLS(y, X).fit()
                    y_resid = model.resid
                    residuals_list.append(pd.Series(y_resid, index=metadata_clean.index, name=col))

                # Save
                residualized = pd.concat(residuals_list, axis=1)
                residualized.index.name = "ID"

                save_name = f"{hp_config}_{date_dir}_{run_dir}.csv"
                save_path = os.path.join(resid_dir, save_name)
                residualized.to_csv(save_path)
                print(f"Saved residualized embeddings to {save_path}")

            except Exception as e:
                print(f"Error processing {csv_file}: {e}")

print("\n All residualizations complete.")


⚠️  File not found: /neurospin/dico/babdelghani/Runs/02_champollion_v1/Output/SIMCLR/OCCIPITAL_left_32_16/batch_16_seed_128/2025-08-24/multirun.yaml/ukb40_FCLp_no_classiffier_random_embeddings/full_embeddings.csv

➡️  Processing: /neurospin/dico/babdelghani/Runs/02_champollion_v1/Output/SIMCLR/OCCIPITAL_left_32_16/batch_16_seed_128/2025-08-24/21-13-46_0/ukb40_FCLp_no_classiffier_random_embeddings/full_embeddings.csv
Saved residualized embeddings to /neurospin/dico/babdelghani/Runs/02_champollion_v1/Output/SIMCLR/OCCIPITAL_left_32_16/residualized/batch_16_seed_128_2025-08-24_21-13-46_0.csv
⚠️  File not found: /neurospin/dico/babdelghani/Runs/02_champollion_v1/Output/SIMCLR/OCCIPITAL_left_32_16/batch_16_seed_212/2025-08-24/multirun.yaml/ukb40_FCLp_no_classiffier_random_embeddings/full_embeddings.csv

➡️  Processing: /neurospin/dico/babdelghani/Runs/02_champollion_v1/Output/SIMCLR/OCCIPITAL_left_32_16/batch_16_seed_212/2025-08-24/21-13-46_0/ukb40_FCLp_no_classiffier_random_embeddings/full