In [3]:
import os
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

# === Paths ===
tiv_path = "/neurospin/dico/data/deep_folding/current/datasets/UkBioBank40/participants_tiv_volumes_normalized.csv"
embedding_folder = "/home/cb283697/Bureau/SC_right_ukb_embeddings_tiv/dim_32_tiv"
output_folder = embedding_folder + "_residualized"

# === Load TIV data ===
df_tiv = pd.read_csv(tiv_path)
# Normalize headers so "participant_id" and "volume" can be addressed
# regardless of surrounding whitespace or casing in the CSV.
df_tiv.columns = df_tiv.columns.str.strip().str.lower()
# Rows without a volume are removed BEFORE de-duplicating: drop_duplicates
# keeps the first occurrence, so doing it first could retain a NaN row and
# then lose the subject entirely even though a later duplicate was valid.
df_tiv = (
    df_tiv
    .dropna(subset=["volume"])
    .drop_duplicates(subset="participant_id")
    .set_index("participant_id")
)

# === Create output directory ===
os.makedirs(output_folder, exist_ok=True)

# === Function to residualize one embedding file ===
def residualize_embeddings(embedding_path, tiv_df, output_path):
    """Regress TIV out of every embedding column and save the residuals.

    The embedding CSV is read with its first column as the subject index and
    restricted to the subjects that are also present in ``tiv_df``. Every
    remaining column is z-scored, fitted with an OLS model whose regressors
    are a constant and the z-scored ``volume``, and replaced by the residuals
    of that fit. The result is written to ``output_path`` with the subject
    identifiers in a leading ``ID`` column.

    Parameters
    ----------
    embedding_path : str
        CSV file whose first column holds the subject identifiers.
    tiv_df : pandas.DataFrame
        Frame indexed by subject identifier, containing a ``volume`` column.
    output_path : str
        Destination CSV path.

    Raises
    ------
    ValueError
        If none of the subjects in the embedding file is found in ``tiv_df``.
    """
    embeddings = pd.read_csv(embedding_path, index_col=0)

    # Keep only the subjects that also have a TIV value. Fail early with an
    # explicit message instead of a cryptic scaler error when nothing matches
    # (typically an identifier format/dtype mismatch between the two files).
    shared_ids = embeddings.index.intersection(tiv_df.index)
    if shared_ids.empty:
        raise ValueError(
            f"No subject in {embedding_path} matches the TIV index; "
            "check that both files use the same identifier format."
        )
    embeddings = embeddings.loc[shared_ids]
    tiv = tiv_df.loc[embeddings.index][["volume"]]

    # Standardize the embeddings and the TIV regressor independently.
    embeddings_std = pd.DataFrame(
        StandardScaler().fit_transform(embeddings),
        index=embeddings.index,
        columns=embeddings.columns,
    )
    # Design matrix shared by all fits: constant term + z-scored TIV.
    design = sm.add_constant(StandardScaler().fit_transform(tiv))

    # One OLS fit per embedding column; residuals are stored positionally
    # because they are in the same row order as `embeddings_std`.
    residuals = {}
    for col in embeddings_std.columns:
        fit = sm.OLS(embeddings_std[col], design).fit()
        residuals[col] = fit.resid.to_numpy()
    residualized = pd.DataFrame(residuals, index=embeddings_std.index)

    # Name the index explicitly so the identifier column is always "ID",
    # whatever name (or lack of name) the index carried through the
    # intersection/selection steps above.
    residualized = residualized.rename_axis("ID").reset_index()

    residualized.to_csv(output_path, index=False)
    print(f"✅ Residualized and saved: {output_path}")

# === Residualize every CSV file of the embedding folder ===
print(f"\n📁 Processing folder: {embedding_folder}")

# Entries are visited in sorted order; anything that is not a CSV is skipped.
folder_entries = sorted(os.listdir(embedding_folder))
for file in folder_entries:
    if not file.endswith(".csv"):
        continue
    # The residualized file keeps the name of its source file.
    embed_path = os.path.join(embedding_folder, file)
    output_path = os.path.join(output_folder, file)
    residualize_embeddings(embed_path, df_tiv, output_path)

print("\n✅ All embeddings residualized by TIV with 'ID' column included.")



📁 Processing folder: /home/cb283697/Bureau/SC_right_ukb_embeddings_tiv/dim_32_tiv
✅ Residualized and saved: /home/cb283697/Bureau/SC_right_ukb_embeddings_tiv/dim_32_tiv_residualized/full_embeddings_dim_32_batch_128_sigma_0.01.csv
✅ Residualized and saved: /home/cb283697/Bureau/SC_right_ukb_embeddings_tiv/dim_32_tiv_residualized/full_embeddings_dim_32_batch_128_sigma_0.05.csv
✅ Residualized and saved: /home/cb283697/Bureau/SC_right_ukb_embeddings_tiv/dim_32_tiv_residualized/full_embeddings_dim_32_batch_64_sigma_0.01.csv
✅ Residualized and saved: /home/cb283697/Bureau/SC_right_ukb_embeddings_tiv/dim_32_tiv_residualized/full_embeddings_dim_32_batch_64_sigma_0.05.csv

✅ All embeddings residualized by TIV with 'ID' column included.


In [6]:
import pandas as pd

# === Paths ===
embedding_path = "/neurospin/dico/jlaval/Output/SC-sylv_right_V1/V1_TrimExtremities_p80/ukb40_random_embeddings/full_embeddings.csv"
tiv_path = "/neurospin/dico/data/deep_folding/current/datasets/UkBioBank40/participants_tiv_volumes_normalized.csv"
output_path = "/home/cb283697/Bureau/SC_right_ukb_embeddings_tiv/V1/full_embeddings_tiv.csv"

# === Load data ===
embeddings = pd.read_csv(embedding_path)
tiv = pd.read_csv(tiv_path)

# === Normalize column names ===
tiv.columns = tiv.columns.str.strip().str.lower()

# === Keep one valid TIV row per subject ===
# Missing volumes are dropped BEFORE de-duplicating: drop_duplicates keeps the
# first occurrence, so the reverse order could keep a NaN row and then lose a
# subject that has a valid volume in a later duplicate row.
tiv = tiv.dropna(subset=["volume"]).drop_duplicates(subset="participant_id")

# === Merge on subject ID ===
# Inner join: only embedding rows with a known TIV are kept, in the order of
# the embedding file.
merged = embeddings.merge(
    tiv[["participant_id", "volume"]],
    left_on="ID", right_on="participant_id", how="inner"
)

# === Drop the duplicated key column and store TIV as an extra dimension ===
# NOTE(review): "dim33" presumes the embedding file has 32 dimensions — confirm.
merged = (
    merged
    .drop(columns=["participant_id"])
    .rename(columns={"volume": "dim33"})
)

# === Save the result ===
merged.to_csv(output_path, index=False)
print(f"✅ Saved merged file with TIV to: {output_path}")


✅ Saved merged file with TIV to: /home/cb283697/Bureau/SC_right_ukb_embeddings_tiv/V1/full_embeddings_tiv.csv
