# 01. Data-Setup

Run this entire notebook from top to bottom to generate the pickle file (`./data/data.pkl`) that is used in later notebooks.


In [None]:
import os
import re
import pandas as pd
import numpy as np
from src.graph_theory import append_connectome_data, append_gt_data
from scipy.signal import resample

import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.vectors import FloatVector, StrVector
from rpy2.robjects.packages import importr

# When True, subjects labelled 'MCI' are relabelled as 'AD' while loading.
MCI_IS_AD = True
# When True, the Diagnosis and Study labels are randomly permuted before the
# data is saved.
NULL_MODEL = False

### Load Subject Information


In [None]:
# Input root: the "data" folder under the current working directory.
data_path = os.getcwd() + "/data"
# Folder names of the form "<STUDY>-<DIAGNOSIS>-n<digits>"; group 1 is the
# study, group 2 the diagnosis.
subdirectories = re.compile(r"^(ADNI|MUSC|TBI|MCI)-(.*)-n\d+$")

subject_info_df = pd.read_csv(data_path + "/subjects.csv")
subject_labels = subject_info_df['Subject']
# Trailing digits of the label are the subject ID; leading digits followed by
# an underscore (when present) are the site.
subject_info_df['Subject_ID'] = subject_labels.str.extract(r'(\d+)$')
subject_info_df['Site'] = subject_labels.str.extract(r'^(\d+)_')


# Labels without a site prefix get a placeholder site code chosen by study.
# MUSC site is 999
fallback_site_by_study = {'C4D': 999, 'IAM': 999, 'MUSC': 999, 'DOD': 888}
for study_name, site_code in fallback_site_by_study.items():
    needs_site = subject_info_df['Site'].isna() & (subject_info_df['Study'] == study_name)
    subject_info_df.loc[needs_site, 'Site'] = site_code

if MCI_IS_AD:
    # Treat MCI as AD for classification
    is_mci = subject_info_df['Diagnosis'] == 'MCI'
    subject_info_df.loc[is_mci, 'Diagnosis'] = 'AD'

print(subject_info_df.to_markdown())

### Load Subject Data


In [None]:
# data directory has TBI/MUSC/ADNI subdirectories, POS/NEG each
#if "data_df" not in locals():
columns = ["Subject_ID", "Diagnosis", "Study", "Data"]

# Smallest n observations is 140; longer series are resampled down to this length.
N_TIMEPOINTS = 140
# Per-subject files look like "<digits><sep>tts_all.csv"; the digits are the subject ID.
# NOTE(review): the "." before "tts_all" is unescaped and matches any single
# character - presumably deliberate to allow different separators; confirm.
subject_file_pattern = re.compile(r"(\d+).tts_all\.csv")

# Rows are collected in a list and turned into a DataFrame once at the end;
# concatenating onto the frame inside the loop re-copies it for every file.
rows = []
for root, _dirs, files in os.walk(data_path):
    match = subdirectories.search(os.path.basename(root))
    if not match:
        continue

    study = match.group(1)
    diagnosis = match.group(2)
    if MCI_IS_AD and diagnosis == 'MCI':
        diagnosis = 'AD'

    for file in files:
        if not file.endswith(".csv") or file == "subjects.csv":
            continue
        subject_id_match = subject_file_pattern.search(file)
        if not subject_id_match:
            continue

        # The capture group is digits only, so no further clean-up is needed.
        subject_id = subject_id_match.group(1)
        file_path = os.path.join(root, file)

        data_matrix = pd.read_csv(file_path).values
        if data_matrix.shape[0] > N_TIMEPOINTS:
            data_matrix = resample(data_matrix, N_TIMEPOINTS)

        # Look up this subject's demographics by Subject_ID; subjects missing
        # from subjects.csv keep None for Age, Sex and Site.
        subject_row = subject_info_df.loc[subject_info_df['Subject_ID'] == subject_id]
        if subject_row.empty:
            age = sex = site = None
        else:
            age = subject_row['Age (yrs)'].values[0]
            sex = subject_row['Sex'].values[0]
            site = subject_row['Site'].values[0]

        rows.append({
            "Subject_ID": subject_id,
            "Diagnosis": diagnosis,
            "Study": study,
            "Age": age,
            "Sex": sex,
            "Site": site,
            "Data": data_matrix,
        })

# Explicit column list keeps the same column order the incremental concat produced.
data_df = pd.DataFrame(rows, columns=columns + ["Age", "Sex", "Site"])
data_df = append_connectome_data(data_df)
print(data_df.loc[:, (data_df.columns != 'Data') & (data_df.columns != 'Connectome')].to_markdown())

### Harmonization


In [None]:
# Harmonize those sites with more than 2 subjects
# Site 888 is left out of both the counting and the selected subset.
not_site_888 = data_df['Site'] != 888
site_counts = data_df.loc[not_site_888, 'Site'].value_counts()
valid_sites = site_counts.loc[site_counts > 2].index
harmonize_data = data_df.loc[data_df['Site'].isin(valid_sites) & not_site_888].copy()
# Site labels are compared as strings from here on.
harmonize_data['Site'] = harmonize_data['Site'].astype(str)

# Unpack connectome edges into a vector
def upper_triangle_flatten(matrix):
    """Return the entries strictly above the diagonal of ``matrix`` as a 1-D array.

    Entries come out in the row-major order given by ``np.triu_indices``.
    """
    row_idx, col_idx = np.triu_indices(matrix.shape[0], k=1)
    return matrix[row_idx, col_idx]

# Connectome width, read from the first subject in the subset.
# Positional access is required here: harmonize_data keeps data_df's index
# labels after filtering, so the label 0 does not exist whenever the first row
# of data_df was filtered out, and `[0]` would raise KeyError.
n_features = harmonize_data['Connectome'].iloc[0].shape[1]
# Replace each square connectome with its flattened strict upper triangle and
# stack them into one (subjects x edges) matrix.
harmonize_data['Connectome'] = harmonize_data['Connectome'].apply(upper_triangle_flatten)
connectome_matrix = np.vstack(harmonize_data['Connectome'].values)
harmonize_data

In [None]:
# Toggle for the R-based site harmonization. When False this cell does nothing
# and the reconstruction step later in the notebook uses the unharmonized
# edge vectors instead.
DO_HARMONY = False
if DO_HARMONY:
    # Enable rpy2's pandas conversion, then source the project's R script.
    pandas2ri.activate()
    comfam_path = "./src/R/comfam.R"
    ro.r['source'](comfam_path)

    # `comfam` is fetched from R's global environment - presumably defined by
    # the script sourced above; confirm in comfam.R.
    comfam = ro.globalenv['comfam']

    num_subjects, num_features = connectome_matrix.shape

    # numpy's flatten() is row-major, so byrow=True preserves the
    # (subjects x edges) layout in the R matrix.
    connectome_r = ro.r.matrix(FloatVector(connectome_matrix.flatten()), nrow=num_subjects, ncol=num_features, byrow=True)
    site_r = ro.r['factor'](StrVector(harmonize_data['Site'].astype(str).values))

    # Covariates passed to the model: Age (numeric) and Sex (factor).
    age_r = FloatVector(harmonize_data['Age'].astype(float).values)
    sex_r = ro.r['factor'](StrVector(harmonize_data['Sex'].astype(str).values))
    covar_df_r = ro.DataFrame({'Age': age_r, 'Sex': sex_r})
    formula_r = ro.r('y ~ Age + Sex')

    # NOTE(review): second lookup of `comfam`; redundant with the one above.
    comfam = ro.globalenv['comfam']
    # Fit on the selected sites using R's `lm` with the Age + Sex formula.
    com_out = comfam(connectome_r, site_r, covar=covar_df_r, model=ro.r['lm'], formula=formula_r)

    # Apply weights to TBI data
    # The TBI rows are those with Site == 888, which were excluded from the fit.
    tbi_data = data_df[data_df['Site'] == 888].copy()
    tbi_data['Site'] = tbi_data['Site'].astype(str)
    tbi_data['Connectome'] = tbi_data['Connectome'].apply(upper_triangle_flatten)
    tbi_connectome_matrix = np.vstack(tbi_data['Connectome'].values)

    # Same numpy -> R conversion as above, for the TBI subset.
    num_tbi_subjects, num_features = tbi_connectome_matrix.shape
    tbi_connectome_r = ro.r.matrix(FloatVector(tbi_connectome_matrix.flatten()), nrow=num_tbi_subjects, ncol=num_features, byrow=True)
    tbi_site_r = ro.r['factor'](StrVector(tbi_data['Site'].astype(str).values))

    age_r = FloatVector(tbi_data['Age'].astype(float).values)
    sex_r = ro.r['factor'](StrVector(tbi_data['Sex'].astype(str).values))
    covar_df_r = ro.DataFrame({'Age': age_r, 'Sex': sex_r})

    # R's generic `predict` is called on the fitted object - presumably
    # dispatching to a predict method for comfam output; confirm in comfam.R.
    comfam_predict = ro.r['predict']
    tbi_harmonized_r = comfam_predict(com_out, tbi_connectome_r, tbi_site_r, newcovar=covar_df_r)
    # Pull the `dat.combat` element of the result back into NumPy.
    tbi_harmonized = np.array(tbi_harmonized_r.rx2('dat.combat'))

In [None]:
# Show the harmonization subset as this cell's output.
harmonize_data

In [None]:
def reconstruct_connectome(vector, size=n_features):
    """Rebuild a symmetric ``size`` x ``size`` matrix from its flattened strict upper triangle.

    ``vector`` supplies the above-diagonal entries in the order of
    ``np.triu_indices(size, k=1)``. The diagonal of the result is zero. The
    default ``size`` is the value ``n_features`` had when this function was
    defined.
    """
    row_idx, col_idx = np.triu_indices(size, k=1)
    upper = np.zeros((size, size))
    upper[row_idx, col_idx] = vector
    # Mirror the upper triangle into the lower one.
    return upper + upper.T

# Edge vectors to rebuild: the `dat.combat` output when harmonization ran,
# otherwise the flattened, unharmonized connectomes.
harmonized_connectome = (
    np.array(com_out.rx2('dat.combat')) if DO_HARMONY else harmonize_data["Connectome"]
)

# Back to square matrices, keyed by subject and attached as a new column.
reconstructed_connectomes = list(map(reconstruct_connectome, harmonized_connectome))
harmonized_dict = dict(zip(harmonize_data['Subject_ID'], reconstructed_connectomes))
data_df['Harmonized'] = data_df['Subject_ID'].map(harmonized_dict)

if DO_HARMONY:
    # Fill the entries still missing with the harmonized TBI connectomes.
    reconstructed_tbi = list(map(reconstruct_connectome, tbi_harmonized))
    tbi_dict = dict(zip(tbi_data['Subject_ID'], reconstructed_tbi))
    tbi_mapped = data_df['Subject_ID'].map(tbi_dict)
    data_df['Harmonized'] = data_df['Harmonized'].combine_first(tbi_mapped)

data_df = append_gt_data(data_df, harmonized=True)

if NULL_MODEL:
    # Shuffle each label column independently.
    for label_column in ('Diagnosis', 'Study'):
        data_df[label_column] = np.random.permutation(data_df[label_column].values)

# Printable summary: swap the array-valued columns for the shape of the
# harmonized matrix (None where no array is present).
clone_df = data_df.copy()
shapes = [
    entry.shape if isinstance(entry, np.ndarray) else None
    for entry in clone_df['Harmonized']
]
clone_df['Harmonized_Shape'] = shapes
clone_df = clone_df.drop(columns=['Harmonized', 'Connectome', 'Data', 'EVC', 'CLU', 'DIV'])

print(clone_df.to_markdown())

In [None]:
# Drop all but significant nodes from the graph theory metrics
# WRITES IN PLACE!
# print(data_df['EVC'][0].shape)

# indices_to_keep = [1, 4, 77, 80, 114, 160]

# def filter_array(arr):
#     return arr[indices_to_keep]

# data_df["EVC"] = data_df["EVC"].apply(lambda x: filter_array(x) if x is not None else None)
# data_df["CLU"] = data_df["CLU"].apply(lambda x: filter_array(x) if x is not None else None)
# data_df["DIV"] = data_df["DIV"].apply(lambda x: filter_array(x) if x is not None else None)

# print(data_df['EVC'][0].shape)

In [None]:
# Save the assembled DataFrame as a pickle file; the path is relative to the
# current working directory.
data_df.to_pickle('./data/data.pkl')