In [1]:
import datetime
import glob
import os

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cross_decomposition import CCA
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

import CCAtools

In [2]:
### locations of the MATLAB toolboxes that are added to the MATLAB path below
path2PermCCA = "PermCCA/"  ### https://github.com/andersonwinkler/PermCCA
### note: this environment requirement is being phased out ^^
palmPath = "palm-alpha119_2/"  ### path to PALM
### see https://github.com/andersonwinkler/PALM

In [None]:
import matlab.engine as mlab_eng
import matlab

### start a MATLAB session and put both toolboxes on its search path;
### `eng` is the engine handle used by the preprocessing helpers further down
eng = mlab_eng.start_matlab()
eng.addpath(path2PermCCA)
eng.addpath(palmPath)

In [4]:
from CCAtools.preprocessing import (
    prep_confounds,
    cube_root,
    gauss_SM,
    zscore,
    normal_eqn_python,
)

In [5]:
### set paths (all three are placeholders and must be filled in before running)
restricted_hcp_path = ""  ## path to your copy of the HCP restricted data (read with pd.read_csv)
hcp_behavior_data = ""  ### path to your copy of the HCP behavioral data (read with pd.read_csv)
data_dir = ""  ### directory holding the data used in/throughout the analysis

In [6]:
#### load behavioral data
### this is HCP restricted data. for access see
### https://www.humanconnectome.org/study/hcp-young-adult/document/restricted-data-usage
RestrictedData = pd.read_csv(restricted_hcp_path, index_col="Subject")
### load non-restricted hcp behavioral data
BehData = pd.read_csv(hcp_behavior_data, index_col="Subject")
### merge the dataframes on the shared "Subject" index and switch to string subject IDs
fullData = BehData.merge(RestrictedData, on="Subject")
fullData.index = fullData.index.map(str)
### load in square_root of total hemisphere surface area for each subject
### os.path.join works whether or not data_dir ends with a path separator
### the single data row (label 0) of each file is renamed after its hemisphere
Larea = pd.read_csv(os.path.join(data_dir, "LareaFactors.csv")).rename(
    index={0: "Larea"}
)
Rarea = pd.read_csv(os.path.join(data_dir, "RareaFactors.csv")).rename(
    index={0: "Rarea"}
)
### transpose so the former column labels become the "Subject" index
area = pd.concat([Larea, Rarea]).T
area.index.names = ["Subject"]
### keep only subjects present in both the area table and the behavioral data
fullData = area.join(fullData, on="Subject", how="inner")

In [7]:
#### Load Distances
### the unnamed first CSV column is renamed to "Subject" and becomes the index;
### os.path.join keeps the paths valid whether or not data_dir ends with a separator
LSchaeferDist = (
    pd.read_csv(os.path.join(data_dir, "Schaefer400Geodesic.L.mat.csv"))
    .rename(columns={"Unnamed: 0": "Subject"})
    .set_index("Subject")
)

RSchaeferDist = (
    pd.read_csv(os.path.join(data_dir, "Schaefer400Geodesic.R.mat.csv"))
    .rename(columns={"Unnamed: 0": "Subject"})
    .set_index("Subject")
)
### left and right hemisphere columns side by side, aligned on subject
SchaeferDist = pd.concat([LSchaeferDist, RSchaeferDist], axis=1)
### string subject IDs, matching the index of fullData
SchaeferDist.index = SchaeferDist.index.map(str)

In [8]:
#### remove a few subjects that don't have all the data to generate the right permutations
excluded_subjects = ["168240", "376247"]
### reassign instead of dropping in place so the cell can be re-run safely;
### errors="ignore" skips IDs that are already absent
SchaeferDist = SchaeferDist.drop(index=excluded_subjects, errors="ignore")
fullData = fullData.drop(index=excluded_subjects, errors="ignore")
area = area.drop(index=excluded_subjects, errors="ignore")
### Larea / Rarea hold the subjects as columns: dropping from `Larea.T` in place
### would only change a temporary transposed copy, so drop the columns directly
Larea = Larea.drop(columns=excluded_subjects, errors="ignore")
Rarea = Rarea.drop(columns=excluded_subjects, errors="ignore")

In [9]:
##### preprocessing functions we'll add to a package later
from CCAtools.preprocessing import zscore


def prep_confounds_local(confs, eng):
    """Set the confounds up with gaussianization and normalization as done by Smith et al. 2015.

    Parameters
    ----------
    confs : pandas.DataFrame
        Confound table; its values are sent to MATLAB as a double matrix.
    eng : MATLAB engine
        Running engine whose path contains PALM (``palm_inormal`` is called on it).

    Returns
    -------
    array
        The gaussianized confounds stacked column-wise with the squares of all
        but the first gaussianized column, passed through ``zscore(..., ax=0)``.

    Raises
    ------
    AssertionError
        If "palm" does not appear in the MATLAB path.
    """
    # palm_inormal lives in PALM, so that is the toolbox that has to be on the path
    assert "palm" in eng.path(), "add PALM to your matlab path"
    ### they actually included the binary acquisition data in the gaussianization
    mat_data = matlab.double(confs.values.tolist())
    print("gaussianizing")
    gaussed = np.asarray(eng.palm_inormal(mat_data))
    # squared terms for every column except the first one
    squared = gaussed[:, 1:] ** 2
    return zscore(np.hstack([gaussed, squared]), ax=0)


def set_confounds(data, mlab_eng):
    """Extract and preprocess the confounds to be regressed from a table of HCP variables.

    ``Acquisition`` and ``Gender`` are label-encoded; the encoded acquisition is
    then binarized (codes 0 and 1 become 0, every other code becomes 1). The two
    FreeSurfer volume columns are mapped through ``cube_root`` and the two
    hemisphere area columns through ``np.sqrt``. The resulting table is handed to
    ``prep_confounds_local`` together with the MATLAB engine, and that function's
    result is returned.
    """
    prepared = data.copy()

    acquisition = LabelEncoder().fit_transform(data["Acquisition"])
    acquisition[acquisition < 2] = 0
    acquisition[acquisition > 0] = 1
    prepared["Acquisition"] = acquisition
    prepared["Gender"] = LabelEncoder().fit_transform(data["Gender"])

    for volume_column in ("FS_IntraCranial_Vol", "FS_BrainSeg_Vol"):
        prepared[volume_column] = data[volume_column].map(cube_root)
    for area_column in ("Larea", "Rarea"):
        prepared[area_column] = data[area_column].map(np.sqrt)

    return prep_confounds_local(prepared, mlab_eng)


def preprocess_SM(data, confs, mlab_eng):
    """Preprocess the subject measures: gaussianize, remove confounds, z-score.

    ``data`` is gaussianized with ``gauss_SM`` (using the MATLAB engine), the
    confounds ``confs`` are regressed out with ``normal_eqn_python`` and the
    residuals are z-scored. The result is returned as a DataFrame carrying the
    index and columns of ``data``. Asserts that "palm" is on the MATLAB path.
    """
    assert "palm" in mlab_eng.path(), "add PermCCA to your matlab path"
    deconfounded = normal_eqn_python(confs, gauss_SM(data, mlab_eng))
    return pd.DataFrame(
        zscore(deconfounded), index=data.index, columns=data.columns
    )


def preprocessDists(data, confounds):
    """Demean, globally scale and deconfound the distance features.

    Columns whose sum is exactly zero are treated as masked: they are left out
    of the processing and come back as all-zero columns. The remaining columns
    are demeaned per column, divided by the standard deviation of all remaining
    values together, and the confounds are regressed out with
    ``normal_eqn_python``.

    Parameters
    ----------
    data : pandas.DataFrame
        Subjects x distance features.
    confounds : array-like
        Preprocessed confounds passed to ``normal_eqn_python``.

    Returns
    -------
    pandas.DataFrame
        Same shape, index and column labels as ``data``.
    """
    NET = data.copy()
    ##### check for columns summing to zero i.e guaranteed masks
    # explicit axis=0: np.sum(DataFrame) relies on the deprecated axis=None reduction
    column_sums = NET.sum(axis=0).to_numpy()
    valididx = np.flatnonzero(column_sums != 0)
    has_masked = valididx.size != column_sums.size

    if has_masked:
        NET = NET.iloc[:, valididx]

    NET1 = NET - NET.mean(axis=0)
    # one global scale factor, so relative column variances are preserved
    NET1 = NET1 / np.nanstd(NET1.values.flatten())
    NET1 = normal_eqn_python(confounds, NET1)
    NET1 = pd.DataFrame(NET1, columns=NET.columns, index=data.index)

    if has_masked:
        # put the processed columns back in place; masked columns stay at zero
        out = np.zeros(data.shape)
        out[:, valididx] = NET1.values
        # keep the original column labels so both branches return the same layout
        NET1 = pd.DataFrame(out, index=data.index, columns=data.columns)

    return NET1


def clean_data(subjectList, all_confs, all_SM, all_dist, mlab=eng):
    """Remove confounds for one group of subjects.

    Always done independently per group to avoid leakage. The rows of
    ``subjectList`` are selected from each table; the confounds are prepared
    with ``set_confounds`` and then regressed from the behavioral measures
    (``preprocess_SM``) and from the distance measures (``preprocessDists``).

    Returns the tuple ``(confs, behavior, distance)``.
    """
    confs = set_confounds(all_confs.loc[subjectList], mlab)
    behavior = preprocess_SM(all_SM.loc[subjectList], confs, mlab)
    distance = preprocessDists(all_dist.loc[subjectList], confs)
    return confs, behavior, distance

#### Extract confounds and behaviors of interest from raw data

In [10]:
### set up behavioral / confound data
confound_columns = [
    "Acquisition",
    "Gender",
    "Age_in_Yrs",
    "Height",
    "Weight",
    "BPSystolic",
    "BPDiastolic",
    "FS_IntraCranial_Vol",
    "FS_BrainSeg_Vol",
    "Larea",
    "Rarea",
]
conf_categories = fullData[confound_columns]
SensoryTasks = ["ReadEng_Unadj"]
subjSM = fullData[SensoryTasks]

### subjects with no missing value in any confound or behavioral column,
### written to disk as the inclusion list
full_subjList = conf_categories.merge(subjSM, on="Subject").dropna().index
full_subjList.to_series().to_csv("SubjectInclusionList.csv")

### restrict every table to the included subjects
conf_categories, subjSM, complete_data = (
    frame.loc[full_subjList] for frame in (conf_categories, subjSM, fullData)
)

### Correlation calculation for subsequent feature selection
In this step we show an example of how the correlations between the brain data (distance or FC) and the behavioural measure are calculated within the training set of each cross-validation fold.


In [11]:
from scipy.stats import spearmanr
from multiprocessing import Pool
import multiprocessing
import numpy as np
import pandas as pd


def corr_single_column(column_data, beh_data, behavior):
    """Spearman correlation between one brain-data column and one behavior.

    Takes the column's values directly (rather than a column index plus the
    whole DataFrame) and looks the behavior up as ``beh_data[behavior]``.

    Returns the pair ``(correlation, p_value)``.
    """
    target = beh_data[behavior]
    rho, p_value = spearmanr(column_data, target)
    return rho, p_value


# Parallel version of corrDist2BEH that's more memory-efficient
def corrDist2BEH_parallel(dist, beh_data, behavior):
    """Spearman-correlate every column of ``dist`` with one behavior, in parallel.

    Parameters
    ----------
    dist : pandas.DataFrame
        Brain data; each column is correlated separately.
    beh_data : mapping or pandas.DataFrame
        Behavioral data, indexed as ``beh_data[behavior]``.
    behavior : str
        Name of the behavior to correlate against.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Correlation coefficients and p-values, one entry per column of ``dist``
        (two empty arrays when ``dist`` has no columns).
    """
    n_columns = dist.shape[1]
    if n_columns == 0:
        # nothing to correlate; avoids starting a pool and unpacking an empty result
        return np.asarray([]), np.asarray([])

    # Every task is pickled and sent to a worker, so ship only the one behavior
    # that is needed instead of the whole behavioral table.
    target = {behavior: np.asarray(beh_data[behavior])}

    # Pass the data of each column directly rather than the DataFrame plus an index.
    tasks = [(dist.iloc[:, i].values, target, behavior) for i in range(n_columns)]

    # One worker per available core
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        results = pool.starmap(corr_single_column, tasks)

    # Separate the (r, p) pairs into correlation coefficients and p-values
    cr, p_vals = zip(*results)

    return np.asarray(cr), np.asarray(p_vals)

In [12]:
### confounds are removed within each training fold on both the brain and behaviour data
def clean_folds(cv_folds):
    """Clean the training and testing subjects of every cross-validation fold.

    Parameters
    ----------
    cv_folds : dict
        Maps a fold name to a dict with "training" and "testing" subject lists.

    Returns
    -------
    (dict, dict)
        Training and testing data keyed by fold name; each value is
        ``{"Dist": cleaned distances, "Beh": cleaned behavior}``.

    Reads the notebook-level ``conf_categories``, ``subjSM`` and ``SchaeferDist``.
    """
    training_data = {}
    testing_data = {}
    for fold in cv_folds:
        print(fold)
        # index the argument itself, not a notebook-level `folds` variable
        split = cv_folds[fold]

        ### training
        _, train_behavior, train_distance = clean_data(
            split["training"], conf_categories, subjSM, SchaeferDist
        )
        training_data[fold] = {"Dist": train_distance, "Beh": train_behavior}

        ### testing (cleaned separately from the training subjects)
        _, test_behavior, test_distance = clean_data(
            split["testing"], conf_categories, subjSM, SchaeferDist
        )
        testing_data[fold] = {"Dist": test_distance, "Beh": test_behavior}
    return training_data, testing_data

In [13]:
import json

# load the cross validation folds from nb1 as a dictionary;
# the context manager closes the file once the JSON has been parsed
with open("cross_validation_folds.json") as f:
    cv_folds = json.load(f)

In [14]:
fold_json = list(cv_folds.keys())[0]
subjects = cv_folds[fold_json]["training"]

### clean data for feature selection via spearman correlation
### (done once: the cleaned data do not depend on the task being correlated)
_, beh, dist = clean_data(subjects, conf_categories, subjSM, SchaeferDist)

### run the spearman correlation on this fold of the training data
### one row per task and one column per distance feature, sized from the data
corrs = np.zeros([len(subjSM.columns), dist.shape[1]])
p_vals = np.zeros_like(corrs)
for idx, task in enumerate(subjSM.columns):
    rho, p = corrDist2BEH_parallel(dist, beh, task)
    corrs[idx, :] = rho
    p_vals[idx, :] = p
corrs = pd.DataFrame(corrs, index=beh.columns, columns=dist.columns)
p_vals = pd.DataFrame(p_vals, index=beh.columns, columns=dist.columns)
### part of the fold key that follows its first "d"
out_int = fold_json.split("d")[1]

gaussianizing


Note: this notebook only demonstrates the setup on a single fold.
In practice we run this step in parallel on the cluster using a script.

The associated scripts are run_dist_corr_cpm_cluster.py and slurm_cpm.sh

## Feature selection
We'll use the correlations of brain data and reading now to extract our features

In [15]:
### return a set of edges present across folds by percentage.
### 100 would return only edges / distances present across all folds
def edges_in_threshold(EdgesPassingByTask, tasks_passing, threshold_percentage):
    """
    Find edges for each task present in at least threshold_percentage of folds.

    An edge counts at most once per fold, even if it is listed several times in
    that fold, and tasks that are not in tasks_passing are ignored.

    :param EdgesPassingByTask: Dictionary with fold keys and task-edge lists as values.
    :param tasks_passing: List of tasks to consider.
    :param threshold_percentage: The minimum percentage of folds an edge must appear in to be included.
    :return: A dictionary with tasks as keys and lists of edges meeting the threshold as values
             (edges in order of first appearance).
    """
    num_folds = len(EdgesPassingByTask)
    threshold_folds = threshold_percentage * num_folds / 100

    # Per task: edge -> number of folds in which the edge appears
    edge_presence_count = {task: {} for task in tasks_passing}

    for tasks_edges in EdgesPassingByTask.values():
        for task, edges in tasks_edges.items():
            counts = edge_presence_count.get(task)
            if counts is None:
                # task was not requested
                continue
            # dict.fromkeys removes repeats within the fold while keeping order
            for edge in dict.fromkeys(edges):
                counts[edge] = counts.get(edge, 0) + 1

    # Keep the edges that meet or exceed the fold threshold
    return {
        task: [edge for edge, count in counts.items() if count >= threshold_folds]
        for task, counts in edge_presence_count.items()
    }

### Threshold by Pvals and extract edges

In [16]:
def _fdr_bh_reject(pvals, alpha=0.05):
    """Benjamini-Hochberg step-up procedure: boolean mask of the rejected p-values."""
    pvals = np.asarray(pvals, dtype=float)
    n_tests = pvals.size
    reject = np.zeros(n_tests, dtype=bool)
    if n_tests == 0:
        return reject
    order = np.argsort(pvals)
    # the k-th smallest p-value is compared against alpha * k / n
    below = pvals[order] <= alpha * np.arange(1, n_tests + 1) / n_tests
    if below.any():
        # step-up: reject everything up to the largest p-value under its cut-off
        last = np.nonzero(below)[0].max()
        reject[order[: last + 1]] = True
    return reject


def thresholdData(df, thr=0.01):
    """Apply a significance threshold to a table of distance-behavior p-values.

    Parameters
    ----------
    df : pandas.DataFrame
        p-values, one row per task.
    thr : float or "fdr"
        A number keeps the entries strictly below it (default 0.01). Passing
        "fdr" runs a Benjamini-Hochberg multiple comparisons correction at
        alpha=0.05 separately on each row instead.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with the index and columns of ``df``; True where the
        entry passes.
    """
    if thr != "fdr":
        return df < thr

    # BH is implemented with numpy here because no multiple-testing helper is imported
    values = df.to_numpy(dtype=float)
    passed = np.zeros(values.shape, dtype=bool)
    for row_number, row in enumerate(values):
        passed[row_number] = _fdr_bh_reject(row, alpha=0.05)
    return pd.DataFrame(passed, index=df.index, columns=df.columns)

In [17]:
#### extract all data points that pass the significance threshold (0.01 by default)
def get_sig_corrs(pvaList, thr=0.01, n_required=100):
    """Return the edges whose p-values pass the threshold in the required number of rows.

    Parameters
    ----------
    pvaList : iterable of str
        Paths to CSV files of p-values (first column is the row index).
    thr : float or "fdr"
        Threshold forwarded to ``thresholdData``.
    n_required : int or None
        Number of rows, over all files stacked together, in which an edge has
        to pass. The default of 100 matches the previously hard-coded value;
        ``None`` means "every loaded row".

    Returns
    -------
    list
        Column labels of the edges passing in exactly ``n_required`` rows.

    Raises
    ------
    ValueError
        If ``pvaList`` is empty.
    """
    paths = list(pvaList)
    if not paths:
        raise ValueError("pvaList is empty: no p-value files to load")

    # Read each CSV file and apply the thresholding function
    dataframes = [thresholdData(pd.read_csv(path, index_col=0), thr) for path in paths]

    # Stack the boolean frames row-wise
    p_fold = pd.concat(dataframes, axis=0)
    if n_required is None:
        n_required = len(p_fold)

    # Number of rows in which each edge passed
    n_passing = p_fold.sum(axis=0)
    sig_edges = list(p_fold.columns[np.where(n_passing == n_required)[0]])
    return sig_edges


def consistent_signage(corrList, edgeList):
    """Split edges into positively and negatively correlated ones across files.

    Parameters
    ----------
    corrList : iterable of str
        Paths to CSV files of correlations (first column is the row index).
    edgeList : list
        Column labels of the edges to classify.

    Returns
    -------
    (list, list)
        Edges whose summed correlation signs over all rows are positive, and
        edges whose summed signs are negative. An edge whose signs cancel out
        exactly (sum of 0) is in neither list.
    """
    # Stack the correlation tables row-wise
    r_fold = pd.concat(
        [pd.read_csv(path, index_col=0) for path in corrList], axis=0
    )

    # +1 / -1 per row, summed per edge
    sign_balance = np.sign(r_fold[edgeList]).sum(axis=0)

    # cut-offs symmetric around zero, so a sum of 0 or 1 is not counted as negative
    pos_dists = list(sign_balance[sign_balance > 0].index)
    neg_dists = list(sign_balance[sign_balance < 0].index)
    return pos_dists, neg_dists

In [None]:
### precomputed per-fold p-value / correlation CSVs for the distance features;
### glob returns an empty list when nothing matches (a captured `ls` would
### return its error text instead), and sorting gives a fixed file order
prebakedPvalsDist = sorted(
    glob.glob(os.path.join(data_dir, "distCPMfeaturesReading", "*pval*csv"))
)
prebakedCorrsDist = sorted(
    glob.glob(os.path.join(data_dir, "distCPMfeaturesReading", "*corr*csv"))
)
p_passed_EdgesDist = get_sig_corrs(prebakedPvalsDist)
positiveDistances, negativeDistances = consistent_signage(
    prebakedCorrsDist, p_passed_EdgesDist
)

In [None]:
### save the outputs to a json we can use in the next notebook
### one file per sign, each holding the edges under the "ReadEng_Unadj" key
for filename, distances in (
    ("positive_distances.json", positiveDistances),
    ("negativeDistances.json", negativeDistances),
):
    # Write the JSON data to a file
    with open(filename, "w") as f:
        json.dump({"ReadEng_Unadj": distances}, f, indent=4)

In [None]:
### precomputed per-fold p-value / correlation CSVs for the FC features;
### glob returns an empty list when nothing matches (a captured `ls` would
### return its error text instead), and sorting gives a fixed file order
prebakedPvalsFC = sorted(
    glob.glob(os.path.join(data_dir, "funcCPMfeaturesReading", "*pval*csv"))
)
prebakedCorrsFC = sorted(
    glob.glob(os.path.join(data_dir, "funcCPMfeaturesReading", "*corr*csv"))
)
p_passed_EdgesFC = get_sig_corrs(prebakedPvalsFC)
positiveFC, negativeFC = consistent_signage(prebakedCorrsFC, p_passed_EdgesFC)

In [None]:
### save the outputs to a json we can use in the next notebook
### one file per sign, each holding the edges under the "ReadEng_Unadj" key
for filename, fc_edges in (
    ("positiveFC.json", positiveFC),
    ("negativeFC.json", negativeFC),
):
    # Write the JSON data to a file
    with open(filename, "w") as f:
        json.dump({"ReadEng_Unadj": fc_edges}, f, indent=4)

### Finished
Move to notebook 3