In [1]:
import pandas as pd
import numpy as np
import datetime
from sklearn.cross_decomposition import CCA
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import CCAtools
import seaborn as sns

In [2]:
# Location of the PermCCA MATLAB code; this directory is added to the MATLAB search path below.
path2PermCCA = "PermCCA/"  ### https://github.com/andersonwinkler/PermCCA
### note environment requirement being phased out ^^
# Location of PALM; this directory is also added to the MATLAB search path below.
palmPath = "palm-alpha119_2/"  ### path to palm
### see https://github.com/andersonwinkler/PALM

In [None]:
# MATLAB Engine API for Python.
# NOTE(review): presumably requires a local MATLAB installation — confirm for your environment.
import matlab.engine as mlab_eng
import matlab

# Start a MATLAB session and put the PermCCA and PALM directories on its search path.
eng = mlab_eng.start_matlab()
eng.addpath(path2PermCCA)
eng.addpath(palmPath)

In [4]:
from CCAtools.preprocessing import (
    prep_confounds,
    cube_root,
    gauss_SM,
    zscore,
    normal_eqn_python,
)

In [5]:
### set paths
### NOTE: all three are empty placeholders and must be filled in before the loading cell runs.
### restricted_hcp_path / hcp_behavior_data are passed straight to pd.read_csv.
### data_dir is prefixed directly to file names (f"{data_dir}LareaFactors.csv"),
### so it needs a trailing path separator when it is not empty.
restricted_hcp_path = ""  ## set path to your copy of hcp restricted data
hcp_behavior_data = ""  ### set path to your copy of hcp behavioral data
data_dir = ""  ### set to directory for data used in/throughout analysis

In [6]:
#### Load the behavioral tables.
### The restricted table is HCP restricted data; for access see
### https://www.humanconnectome.org/study/hcp-young-adult/document/restricted-data-usage
RestrictedData = pd.read_csv(restricted_hcp_path, index_col="Subject")
### Non-restricted HCP behavioral table.
BehData = pd.read_csv(hcp_behavior_data, index_col="Subject")
### Merge the two tables on the subject ID and use string IDs as the index.
fullData = BehData.merge(RestrictedData, on="Subject")
fullData.index = fullData.index.map(str)
### Square root of total hemisphere surface area for each subject.
### Row 0 of each file is relabelled with the hemisphere name so that, after
### stacking and transposing, "Larea" and "Rarea" become column names.
Larea = pd.read_csv(f"{data_dir}LareaFactors.csv").rename(index={0: "Larea"})
Rarea = pd.read_csv(f"{data_dir}RareaFactors.csv").rename(index={0: "Rarea"})
### NOTE(review): the CSV column labels are presumably subject IDs — confirm against the files.
area = pd.concat([Larea, Rarea]).T
area.index.names = ["Subject"]
### Inner join: keep only subjects present in both the area table and the behavioral data.
fullData = area.join(fullData, on="Subject", how="inner")

In [7]:
#### Remove subjects which throw off permutations in later steps.
### see: doi.org/10.1016/j.neuroimage.2015.05.092
excluded_subjects = ["168240", "376247"]

### Reassign instead of dropping in place: the cell can then be re-run without a
### KeyError (errors="ignore" makes an already-removed subject a no-op).
fullData = fullData.drop(index=excluded_subjects, errors="ignore")
area = area.drop(index=excluded_subjects, errors="ignore")

### Larea / Rarea hold subjects as columns. The previous `Larea.T.drop(..., inplace=True)`
### only modified a temporary transposed frame and left Larea / Rarea unchanged, so the
### subjects are dropped from the columns directly here.
Larea = Larea.drop(columns=excluded_subjects, errors="ignore")
Rarea = Rarea.drop(columns=excluded_subjects, errors="ignore")

In [8]:
### Columns treated as confounds.
confound_columns = [
    "Acquisition",
    "Gender",
    "Age_in_Yrs",
    "Height",
    "Weight",
    "BPSystolic",
    "BPDiastolic",
    "FS_IntraCranial_Vol",
    "FS_BrainSeg_Vol",
    "Larea",
    "Rarea",
]
conf_categories = fullData[confound_columns]

### Task / subject-measure columns.
Tasks = [
    "CardSort_Unadj",
    "DDisc_AUC_200",
    "DDisc_AUC_40K",
    "Dexterity_Unadj",
    "ER40ANG",
    "ER40FEAR",
    "ER40NOE",
    "ER40SAD",
    "ER40_CR",
    "Flanker_Unadj",
    "IWRD_TOT",
    "ListSort_Unadj",
    "MMSE_Score",
    "Mars_Final",
    "Odor_Unadj",
    "PMAT24_A_CR",
    "PicSeq_Unadj",
    "PicVocab_Unadj",
    "ProcSpeed_Unadj",
    "ReadEng_Unadj",
    "SCPT_SEN",
    "SCPT_SPEC",
    "Taste_Unadj",
    "VSPLOT_CRTE",
    "Noise_Comp",
    "EVA_Denom",
]

TaskOfInterest = ["ReadEng_Unadj"]
subjSM = fullData[Tasks]

### Subjects with a NaN in any confound or task column are dropped.
### This was to ensure we only used well sampled subjects on all facets
### and could use them for later studies.
complete_rows = conf_categories.merge(subjSM, on="Subject").dropna()
full_subjList = complete_rows.index
full_subjList.to_series().to_csv("SubjectInclusionList.csv")

### Restrict every table to the retained subjects.
conf_categories, subjSM, complete_data = (
    frame.loc[full_subjList] for frame in (conf_categories, subjSM, fullData)
)

### set up the cross validation groups

In [9]:
def create_cross_validation_groups(groups, groupedData):
    """
    Build the training/testing subject lists for every cross-validation fold.

    Each entry of ``groups`` is one fold, given as a collection of group keys
    (e.g. family IDs). The subjects of a fold are the index labels of the rows
    that ``groupedData`` holds for those keys. For fold ``i`` the testing set is
    the subjects of ``groups[i]`` and the training set is the subjects of every
    other fold, in fold order.

    :param groups: A list of lists (or arrays), where each sub-list contains the group
        keys assigned to one fold. A sub-list may be empty.
    :param groupedData: A pandas GroupBy object keyed by the values used in ``groups``;
        ``groupedData.get_group(key).index`` must yield the subject labels.
    :return: A dictionary with keys ``"fold1"``, ``"fold2"``, ... whose values are
        dictionaries with ``"training"`` and ``"testing"`` lists of subject labels.
    :raises KeyError: If a key in ``groups`` is not a group of ``groupedData``.
    """
    # Resolve every fold's subjects once; each group is looked up a single time
    # instead of once per fold. An empty fold simply yields an empty list
    # (pd.concat on an empty list would raise ValueError).
    fold_subjects = []
    for families in groups:
        subjects = []
        for fam in families:
            subjects.extend(groupedData.get_group(fam).index)
        fold_subjects.append(subjects)

    cv_folds = {}
    for i, test_set in enumerate(fold_subjects):
        # Training set = subjects of all other folds, preserving fold order.
        training_set = [
            subject
            for j, subjects in enumerate(fold_subjects)
            if j != i
            for subject in subjects
        ]
        cv_folds[f"fold{i + 1}"] = {
            "training": training_set,
            "testing": list(test_set),
        }
    return cv_folds

In [10]:
# One seed per repetition of the cross-validation split (10 repetitions).
# The first entry (0) is the pass that splits the family IDs without shuffling.
random_seeds = [0, 42, 19, 123, 10, 69, 33, 1, 1234, 9245]

In [11]:
### Create 10 x 10-fold cross validation, i.e. 100 unique training and testing folds.
### Families are kept together: the split is made over family IDs, not subjects.
nfolds = 10

### Unique family IDs (sorted by np.unique) and the subjects grouped by family.
### Both are computed once, so the loop no longer depends on the first seed being 0.
fam_ids = np.unique(complete_data["Family_ID"])
grouped_data = complete_data.groupby("Family_ID")

DataGroups = []
FamilyGroups = []
for repetition, seed in enumerate(random_seeds):
    np.random.seed(seed)
    if repetition == 0:
        print("first iteration use data as is")
    else:
        ### shuffles fam_ids in place (on top of the previous repetition's order)
        np.random.shuffle(fam_ids)
    ### np.array_split returns views into its input. Splitting fam_ids directly
    ### would let every later in-place shuffle rewrite the groups stored for earlier
    ### repetitions, leaving all repetitions with the same (final) partition.
    ### Splitting a copy gives each repetition its own snapshot.
    family_groups = np.array_split(fam_ids.copy(), nfolds)
    DataGroups.append(grouped_data)
    FamilyGroups.append(family_groups)

first iteration use data as is


In [12]:
### Build the training/testing folds of every repetition,
### keyed "fold set 1", "fold set 2", ... in repetition order.
fold_Sets = {
    f"fold set {set_number}": create_cross_validation_groups(fold_families, by_family)
    for set_number, (fold_families, by_family) in enumerate(
        zip(FamilyGroups, DataGroups), start=1
    )
}

In [13]:
import json

In [14]:
# Flatten the nested {fold set -> {fold -> data}} mapping into one dictionary.
# Folds are visited set by set, in insertion order, and renumbered consecutively
# as "fold1", "fold2", ... across all fold sets.
folds_in_order = (
    fold_data for folds in fold_Sets.values() for fold_data in folds.values()
)
all_folds = {
    f"fold{fold_number}": fold_data
    for fold_number, fold_data in enumerate(folds_in_order, start=1)
}

# all_folds now holds every fold of every fold set in a single dictionary.
# Save it as indented JSON.
filename = "cross_validation_folds.json"
with open(filename, "w") as json_file:
    json.dump(all_folds, json_file, indent=4)

# Finished 
Continue to notebook 2