In [1]:
import os
os.chdir("../..")  # Move one level up  
print(os.getcwd())

/Users/malthepabst/Documents/GitHub/Thesis_Neurodiversity


In [174]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scripts.eda_func import plot_small_multiple_rois, test_dist, group_roi_heat_map

In [175]:
pd.options.mode.chained_assignment = None
meta_data = pd.read_csv(f'data.nosync/phenotypic/subjects_with_meta_{7}.csv',
                         index_col= 'Unnamed: 0')
meta_data['Sub ID'] = meta_data['Sub ID'].apply(lambda x: str(x).zfill(7))

#Get participants and their meta data
participants = meta_data.drop_duplicates(subset=['Sub ID', 'Dataset'])
participants['Co-Diagnosis'] = participants['Co-Diagnosis'].replace({np.nan: ''})
participants['Co-Diagnosis'] = participants['Co-Diagnosis'].apply(lambda x: '-'+x if x != '' else x)
participants['Diagnosis'] = participants['Diagnosis'] + participants['Co-Diagnosis']
participants['Diagnosis'] = participants['Diagnosis'].replace({'ASD-Other': 'ASD', 
                                                               'ADHD-Other': 'ADHD',
                                                               'TD-Other':'TD'})

to_remove = pd.read_csv('data.nosync/phenotypic/missing_metadata.csv')
to_remove = pd.concat([to_remove, pd.read_csv('data.nosync/stats/head_movement/motion_summary_all_subjects.csv')])
to_remove = to_remove[['Sub ID', 'Dataset', 'Exclude']]
to_remove = to_remove[to_remove['Exclude'] == True]
to_remove = to_remove.drop_duplicates(['Sub ID', 'Dataset'])
to_remove['Sub ID'] = to_remove['Sub ID'].apply(lambda x: str(x).zfill(7))

#Remove and select columns
participants['temp'] = list(zip(participants['Sub ID'],participants['Dataset']))
participants = participants[~participants['temp'].isin(list(zip(to_remove['Sub ID'],to_remove['Dataset'])))]
participants = participants[['Sub ID', 'Dataset', 'Age', 'Sex', 'IQ', 'Diagnosis']]


In [198]:
#Create bins
age_median = np.quantile(participants['Age'], 0.50)
iq_mean = 100

stratify_base = participants[['Age', 'Sex', 'IQ', 'Diagnosis', 'Sub ID', 'Dataset']].copy()
stratify_base['Age'] = stratify_base['Age'].apply(lambda x: 'q2' if x >= age_median
                                                            else 'q1') 
stratify_base['IQ'] = stratify_base['IQ'].apply(lambda x: 'q2' if x >= iq_mean
                                                            else 'q1')


train = []
test = []
val = []

priority = ['Diagnosis', 'Sex', 'Age', 'IQ']

for i in range(0,4):
    if i == 0:
        temp_prio = ['Diagnosis', 'Sex', 'Age', 'IQ']
    else:
        temp_prio = priority[:-i]
    
    #Get the groups and their count, and validation set size
    stratify_groups = stratify_base[temp_prio].groupby(temp_prio).value_counts().reset_index()
    stratify_groups['group'] = [i for i in range(len(stratify_groups))]
    stratify_groups['val_size'] = round(stratify_groups['count']*0.15,0)

    #Get those that can be stratified
    ready =  stratify_base.merge(stratify_groups[stratify_groups['count'] >= 3], 
                                 on=temp_prio, 
                                 how='inner')
    
    #Remove those that can be stratified
    stratify_base = stratify_base.merge(stratify_groups[stratify_groups['count'] < 3], 
                                    on=temp_prio, 
                                    how='inner')

    stratify_base = stratify_base[priority + ['Sub ID', 'Dataset']]
    
    #If less than three participants, stop and add to train
    if len(stratify_base) < 3:
        train.append(stratify_base)
        break
    
    #Make the split for the ready groups
    for idx, row in stratify_groups[stratify_groups['count'] >= 3].iterrows():
        ready_group = ready[ready['group'] == row['group']].reset_index()
        #Make index lists
        train_idx, test_idx, val_idx = [], [], []
        train_count, test_count, val_count = 0, 0, 0

        #Calculate test/val size
        test_size, val_size = round(len(ready_group)*0.15), round(len(ready_group)*0.15)
        
        for i in ready_group.index.to_list():
            if test_count < test_size:
                test_idx.append(i)
                test_count += 1
            elif val_count < val_size:
                val_idx.append(i)
                val_count += 1
            else:
                train_idx.append(i)
                train_count += 1
        train.append(ready_group[ready_group.index.isin(train_idx)])
        test.append(ready_group[ready_group.index.isin(test_idx)])
        val.append(ready_group[ready_group.index.isin(val_idx)])


In [199]:
train = pd.concat(train)[['Diagnosis', 'Sex', 'Age', 'IQ','Sub ID', 'Dataset']]
train_id = list(zip(train['Sub ID'],train['Dataset']))

val = pd.concat(val)[['Diagnosis', 'Sex', 'Age', 'IQ','Sub ID', 'Dataset']]
val_id = list(zip(val['Sub ID'],val['Dataset']))

test = pd.concat(test)[['Diagnosis', 'Sex', 'Age', 'IQ','Sub ID', 'Dataset']]
test_id = list(zip(test['Sub ID'],test['Dataset']))

In [200]:
nx_files = os.listdir('data.nosync/networks_multi')
if ".DS_Store" in nx_files:
    nx_files.remove(".DS_Store")
nx_files = pd.DataFrame(nx_files, columns=['file'])
nx_files['Sub ID'] = nx_files['file'].apply(lambda x: x.split('_')[0].zfill(7))
nx_files['Dataset'] = nx_files['file'].apply(lambda x: x.split('_')[2])
nx_files['temp'] = list(zip(nx_files['Sub ID'],nx_files['Dataset']))
nx_files['file'] = 'data.nosync/networks_multi/' + nx_files['file']

In [206]:
#save train
train = nx_files[nx_files['temp'].isin(train_id)]
train['file'].to_csv(f'data.nosync/networks_multi/train_set_files.csv', index= False)
train

Unnamed: 0,file,Sub ID,Dataset,temp
0,data.nosync/networks_multi/1320247_run-1_ADHD2...,1320247,ADHD200,"(1320247, ADHD200)"
1,data.nosync/networks_multi/8415034_run-2_ADHD2...,8415034,ADHD200,"(8415034, ADHD200)"
4,data.nosync/networks_multi/3011311_run-2_ADHD2...,3011311,ADHD200,"(3011311, ADHD200)"
5,data.nosync/networks_multi/0010087_run-2_ADHD2...,0010087,ADHD200,"(0010087, ADHD200)"
6,data.nosync/networks_multi/0010030_run-2_ADHD2...,0010030,ADHD200,"(0010030, ADHD200)"
...,...,...,...,...
649,data.nosync/networks_multi/0010115_run-1_ADHD2...,0010115,ADHD200,"(0010115, ADHD200)"
650,data.nosync/networks_multi/0010086_run-2_ADHD2...,0010086,ADHD200,"(0010086, ADHD200)"
652,data.nosync/networks_multi/1127915_run-1_ADHD2...,1127915,ADHD200,"(1127915, ADHD200)"
653,data.nosync/networks_multi/2136051_run-1_ADHD2...,2136051,ADHD200,"(2136051, ADHD200)"


In [207]:
val = nx_files[nx_files['temp'].isin(val_id)]
val['file'].to_csv(f'data.nosync/networks_multi/val_set_files.csv', index= False)
val

Unnamed: 0,file,Sub ID,Dataset,temp
3,data.nosync/networks_multi/0051038_run-1_ABIDE...,0051038,ABIDEI,"(0051038, ABIDEI)"
8,data.nosync/networks_multi/0051047_run-1_ABIDE...,0051047,ABIDEI,"(0051047, ABIDEI)"
23,data.nosync/networks_multi/0050991_run-1_ABIDE...,0050991,ABIDEI,"(0050991, ABIDEI)"
34,data.nosync/networks_multi/0021025_run-1_ADHD2...,0021025,ADHD200,"(0021025, ADHD200)"
40,data.nosync/networks_multi/0050968_run-1_ABIDE...,0050968,ABIDEI,"(0050968, ABIDEI)"
...,...,...,...,...
630,data.nosync/networks_multi/3441455_run-1_ADHD2...,3441455,ADHD200,"(3441455, ADHD200)"
633,data.nosync/networks_multi/1992284_run-1_ADHD2...,1992284,ADHD200,"(1992284, ADHD200)"
648,data.nosync/networks_multi/0050966_run-1_ABIDE...,0050966,ABIDEI,"(0050966, ABIDEI)"
657,data.nosync/networks_multi/0051012_run-1_ABIDE...,0051012,ABIDEI,"(0051012, ABIDEI)"


In [208]:
test = nx_files[nx_files['temp'].isin(test_id)]
test['file'].to_csv(f'data.nosync/networks_multi/test_set_files.csv', index= False)
test

Unnamed: 0,file,Sub ID,Dataset,temp
2,data.nosync/networks_multi/0021003_run-1_ADHD2...,0021003,ADHD200,"(0021003, ADHD200)"
10,data.nosync/networks_multi/0051086_run-1_ABIDE...,0051086,ABIDEI,"(0051086, ABIDEI)"
15,data.nosync/networks_multi/3433846_run-2_ADHD2...,3433846,ADHD200,"(3433846, ADHD200)"
17,data.nosync/networks_multi/2260910_run-1_ADHD2...,2260910,ADHD200,"(2260910, ADHD200)"
19,data.nosync/networks_multi/5971050_run-1_ADHD2...,5971050,ADHD200,"(5971050, ADHD200)"
...,...,...,...,...
609,data.nosync/networks_multi/0021046_run-1_ADHD2...,0021046,ADHD200,"(0021046, ADHD200)"
644,data.nosync/networks_multi/0021002_run-1_ADHD2...,0021002,ADHD200,"(0021002, ADHD200)"
654,data.nosync/networks_multi/0051030_run-1_ABIDE...,0051030,ABIDEI,"(0051030, ABIDEI)"
655,data.nosync/networks_multi/0029229_run-1_ABIDE...,0029229,ABIDEII,"(0029229, ABIDEII)"
