## NNDSP Subject Data

In [8]:
def nndsp_subjectdata(nndsp_fs_dir, nndsp_pheno_file):
    """Build the NNDSP subject table.

    Every ``sub-*`` entry under ``nndsp_fs_dir`` becomes one row; the text
    after the 4-character ``sub-`` prefix is parsed as the integer MASKID
    and used to join the phenotype CSV.

    Parameters
    ----------
    nndsp_fs_dir : pathlib.Path
        Directory that is globbed for ``sub-*`` entries.
    nndsp_pheno_file : pathlib.Path
        CSV file with (at least) the columns ``MASKID``, ``age_at_scan``
        and ``Sex``.

    Returns
    -------
    pandas.DataFrame
        One row per subject, sorted by ``subject``, with the standardized
        columns ``subject``, ``age`` and ``sex`` (2 where the source value
        is ``'Male'``, 1 for anything else) plus ``subj_paths``.

    Raises
    ------
    ValueError
        If an entry name does not end in an integer after ``sub-``.
    """
    # Materialize the glob once so paths and ids are derived from the same list.
    fs_entries = list(nndsp_fs_dir.glob('sub-*'))
    subjects = pd.DataFrame({
        'subj_paths': [entry.as_posix() for entry in fs_entries],
        'MASKID': [int(entry.name[4:]) for entry in fs_entries],
    })

    # Inner join: only subjects present on disk AND in the phenotype file survive.
    phenotypes = pd.read_csv(nndsp_pheno_file.as_posix())
    merged = subjects.merge(phenotypes, on='MASKID')

    # Standardize column names across datasets (index labels become strings).
    merged = merged.rename(
        index=str,
        columns={'MASKID' : 'subject', 'age_at_scan' : 'age', 'Sex': 'sex'},
    )
    merged['sex'] = [2 if label == 'Male' else 1 for label in merged.sex]

    return (
        merged
        .drop_duplicates(subset='subject', keep='first')
        .sort_values(by='subject')
    )

## HCP Subject Data

In [7]:
def hcp_subjectdata(hcp_fs_dir, hcp_pheno_file, hcp_pheno2_file):
    """Build the HCP subject table.

    Every ``sub-*`` entry under ``hcp_fs_dir`` becomes one row; the text
    after the 4-character ``sub-`` prefix is parsed as the integer Subject
    id and used to join both phenotype CSVs.

    Parameters
    ----------
    hcp_fs_dir : pathlib.Path
        Directory that is globbed for ``sub-*`` entries.
    hcp_pheno_file : pathlib.Path
        CSV joined in full on ``Subject``; its ``Age_in_Yrs`` column is
        renamed to ``age``.
    hcp_pheno2_file : pathlib.Path
        CSV from which only ``Subject`` and ``Gender`` are taken.

    Returns
    -------
    pandas.DataFrame
        One row per subject, sorted by ``subject``, with the standardized
        columns ``subject``, ``age`` and ``sex`` (2 where ``Gender`` is
        ``'M'``, 1 for anything else) plus ``subj_paths``.

    Raises
    ------
    ValueError
        If an entry name does not end in an integer after ``sub-``.
    """
    # Materialize the glob once so paths and ids are derived from the same list.
    fs_entries = list(hcp_fs_dir.glob('sub-*'))
    table = pd.DataFrame({
        'subj_paths': [entry.as_posix() for entry in fs_entries],
        'Subject': [int(entry.name[4:]) for entry in fs_entries],
    })

    # First phenotype file is joined in full (inner join on Subject).
    table = table.merge(pd.read_csv(hcp_pheno_file.as_posix()), on='Subject')

    # Second phenotype file only contributes the Gender column.
    gender_only = pd.read_csv(hcp_pheno2_file.as_posix())[['Subject', 'Gender']]
    table = table.merge(gender_only, on='Subject')
    table['Gender'] = [2 if code == 'M' else 1 for code in table.Gender]

    # Standardize column names across datasets (index labels become strings).
    table = table.rename(
        index=str,
        columns={'Subject' : 'subject', 'Age_in_Yrs' : 'age', 'Gender': 'sex'},
    )
    return (
        table
        .drop_duplicates(subset='subject', keep='first')
        .sort_values(by='subject')
    )

## NKI Subject Data

In [6]:
def nki_subjectdata(nki_fs_dir, nki_pheno_file):
    """Build the NKI subject table.

    Every ``sub-*`` entry under ``nki_fs_dir`` becomes one row; characters
    4 through 12 of the entry name (the 9 characters after ``sub-``) are
    used as the ``participant_id`` that joins the phenotype table.

    Parameters
    ----------
    nki_fs_dir : pathlib.Path
        Directory that is globbed for ``sub-*`` entries.
    nki_pheno_file : pathlib.Path
        Tab-separated file with (at least) the columns ``participant_id``,
        ``age`` and ``sex``.

    Returns
    -------
    pandas.DataFrame
        One row per subject, sorted by ``subject``, with the standardized
        columns ``subject``, ``age`` and ``sex`` (2 where the source value
        is ``'MALE'``, 1 for anything else) plus ``subj_paths``.
    """
    # One row per matching entry; the id is a fixed-width slice of the name,
    # so entries with a trailing suffix map onto the same participant.
    df_nki = pd.DataFrame({'subj_paths': [x.as_posix() for x in nki_fs_dir.glob('sub-*')]})
    df_nki = df_nki.assign(participant_id=[Path(x).name[4:13] for x in df_nki.subj_paths])

    # Inner join: only subjects present on disk AND in the phenotype file survive.
    df_nki = pd.merge(df_nki, pd.read_csv(nki_pheno_file.as_posix(), sep='\t'), on='participant_id')
    df_nki['sex'] = [2 if s == 'MALE' else 1 for s in df_nki.sex]

    # Standardize column names across datasets; 'age' already has the
    # standard name, so only the id column needs renaming.
    df_nki = df_nki.rename(index=str, columns={'participant_id': 'subject'})

    # glob() order is filesystem-dependent. Sort by path (stably, so rows that
    # share a path keep their phenotype-file order) before de-duplicating, so
    # the row kept for a participant with several entries is always the one
    # with the lexicographically smallest path.
    df_nki = df_nki.sort_values(by='subj_paths', kind='mergesort')
    df_nki = df_nki.drop_duplicates(subset='subject', keep='first')
    df_nki = df_nki.sort_values(by='subject')
    return df_nki

## CoRR Subject Data

In [5]:
def corr_subjectdata(corr_fs_dir, corr_pheno_file):
    """Build the CoRR subject table.

    Every ``sub-*`` entry under ``corr_fs_dir`` becomes one row; characters
    4 through 10 of the entry name (the 7 characters after ``sub-``) are
    parsed as the integer ``SUBID`` that joins the phenotype CSV.

    Parameters
    ----------
    corr_fs_dir : pathlib.Path
        Directory that is globbed for ``sub-*`` entries.
    corr_pheno_file : pathlib.Path
        CSV file with (at least) the columns ``SUBID``, ``AGE_AT_SCAN_1``
        and ``SEX``.

    Returns
    -------
    pandas.DataFrame
        One row per subject, sorted by ``subject``, with the standardized
        columns ``subject``, ``age`` (int64) and ``sex`` plus
        ``subj_paths``. ``sex`` is 1 where the source value is numerically
        1 and 2 for everything else. ``age`` is truncated to an integer,
        and values that cannot be parsed as numbers become 0.

    Raises
    ------
    ValueError
        If the 7-character id slice of an entry name is not an integer.
    """
    # One row per matching entry; the id is a fixed-width slice of the name,
    # so entries with a trailing suffix map onto the same subject.
    df_corr = pd.DataFrame({'subj_paths': [x.as_posix() for x in corr_fs_dir.glob('sub-*')]})
    df_corr = df_corr.assign(SUBID=[int(Path(x).name[4:11]) for x in df_corr.subj_paths])

    # Inner join: only subjects present on disk AND in the phenotype file survive.
    df_corr = pd.merge(df_corr, pd.read_csv(corr_pheno_file.as_posix()), on='SUBID')

    # Standardize column names across datasets (index labels become strings).
    df_corr = df_corr.rename(index=str, columns={'SUBID': 'subject', 'AGE_AT_SCAN_1': 'age', 'SEX': 'sex'})

    # read_csv yields str cells only when the column contains non-numeric
    # entries; otherwise the cells are ints/floats and a comparison against
    # the string '1' would never match. Coerce first so 1, 1.0 and '1' are
    # all recognized.
    # NOTE(review): unparseable/missing values are coded 2, same as before —
    # confirm that is the intended handling of missing sex.
    sex_numeric = pd.to_numeric(df_corr.sex, errors='coerce')
    df_corr['sex'] = [1 if value == 1 else 2 for value in sex_numeric]

    # Sort by path first so the entry kept per subject does not depend on
    # the filesystem's glob order.
    df_corr = df_corr.sort_values(by='subj_paths')
    df_corr = df_corr.drop_duplicates(subset='subject', keep='first')

    # Non-numeric ages become NaN -> 0; the int64 cast drops any fraction.
    df_corr['age'] = pd.to_numeric(df_corr.age, errors='coerce').fillna(0).astype(np.int64)
    df_corr = df_corr.sort_values(by='subject')

    return df_corr

## SALD Subject Data

In [4]:
def sald_subjectdata(sald_fs_dir, sald_pheno_file):
    """Build the SALD subject table.

    Every ``sub-*`` entry under ``sald_fs_dir`` becomes one row; the text
    after the 4-character ``sub-`` prefix is parsed as the integer
    ``Sub_ID`` that joins the phenotype spreadsheet.

    Parameters
    ----------
    sald_fs_dir : pathlib.Path
        Directory that is globbed for ``sub-*`` entries.
    sald_pheno_file : pathlib.Path
        Excel file with (at least) the columns ``Sub_ID``, ``Age`` and
        ``Sex.1``.

    Returns
    -------
    pandas.DataFrame
        One row per subject, sorted by ``subject``, with the standardized
        columns ``subject``, ``age`` (int64) and ``sex`` plus
        ``subj_paths``. ``sex`` is 1 where the source value equals 1 and 2
        for everything else. ``age`` is truncated to an integer, and values
        that cannot be parsed as numbers become 0.

    Raises
    ------
    ValueError
        If an entry name does not end in an integer after ``sub-``.
    """
    fs_paths = [entry.as_posix() for entry in sald_fs_dir.glob('sub-*')]
    table = pd.DataFrame({'subj_paths': fs_paths})
    table = table.assign(Sub_ID=[int(Path(p).name[4:]) for p in fs_paths])

    # Inner join: only subjects present on disk AND in the spreadsheet survive.
    phenotypes = pd.read_excel(sald_pheno_file.as_posix())
    table = table.merge(phenotypes, on='Sub_ID')

    # Standardize column names across datasets (index labels become strings).
    table = table.rename(
        index=str,
        columns={'Sub_ID': 'subject', 'Age': 'age', 'Sex.1': 'sex'},
    )
    table['sex'] = [1 if code == 1 else 2 for code in table.sex]

    table = (
        table
        .sort_values(by='subject')
        .drop_duplicates(subset='subject', keep='first')
    )

    # Non-numeric ages become NaN -> 0; the int64 cast drops any fraction.
    numeric_age = pd.to_numeric(table.age, errors='coerce')
    table['age'] = numeric_age.fillna(0).astype(np.int64)

    return table.sort_values(by='subject')

## MRIQC Dataframe

In [9]:
def mriqc_df(mriqc_dir, pheno_file, data=None, data_class=None, merge_df=None):
    """Load ``T1w.csv`` from an MRIQC directory and tag it with dataset labels.

    Parameters
    ----------
    mriqc_dir : pathlib.Path
        Directory containing ``T1w.csv``.
    pheno_file : object
        Accepted for interface compatibility; not used by this function.
    data : object, optional
        Value stored in every row of the new ``dataset`` column.
    data_class : object, optional
        Value stored in every row of the new ``data_class`` column.
    merge_df : pandas.DataFrame, optional
        If given, inner-joined onto the MRIQC table on ``subject``.

    Returns
    -------
    pandas.DataFrame
        The MRIQC table with ``subject_id`` renamed to ``subject``, the
        ``dataset`` / ``data_class`` columns appended and, when ``merge_df``
        is supplied, its columns merged in.
    """
    df = pd.read_csv(mriqc_dir.joinpath('T1w.csv').as_posix())

    # 'Unnamed: 0' is what read_csv calls a header-less first column
    # (typically a saved index); drop it silently if present.
    if 'Unnamed: 0' in df.columns:
        del df['Unnamed: 0']

    # Repeat the label once per row rather than passing it as a scalar, so a
    # sequence-valued label is still stored as one value per row.
    n_rows = len(df)
    df = df.assign(dataset=[data] * n_rows)
    df = df.assign(data_class=[data_class] * n_rows)
    df = df.rename(index=str, columns={'subject_id': 'subject'})

    if merge_df is not None:
        df = pd.merge(df, merge_df, on='subject')

    return df