In [1]:
import yaml
import numpy as np
import pandas as pd
from pathlib import Path

In [4]:
def scale_name_mapping(yaml_file, col_names, scale_metadata):
    """
    Build the column-rename dictionary for one scale.

    Parameters
    ----------
    yaml_file : str or path-like
        YAML file whose top-level 'items' entry holds the variable name
        mappings.
    col_names : iterable
        Column names handed unchanged to ``match_items_to_newnames``.
    scale_metadata : pandas.DataFrame
        Must contain 'ElementName' and 'Aliases' columns. Rows whose
        'Aliases' value is missing are ignored.

    Returns
    -------
    Whatever ``match_items_to_newnames(col_names, item_dict)`` returns.

    Notes
    -----
    NOTE(review): ``add_aliases_to_item_dict`` and ``match_items_to_newnames``
    are not defined in the visible part of this notebook -- confirm they are
    in scope before calling this function.
    """
    # load the variable name mappings from the yaml file
    # NOTE(review): yaml.load + FullLoader is fine for trusted project files;
    # consider yaml.safe_load if these yamls could come from elsewhere.
    with open(yaml_file, 'r') as handle:
        yaml_contents = yaml.load(handle, Loader=yaml.FullLoader)
    name_map = yaml_contents['items']

    # only metadata rows that actually list aliases are relevant
    has_alias = scale_metadata['Aliases'].notna()

    # primary variable name -> its aliases entry
    # (comma-separated according to the original author's comment)
    alias_lookup = {}
    for _, row in scale_metadata.loc[has_alias].iterrows():
        alias_lookup[row['ElementName']] = row['Aliases']

    # extend the mapping so every alias carries the descriptive name as well
    name_map = add_aliases_to_item_dict(name_map, alias_lookup)

    # finally, pair the scale's columns with their descriptive names
    return match_items_to_newnames(col_names, name_map)


### Set Up

In [5]:
# Directory Paths
# project root; the reference yamls are located underneath it
repo_dir = '/home/ubuntu/Projects/canbind'
# root of the clinical csv files, read below as <clin_dir>/<scale>/<group>/
clin_dir = '/home/ubuntu/canbind_upload/Clinical'
# folder of per-scale yaml files (<scale>.yaml, lower-cased scale name)
yaml_dir = Path(repo_dir) / 'reference/behavior/scale_yamls'

In [6]:
# List of CANBIND scales
# NOTE: order matters -- scales are read, renamed and merged in this order,
# which determines the column order of the combined dataframe.
scale_list = [
    'ATHF', 'BMI', 'BRIAN', 'CNSVS', 'DARS',
    'DID', 'GAD7', 'IPAQ', 'MADRS', 'MINI',
    'PSQI', 'QIDS', 'SDS', 'SHAPS', 'WHOQOL',
    'BISBAS', 'BPI', 'CGI', 'DEMO', 'ECRR',
    'HCL32', 'LEAPS', 'MEDHIS', 'NEOFFI', 'PSYHIS',
    'QLESQ', 'SEXFX', 'SPAQ', 'YMRS',
]


## Read raw scale dataframes
Each scale's dataframe is stored in a dictionary keyed by scale name.

In [7]:
# read each behavioral scale into scale_dict (scale name -> raw dataframe)
scale_dict = {}
for scale in scale_list:
    print( f'------- {scale} -------')
    data_list = []
    # data are stored separately for the Control and MDD groups
    for grp in ['Control', 'MDD']:
        scale_dir = Path(clin_dir, scale, grp)
        csv_list  = list(scale_dir.glob('*csv'))
        if len(csv_list) != 1:
            # exactly one csv is expected per group folder; report the
            # anomaly instead of silently dropping the whole group
            print(f'WARNING: expected 1 csv in {scale_dir}, found {len(csv_list)}; skipping {grp}')
            continue
        data_list.append(pd.read_csv(csv_list[0]))

    if not data_list:
        # previously surfaced as a bare IndexError on data_list[0]
        raise FileNotFoundError(
            f'No csv data found for scale {scale} under {Path(clin_dir, scale)}'
        )

    # stack the groups when both are present, otherwise keep the single frame
    scale_df = pd.concat(data_list) if len(data_list) > 1 else data_list[0]
    scale_dict[scale] = scale_df



------- ATHF -------
------- BMI -------
------- BRIAN -------
------- CNSVS -------
------- DARS -------
------- DID -------
------- GAD7 -------
------- IPAQ -------
------- MADRS -------
------- MINI -------
------- PSQI -------
------- QIDS -------
------- SDS -------
------- SHAPS -------
------- WHOQOL -------
------- BISBAS -------
------- BPI -------
------- CGI -------
------- DEMO -------
------- ECRR -------
------- HCL32 -------
------- LEAPS -------
------- MEDHIS -------
------- NEOFFI -------
------- PSYHIS -------
------- QLESQ -------
------- SEXFX -------
------- SPAQ -------
------- YMRS -------


In [8]:
# rename columns using descriptive ids in reference yamls

# identifier columns that keep their original name in every scale
keep_the_same     = ['SUBJLABEL', 'Group', 'EVENTNAME', 'Visitnum']
rename_scale_dict = {}
for scale, scale_df in scale_dict.items():
    print( f'------- {scale} -------')

    # the reference yaml is named after the lower-cased scale
    yaml_file = Path(yaml_dir, f'{scale.lower()}.yaml')
    # NOTE(review): yaml.load + FullLoader is fine for trusted project files;
    # consider yaml.safe_load if these yamls could come from elsewhere.
    with open(yaml_file, 'r') as f:
        scale_name_json = yaml.load(f, Loader=yaml.FullLoader)
    # original id -> descriptive id
    item_dict = scale_name_json['items']

    # old column name -> new column name, in order of precedence:
    #   1. column listed in the yaml   -> '<scale>-<column>-<descriptive id>'
    #   2. shared identifier column    -> unchanged
    #   3. anything else               -> '<scale>-<column>-IS-<column>'
    newname_dict = {
        item: (
            f'{scale}-{item}-{item_dict[item]}' if item in item_dict
            else item if item in keep_the_same
            else f'{scale}-{item}-IS-{item}'
        )
        for item in scale_df.columns
    }

    # store a renamed copy; the raw frame in scale_dict is left untouched
    rename_df = scale_df.rename(columns=newname_dict)
    rename_scale_dict[scale] = rename_df
        

------- ATHF -------
------- BMI -------
------- BRIAN -------
------- CNSVS -------
------- DARS -------
------- DID -------
------- GAD7 -------
------- IPAQ -------
------- MADRS -------
------- MINI -------
------- PSQI -------
------- QIDS -------
------- SDS -------
------- SHAPS -------
------- WHOQOL -------
------- BISBAS -------
------- BPI -------
------- CGI -------
------- DEMO -------
------- ECRR -------
------- HCL32 -------
------- LEAPS -------
------- MEDHIS -------
------- NEOFFI -------
------- PSYHIS -------
------- QLESQ -------
------- SEXFX -------
------- SPAQ -------
------- YMRS -------


In [9]:
# seed the combined dataframe with the renamed DEMO scale
# (the following cell repeats this assignment before merging)
main_df = rename_scale_dict['DEMO']


In [10]:

# Start from the renamed DEMO frame and outer-join every other scale onto it,
# one scale at a time, on the shared identifier columns.
merge_keys = ['SUBJLABEL', 'Group', 'EVENTNAME', 'Visitnum']
main_df = rename_scale_dict['DEMO']
# DEMO is the starting frame, so it is not merged onto itself.
# NOTE(review): MEDHIS is also left out, with no reason given here -- confirm.
iter_scales = [name for name in rename_scale_dict if name not in ['MEDHIS', 'DEMO']]
for scale in iter_scales:
    print(scale)
    cur_df = rename_scale_dict[scale]
    main_df = pd.merge(main_df, cur_df, on=merge_keys, how='outer')
    # the row count should stay constant if every scale has one row per key
    print(main_df.shape)
    

ATHF
(323, 53)
BMI
(323, 57)
BRIAN
(323, 85)
CNSVS
(323, 180)
DARS
(323, 212)
DID
(323, 247)
GAD7
(323, 257)
IPAQ
(323, 268)
MADRS
(323, 280)
MINI
(323, 315)
PSQI
(323, 348)
QIDS
(323, 367)
SDS
(323, 374)
SHAPS
(323, 404)
WHOQOL
(323, 438)
BISBAS
(323, 466)
BPI
(323, 482)
CGI
(323, 484)
ECRR
(323, 537)
HCL32
(323, 574)
LEAPS
(323, 588)
NEOFFI
(323, 684)
PSYHIS
(323, 692)
QLESQ
(323, 711)
SEXFX
(323, 727)
SPAQ
(323, 877)
YMRS
(323, 891)


In [13]:
# write combined file
beh_out = '/home/ubuntu/fsx/research/imaging/datasets/CANBIND/organized_raw_data/beh/CANBIND_clinical_baseline.csv'
main_df.to_csv(beh_out, index=None)


In [15]:
# preview the first five rows of the combined dataframe
main_df.head(n=5)

Unnamed: 0,SUBJLABEL,Group,EVENTNAME,Visitnum,DEMO-AGE-IS-Age,DEMO-SEX-IS-Sex,DEMO-EDUC-IS-Education,DEMO-EMPLOY_STATUS-IS-EmployStatus,DEMO-HANDEDNESS-IS-Handedness,DEMO-HSHLD_INCOME-IS-HouseholdIncome,...,YMRS-YMRS_5_IRBLTY-IS-irritability,YMRS-YMRS_6_SPCH_RATE_AND_AMT-IS-speechRate,YMRS-YMRS_7_LANG_OR_THT_DISRDR-IS-languageThoughDisorder,YMRS-YMRS_8_THT_CTNT-IS-thoughtContent,YMRS-YMRS_9_DSRPTV_OR_AGGR_BHVR-IS-disruptiveBehavior,YMRS-YMRS_10_APRNC-IS-appearance,YMRS-YMRS_11_INSGT-IS-insight,YMRS-YMRS_OVERL_SEVTY-TS-YMRS_TotalScore,YMRS-YMRS_IRTBLTY-SS-YMRS_irritability,YMRS-comp_ymrs-IS-comp_ymrs
0,CBN01_CAM_0002,Control,Baseline,1,26.0,2.0,14.0,1.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,CBN01_CAM_0004,Control,Baseline,1,29.0,2.0,22.0,6.0,2.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,CBN01_CAM_0006,Control,Baseline,1,23.0,2.0,19.0,6.0,2.0,9999.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,CBN01_CAM_0015,Control,Baseline,1,33.0,1.0,19.0,6.0,2.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,CBN01_CAM_0016,Control,Baseline,1,55.0,2.0,17.0,1.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1


In [17]:
# full list of column names in the combined dataframe
main_df.columns.tolist()

['SUBJLABEL',
 'Group',
 'EVENTNAME',
 'Visitnum',
 'DEMO-AGE-IS-Age',
 'DEMO-SEX-IS-Sex',
 'DEMO-EDUC-IS-Education',
 'DEMO-EMPLOY_STATUS-IS-EmployStatus',
 'DEMO-HANDEDNESS-IS-Handedness',
 'DEMO-HSHLD_INCOME-IS-HouseholdIncome',
 'DEMO-JOB_CLASS-IS-JobCategory',
 'DEMO-MRTL_STATUS-IS-MaritalStatus',
 'DEMO-ethncty_cdn___1-IS-Ethnic_Aboriginal',
 'DEMO-ethncty_cdn___2-IS-Ethnic_Arab',
 'DEMO-ethncty_cdn___3-IS-Ethnic_Black',
 'DEMO-ethncty_cdn___4-IS-Ethnic_Chinese',
 'DEMO-ethncty_cdn___5-IS-Ethnic_EastAsian',
 'DEMO-ethncty_cdn___6-IS-Ethnic_Filipino',
 'DEMO-ethncty_cdn___7-IS-Ethnic_Japanese',
 'DEMO-ethncty_cdn___8-IS-Ethnic_Jewish',
 'DEMO-ethncty_cdn___9-IS-Ethnic_Korean',
 'DEMO-ethncty_cdn___10-IS-Ethnic_LatinAmHispanic',
 'DEMO-ethncty_cdn___11-IS-Ethnic_SouthAsian',
 'DEMO-ethncty_cdn___12-IS-Ethnic_SoutheastAsian',
 'DEMO-ethncty_cdn___13-IS-Ethnic_WestAsian',
 'DEMO-ethncty_cdn___14-IS-Ethnic_White',
 'DEMO-ethncty_cdn___9996-IS-Ethnic_Other',
 'DEMO-ethncty_cdn___9998-I