In [2]:
import yaml
import numpy as np
import pandas as pd
from pathlib import Path

In [4]:
def scale_name_mapping(yaml_file, col_names, scale_metadata):
    """
    Build the column-rename dictionary for one behavioral scale.

    The ``items`` mapping stored in ``yaml_file`` is extended with alias
    names taken from ``scale_metadata`` (via ``add_aliases_to_item_dict``)
    and then matched against ``col_names`` (via ``match_items_to_newnames``).

    Parameters
    ----------
    yaml_file : path-like
        YAML file whose top-level ``items`` entry holds the name mapping.
    col_names
        Column names to rename; handed unchanged to
        ``match_items_to_newnames``.
    scale_metadata : pandas.DataFrame
        Must contain the columns ``ElementName`` and ``Aliases``.

    Returns
    -------
    Whatever ``match_items_to_newnames`` returns for ``col_names`` and the
    alias-extended item mapping.
    """
    # load the variable-name mapping from the reference yaml
    with open(yaml_file, 'r') as handle:
        yaml_content = yaml.load(handle, Loader=yaml.FullLoader)
    descriptive_names = yaml_content['items']

    # only metadata rows with a non-missing alias entry matter here
    has_alias = scale_metadata['Aliases'].notna()
    aliased_rows = scale_metadata.loc[has_alias]

    # primary variable name -> its alias entry
    # (presumably a comma-separated string; verify against the metadata file)
    alias_dict = dict(zip(aliased_rows['ElementName'], aliased_rows['Aliases']))

    # let every alias resolve to the same descriptive name as its primary variable
    descriptive_names = add_aliases_to_item_dict(descriptive_names, alias_dict)

    # finally, pair the requested columns with their descriptive names
    return match_items_to_newnames(col_names, descriptive_names)


In [5]:
# Directory layout used by the rest of the notebook.
# (disabled alternative root: base_dir = '/home/ubuntu/canbind-fsx')
clin_dir = '/home/ec2-user/SageMaker/ebs/fsx/Clinical'   # per-scale clinical csv folders
repo_dir = '/home/ec2-user/SageMaker/suhas/canbind'      # checkout of the analysis repo
yaml_dir = Path(repo_dir) / 'scripts/reference/behavior/scale_yamls'  # per-scale reference yamls


In [6]:
# Behavioral scales to load; each name is used as a sub-folder of `clin_dir`
# and (lower-cased) as the stem of its reference yaml.
scale_list = [
    'ATHF', 'BMI', 'BRIAN', 'CNSVS', 'DARS',
    'DID', 'GAD7', 'IPAQ', 'MADRS', 'MINI',
    'PSQI', 'QIDS', 'SDS', 'SHAPS', 'WHOQOL',
    'BISBAS', 'BPI', 'CGI', 'DEMO', 'ECRR',
    'HCL32', 'LEAPS', 'MEDHIS', 'NEOFFI', 'PSYHIS',
    'QLESQ', 'SEXFX', 'SPAQ', 'YMRS',
]


In [8]:

#sessions_dir = Path(base_dir, 'research/imaging/datasets/SRPBS/processed_data/pf-pipelines/qunex-nbridge/studies/CANBIND-20220818-mCcU5pi4/sessions')

# Read each behavioral scale.
# For every scale, the single csv found in each group folder
# (<clin_dir>/<scale>/<group>/) is loaded and the groups are stacked row-wise.
scale_dict = {}
for scale in scale_list:
    data_list = []
    for grp in ['Control', 'MDD']:
        scale_dir = Path(clin_dir, scale, grp)
        csv_list  = list(scale_dir.glob('*csv'))
        if len(csv_list) != 1:
            # Exactly one csv per scale/group is expected. Report anything else
            # instead of silently dropping the whole group from the analysis.
            print(f'WARNING: expected 1 csv in {scale_dir}, found {len(csv_list)}; skipping group')
            continue
        data_list.append(pd.read_csv(csv_list[0]))

    if not data_list:
        # Previously this surfaced as an opaque IndexError on data_list[0].
        raise FileNotFoundError(
            f'No usable csv found for scale {scale!r} under {Path(clin_dir, scale)}'
        )

    if len(data_list) > 1:
        scale_df = pd.concat(data_list)
    else:
        scale_df = data_list[0]
    scale_dict[scale] = scale_df



In [9]:
# Give every scale's columns descriptive ids taken from its reference yaml.
rename_scale_dict = {}
for scale, scale_df in scale_dict.items():
    print( f'------- {scale} -------')

    # load the reference yaml for this scale
    yaml_file = Path(yaml_dir, f'{scale.lower()}.yaml')
    with open(yaml_file, 'r') as f:
        scale_name_json = yaml.load(f, Loader=yaml.FullLoader)
    # original id -> descriptive id
    item_dict = scale_name_json['items']

    # Columns listed in the yaml become '<scale>-<id>-<descriptive id>';
    # every other column keeps its current name.
    newname_dict = {
        item: (f'{scale}-{item}-{item_dict[item]}' if item in item_dict else item)
        for item in scale_df.columns
    }

    # store a renamed copy; the frame in scale_dict is left untouched
    rename_df = scale_df.copy()
    rename_df.columns = rename_df.columns.map(newname_dict)
    rename_scale_dict[scale] = rename_df
        

------- ATHF -------
------- BMI -------
------- BRIAN -------
------- CNSVS -------
------- DARS -------
------- DID -------
------- GAD7 -------
------- IPAQ -------
------- MADRS -------
------- MINI -------
------- PSQI -------
------- QIDS -------
------- SDS -------
------- SHAPS -------
------- WHOQOL -------
------- BISBAS -------
------- BPI -------
------- CGI -------
------- DEMO -------
------- ECRR -------
------- HCL32 -------
------- LEAPS -------
------- MEDHIS -------
------- NEOFFI -------
------- PSYHIS -------
------- QLESQ -------
------- SEXFX -------
------- SPAQ -------
------- YMRS -------


In [13]:
scale = 'DARS'

# identifier columns that must keep their original names
# (they are the merge keys used when the scales are combined)
keep_the_same = ['SUBJLABEL', 'Group', 'EVENTNAME', 'Visitnum']
print( f'------- {scale} -------')
scale_df  = scale_dict[scale]

# read yaml
yaml_file = Path(yaml_dir, f'{scale.lower()}.yaml')
with open(yaml_file, 'r') as f:
    scale_name_json = yaml.load(f, Loader=yaml.FullLoader)
# original id to descriptive id
item_dict = scale_name_json['items']

# create a name dictionary for column replacement
newname_dict = {}
for item in scale_df.columns:
    if item in item_dict:
        # column described in the yaml: '<scale>-<id>-<descriptive id>'
        newname_dict[item] = f'{scale}-{item}-{item_dict[item]}'
    elif item in keep_the_same:
        newname_dict[item] = item
    else:
        # Column absent from the yaml: still prefix it with the scale name.
        # The name used to be computed but never stored, so Index.map()
        # replaced these column labels with NaN.
        newname_dict[item] = f'{scale}-{item}-{item}'

# save the renamed dataframe
rename_df = scale_df.copy()
rename_df.columns = rename_df.columns.map(newname_dict)
rename_scale_dict[scale] = rename_df

scale_df.columns

------- DARS -------


Index(['SUBJLABEL', 'Group', 'EVENTNAME', 'Visitnum', 'DARS_B_1', 'DARS_B_2',
       'DARS_B_3', 'DARS_B_4', 'DARS_B_5', 'DARS_B_6', 'DARS_B_7', 'DARS_B_8',
       'DARS_B_9', 'DARS_Hobbies_Tot', 'DARS_D_10', 'DARS_D_11', 'DARS_D_12',
       'DARS_D_13', 'DARS_D_14', 'DARS_D_15', 'DARS_Food_Tot', 'DARS_F_16',
       'DARS_F_17', 'DARS_F_18', 'DARS_F_19', 'DARS_F_20', 'DARS_F_21',
       'DARS_Social_Tot', 'DARS_H_22', 'DARS_H_23', 'DARS_H_24', 'DARS_H_25',
       'DARS_H_26', 'DARS_Sense_Tot', 'DARS_Tot', 'comp_dars'],
      dtype='object')

In [10]:
# MEDHIS is pulled out on its own and is not part of the merged table below.
medhis_df = rename_scale_dict['MEDHIS']

# Start from the DEMO table and outer-join every remaining scale onto it,
# one scale at a time, printing the running shape after each join.
merge_keys = ['SUBJLABEL', 'Group', 'EVENTNAME', 'Visitnum']
excluded_scales = ('MEDHIS', 'DEMO')

main_df = rename_scale_dict['DEMO']
iter_scales = [name for name in rename_scale_dict if name not in excluded_scales]
for scale in iter_scales:
    print(scale)
    cur_df = rename_scale_dict[scale]
    main_df = pd.merge(main_df, cur_df, on=merge_keys, how='outer')
    print(main_df.shape)
    

ATHF
(323, 53)
BMI
(323, 57)
BRIAN
(323, 85)
CNSVS
(323, 180)
DARS
(323, 212)
DID
(323, 247)
GAD7
(323, 257)
IPAQ
(323, 268)
MADRS
(323, 280)
MINI
(323, 315)
PSQI
(323, 348)
QIDS
(323, 367)
SDS
(323, 374)
SHAPS
(323, 404)
WHOQOL
(323, 438)
BISBAS
(323, 466)
BPI
(323, 482)
CGI
(323, 484)
ECRR
(323, 537)
HCL32
(323, 574)
LEAPS
(323, 588)
NEOFFI
(323, 684)
PSYHIS
(323, 692)
QLESQ
(323, 711)
SEXFX
(323, 727)
SPAQ
(323, 877)
YMRS
(323, 891)
