In [1]:
import os
import glob
import seaborn as sns
import re
import pandas as pd
import plotly.express as px
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

# hbn-specific libraries - make sure you have installed (pipenv install) and activated (pipenv shell) 
# the virtual environment for this project, and make sure you have created an ipykernel for this environment (ipython kernel install --name "hbn" --user)
from hbn.constants import Defaults
from hbn.scripts import preprocess_phenotype, make_phenotype_specs
from hbn.data import make_dataset
from hbn.features import build_features
from hbn.features.feature_selection import phenotype_features

%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


In [2]:
# RUN THIS CELL
# FUNCTIONS

def load_data(
    participants, 
    feature_spec, 
    cols_to_keep=['DX_01', 'DX_01_Cat', 'Age', 'Sex', 'Identifiers']
    ):
    """Load phenotype features and attach clinical/demographic columns.

    Args:
        participants: passed straight through to `phenotype_features`.
        feature_spec: feature spec file, joined onto `Defaults.FEATURE_DIR`.
        cols_to_keep: columns taken from the clinical summary before merging.
            Must contain 'Identifiers', because that is the merge key.

    Returns:
        DataFrame of features merged (default inner join) with the selected
        summary columns on 'Identifiers'.
    """
    # os.path.join returns `feature_spec` untouched when it is already absolute,
    # so both bare file names and full paths work here
    spec_path = os.path.join(Defaults.FEATURE_DIR, feature_spec)

    features = phenotype_features(
        target_spec=None,
        feature_spec=spec_path,
        participants=participants,
        preprocess=False,
        drop_identifiers=False,
    )

    # clinical diagnosis summary, extended with race/ethnicity information
    summary = make_dataset.make_summary(save=False)
    summary = make_dataset._add_race_ethnicity(dataframe=summary)

    # keep only the requested summary columns and join them onto the features
    selected = summary[cols_to_keep]
    return features.merge(selected, on='Identifiers')

def make_dataframe(
    feature_spec,
    participants,
    cols_to_keep='Identifiers|DX_01|Age|Sex|DX_01_Cat',
    filter_scores=True,
    ):
    """Build a long-format dataframe of scores for one feature spec.

    Args:
        feature_spec: path or file name of the feature spec. Assessment, domain
            and measure are parsed from the '-'-separated file name.
        participants: passed through to `load_data`.
        cols_to_keep: '|'-separated column names. Used both as the list of
            id columns and as a regex alternation when filtering columns.
        filter_scores: if True, keep only columns whose name matches one of the
            score suffixes (plus `cols_to_keep`) before melting.

    Returns:
        DataFrame with one row per id-column combination and variable, carrying
        'Name', 'Scores', 'Question', 'Variable', 'Assessment', 'domain',
        'measure' and 'Age_rounded'. An empty DataFrame is returned when the spec
        yields no feature columns, no data dictionary is found, or no diagnosis
        group produced any rows.
    """
    # get data
    list_of_cols = cols_to_keep.split('|')
    df = load_data(participants,
                   feature_spec=feature_spec,
                   cols_to_keep=list_of_cols
                   )

    # get assessment, domain, measure names; the measure may itself contain '-'
    # so everything between the domain and the trailing 'spec.json' is re-joined
    feature_spec_split = Path(feature_spec).name.split('-')
    assessment, domain = feature_spec_split[1], feature_spec_split[2]
    measure = '-'.join(feature_spec_split[3:-1])

    # nothing to reshape if only the id/demographic columns came back
    if len(df.columns) <= len(list_of_cols):
        return pd.DataFrame()

    # get abbrev from dataframe based on default columns
    # (column names appear to be '<abbrev>,<variable>' - the part before the comma is used)
    default_cols = df.filter(regex='Site|EID|START_DATE|Data_entry|Year').columns
    if len(default_cols) == 0:
        # previously an IndexError; without an abbreviation there is no dictionary to look up
        return pd.DataFrame()
    abbrev = default_cols[0].split(',')[0]

    # only process data that have accompanying dictionaries
    fpath = os.path.join(Defaults.PHENO_DIR, 'Release9_DataDic', abbrev + '.xlsx')
    if not os.path.isfile(fpath):
        return pd.DataFrame()
    dict_df = pd.read_excel(fpath, header=1)

    # first two columns of the data dic are treated as question text and variable name
    dict_df = dict_df.rename(columns={dict_df.columns[0]: "Question", dict_df.columns[1]: "Variable"})

    # filter dataframe on certain columns and regex patterns
    # (abbrev is escaped so it is matched literally)
    df_filter = df.filter(regex=f'{re.escape(abbrev)}|{cols_to_keep}')

    scores_to_filter = '_T|_Stnd|_Sum|_Score|_Scale|_Standard|_IN|_HY'

    # loop over diagnosis groups and melt the score columns into one;
    # collect the pieces and concatenate once instead of growing a frame in the loop
    frames = []
    for name, group in df_filter.groupby('DX_01'):
        if filter_scores:
            group = group.filter(regex=f'{scores_to_filter}|{cols_to_keep}')
        tmp = group.melt(id_vars=list_of_cols).rename({'variable': 'Name', 'value': 'Scores'}, axis=1)
        # strip the '<abbrev>,' prefix literally; the default of `regex` differs across pandas versions
        tmp['Name'] = tmp['Name'].str.replace(f'{abbrev},', '', regex=False)
        tmp = tmp.merge(dict_df[['Question', 'Variable']], left_on=['Name'], right_on=['Variable'])
        tmp['Assessment'], tmp['domain'], tmp['measure'] = assessment, domain, measure
        frames.append(tmp)

    # no diagnosis groups at all: avoid indexing columns that do not exist
    if not frames:
        return pd.DataFrame()

    # the original prepended each group, so reverse to keep the same row order
    df_all = pd.concat(frames[::-1])

    # do some clean up on existing columns
    df_all['Age_rounded'] = df_all['Age'].round()
    df_all['Question'] = df_all['Question'].str.replace("T Score", "T-Score", regex=False)

    return df_all.reset_index(drop=True)


def get_data(domain):
    """Collect long-format score data for every feature spec of a domain.

    Args:
        domain: substring of the feature spec file names; every file in
            `Defaults.FEATURE_DIR` matching `*{domain}*` is processed.

    Returns:
        Concatenation of the `make_dataframe` results for all matching specs
        (original per-spec indices are kept). An empty DataFrame is returned
        when no spec matches or none of them produced data.
    """
    participants = make_dataset.get_participants(
                                split='all', 
                                disorders=['ADHD-Combined Type', 
                                            'ADHD-Inattentive Type', 
                                            'ADHD-Hyperactive_Impulsive_Type', 
                                            'Other_Specified_Attention-Deficit_Hyperactivity_Disorder',
                                            'No_Diagnosis_Given']
                                            )
    # find all feature specs belonging to this domain
    feature_specs = glob.glob(os.path.join(Defaults.FEATURE_DIR, f'*{domain}*'))

    # loop over feature specs, collecting results so they are concatenated only once
    frames = []
    for feature_spec in feature_specs:
        df = make_dataframe(
                            feature_spec=feature_spec,
                            participants=participants,
                            cols_to_keep='Identifiers|DX_01|Age|Sex|DX_01_Cat',
                            filter_scores=True,
                            )
        # specs without data come back as a column-less frame; they add nothing
        if len(df.columns) > 0:
            frames.append(df)
        print(feature_spec)

    # pd.concat raises on an empty list, so keep returning an empty frame in that case
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames)


In [3]:
## RUN THIS CELL ##

# Preprocess data
#preprocess_phenotype.run()

# get specs
#make_phenotype_specs.run()

In [4]:
assessment = 'Parent Measures'

# look up the domains of this assessment and drop the catch-all 'all' entry
domains = build_features.get_domains(assessment=assessment)
domains[assessment].remove('all')

# replace every separator character ('_', ',', '/', ' ') with an underscore.
# NOTE: adjacent separators such as ', ' each become their own underscore, so names can
# contain '__' (e.g. 'structure__Stress'), while the spec file names printed further down
# use single underscores - presumably why the domain is typed by hand in the cell below.
domains = [re.sub(r'[_,/ ]', '_', d) for d in domains[assessment]]
print(domains)

['Interview_of_Emotional_and_Psychological_Function', 'Demographic_Questionnaire_Measures', 'Questionnaire_Measures_of_Family_structure__Stress__and_Trauma']


## Questionnaire Measures of Family structure Stress and Trauma

In [5]:
df = get_data(domain='Questionnaire_Measures_of_Family_structure_Stress_and_Trauma')

# Alternative view (disabled): one bar plot per question, coloured by diagnosis.
# for name, group in df.groupby('Question'):
#     measure = group['measure'].unique()[0]
#     fig = sns.barplot(data=group, x='Question', y='Scores', hue='DX_01')
#     plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
#     plt.xticks([]);
#     plt.xlabel('')
#     plt.title(f'{measure}-{name}')
#     plt.show()

# One polar plot per question: mean score per diagnosis, one trace per sex.
# Group on the bare column name rather than a one-element list, so that `name`
# is the question string and not a 1-tuple (which would end up in the title).
for name, group in df.groupby('Question'):
    
    measure = group['measure'].unique()[0]

    # average only the score column; a frame-wide .mean() fails on the string
    # columns in recent pandas versions
    tmp = group.groupby(['DX_01', 'Sex'], as_index=False)['Scores'].mean()

    fig = px.line_polar(tmp, r="Scores", 
                        theta="DX_01", 
                        color="Sex", 
                        line_close=True, 
                        title=f'{measure}-{name}',
                        range_r=[tmp['Scores'].min(), tmp['Scores'].max()]
                       ) 
    fig.show()

/Users/maedbhking/Documents/healthy_brain_network/features/features-Parent_Measures-Questionnaire_Measures_of_Family_structure_Stress_and_Trauma-PhenX_Neighborhood_Safety-spec.json


/Users/maedbhking/Documents/healthy_brain_network/features/features-Parent_Measures-Questionnaire_Measures_of_Family_structure_Stress_and_Trauma-Parenting_Stress_Index_Fourth_Edition-spec.json


/Users/maedbhking/Documents/healthy_brain_network/features/features-Parent_Measures-Questionnaire_Measures_of_Family_structure_Stress_and_Trauma-Distress_Tolerance_Scale-spec.json


/Users/maedbhking/Documents/healthy_brain_network/features/features-Parent_Measures-Questionnaire_Measures_of_Family_structure_Stress_and_Trauma-Parent-Child_Internet_Addiction_Test-spec.json


/Users/maedbhking/Documents/healthy_brain_network/features/features-Parent_Measures-Questionnaire_Measures_of_Family_structure_Stress_and_Trauma-Alabama_Parenting_Questionnaire_–_Parent_Report-spec.json
