In [1]:
import os
import glob
import seaborn as sns
import re
import pandas as pd
import plotly.express as px
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

# hbn-specific libraries - make sure you have installed and activated the project's conda environment (conda activate healthy-brain-network),
# and that you have created an ipykernel for this environment (ipython kernel install --name "hbn" --user)
from hbn.constants import Defaults
from hbn.data import make_dataset
from hbn.features import build_features
from hbn.features.feature_selection import phenotype_features

%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')


The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


In [None]:
## PREPROCESS DATA - RUN THIS CELL ##
#from hbn.scripts import preprocess_phenotype, make_phenotype_specs

# Preprocess data
#preprocess_phenotype.run()

# get specs
#make_phenotype_specs.run()

In [3]:
# RUN THIS CELL
# FUNCTIONS

def load_data(
    participants, 
    feature_spec, 
    cols_to_keep=['DX_01', 'DX_01_Cat', 'Age', 'Sex', 'Identifiers']
    ):
    """load the features of one feature spec and attach clinical/demographic columns

    Args:
        participants: participant selection, forwarded unchanged to `phenotype_features`
        feature_spec (str): feature spec file; joined onto `Defaults.FEATURE_DIR`
            (`os.path.join` keeps an absolute path as-is, so a full path also works)
        cols_to_keep (list of str): columns of the summary returned by
            `make_dataset.make_summary` to merge into the features. 'Identifiers' is
            the merge key and is added automatically when it is missing.
    Returns:
        pd dataframe: features merged with the selected summary columns on
            'Identifiers' (pandas default inner join)
    """
    # work on a de-duplicated copy so the (shared) default list is never mutated
    summary_cols = list(dict.fromkeys(cols_to_keep))
    # the merge below needs the key on both sides; without it pandas raises a KeyError
    if 'Identifiers' not in summary_cols:
        summary_cols.append('Identifiers')

    # get data from `feature_spec`
    df = phenotype_features(feature_spec=os.path.join(Defaults.FEATURE_DIR, feature_spec),
                            participants=participants,
                            target_spec=None,
                            preprocess=False,
                            drop_identifiers=False,
                            oversample=False
                            )

    # get summary of clinical diagnosis + other demographics
    dx = make_dataset.make_summary(save=False)

    # merge features with the clinical summary
    return df.merge(dx[summary_cols], on='Identifiers')

def make_dataframe(
    features,
    filter_scores=True,
    id_cols=('Identifiers', 'DX_01', 'Age', 'Sex', 'DX_01_Cat'),
    ):
    """make a long-format dataframe of scores out of a features dataframe

    Args:
        features (pd dataframe): output from `load_data`. Measure columns are named
            '<abbrev>,<variable>'. The optional columns 'assessment', 'domain',
            'measure' and 'mabbrev' (as attached in `get_data`) are copied to the output.
        filter_scores (bool): default is True. filter scores to only return overall/standard/scaled scores
        id_cols (sequence of str): identifier/demographic columns kept as id variables
            when melting. 'DX_01' must be among them (used for grouping).
    Returns:
        pd dataframe: one row per participant and score with columns `id_cols`,
            'Name', 'Scores', 'Question', 'Variable', 'Assessment', 'domain',
            'measure', 'abbrev' and (when 'Age' is present) 'Age_rounded'.
            Empty when there is nothing to reshape or no data dictionary is found.
    """
    id_cols = list(id_cols)

    # nothing to reshape when there are no columns besides the identifiers
    if len(features.columns) <= len(id_cols):
        return pd.DataFrame()

    # get abbrev from dataframe based on the bookkeeping columns every measure carries
    bookkeeping = features.filter(regex='Site|EID|START_DATE|Data_entry|Year').columns
    if len(bookkeeping) == 0:
        return pd.DataFrame()
    abbrev = bookkeeping[0].split(',')[0]

    # only process data that have accompanying dictionaries
    fpath = os.path.join(Defaults.PHENO_DIR, 'Release9_DataDic', abbrev + '.xlsx')
    if not os.path.isfile(fpath):
        return pd.DataFrame()
    dict_df = pd.read_excel(fpath, header=1)
    # first column holds the question text, second the variable name
    dict_df = dict_df.rename(columns={dict_df.columns[0]: "Question", dict_df.columns[1]: "Variable"})

    # spec metadata travels on `features` as constant columns; read it before the
    # column filter below drops it
    # NOTE(review): assumes each of these columns holds one value per frame - confirm
    meta = {}
    for out_col, src_col in (('Assessment', 'assessment'), ('domain', 'domain'),
                             ('measure', 'measure'), ('abbrev', 'mabbrev')):
        has_value = src_col in features.columns and len(features) > 0
        meta[out_col] = features[src_col].iloc[0] if has_value else None

    # filter dataframe on the measure columns and the identifier columns
    id_pattern = '|'.join(re.escape(col) for col in id_cols)
    df_filter = features.filter(regex=f'{re.escape(abbrev)}|{id_pattern}')
    if filter_scores:
        # NOTE(review): the pattern is matched against the full '<abbrev>,<variable>'
        # name, so an abbrev such as 'NIH_Scores' matches '_Score' for every column
        scores_to_filter = '_T|_Stnd|_Sum|_Score|_Scale|_Standard|_IN|_HY'
        df_filter = df_filter.filter(regex=f'{scores_to_filter}|{id_pattern}')

    missing = [col for col in id_cols if col not in df_filter.columns]
    if missing:
        raise KeyError(f'identifier columns missing from features: {missing}')

    # loop over diagnosis groups and melt the score columns into one
    melted = []
    for _, group in df_filter.groupby('DX_01'):
        tmp = group.melt(id_vars=id_cols).rename({'variable': 'Name', 'value': 'Scores'}, axis=1)
        # remove the '<abbrev>,' prefix so names match the data dictionary (literal match)
        tmp['Name'] = tmp['Name'].str.replace(f'{abbrev},', '', regex=False)
        tmp = tmp.merge(dict_df[['Question', 'Variable']], left_on=['Name'], right_on=['Variable'])
        melted.append(tmp.assign(**meta))

    if not melted:
        return pd.DataFrame()

    # reversed: keeps the row order of the previous implementation, which prepended each group
    df_all = pd.concat(melted[::-1])

    # do some clean up on existing columns
    if 'Age' in df_all.columns:
        df_all['Age_rounded'] = df_all['Age'].round()
    df_all['Question'] = df_all['Question'].str.replace("T Score", "T-Score", regex=False)

    return df_all.reset_index(drop=True)


def get_data(domain, disorders=None):
    """get data corresponding to `domain`

    Args:
        domain (str): substring of the feature spec file names to load; every file in
            `Defaults.FEATURE_DIR` matching `*{domain}*` is processed
        disorders (list of str or None): diagnoses forwarded to
            `make_dataset.get_participants`. None (default) selects the ADHD
            subtypes plus 'No_Diagnosis_Given'.

    Returns:
        df_all (pd dataframe): outputs of `make_dataframe` for every matching feature
            spec, concatenated (index is not reset). Empty when no spec matches.
    """
    from hbn import io

    if disorders is None:
        disorders = ['ADHD-Combined_Type',
                     'ADHD-Inattentive_Type',
                     'ADHD-Hyperactive_Impulsive_Type',
                     'Other_Specified_Attention-Deficit_Hyperactivity_Disorder',
                     'No_Diagnosis_Given']

    participants = make_dataset.get_participants(split='all', disorders=disorders)

    # glob returns files in arbitrary order; sort so repeated runs give the same row order
    feature_specs = sorted(glob.glob(os.path.join(Defaults.FEATURE_DIR, f'*{domain}*')))

    # spec keys copied onto the features frame as constant columns
    spec_columns = (('assessment', 'assessment'),
                    ('domain', 'domains'),
                    ('measure', 'measures'),
                    ('mabbrev', 'abbrevs'),
                    ('datadic', 'datadic'))

    # loop over feature specs, collecting frames so they are concatenated only once
    frames = []
    for feature_spec in feature_specs:

        # get data for `feature_spec`
        features = load_data(
                         participants,
                         feature_spec=feature_spec,
                         cols_to_keep=['Identifiers', 'DX_01', 'Age', 'Sex', 'DX_01_Cat']
                         )

        # get assessment, domain, measure names
        spec_info = io.read_json(feature_spec)
        for column, key in spec_columns:
            features[column] = spec_info[key]

        # make dataframe out of features
        frames.append(make_dataframe(features=features, filter_scores=True))
        print(feature_spec)

    # pd.concat raises on an empty list, so handle "no spec matched" explicitly
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames)


In [69]:
# NOTE(review): this cell reads `df`, which is only assigned in cells further down
# the notebook - it will not run on a fresh kernel in top-to-bottom order

# cols to delete: bookkeeping columns that carry no scores
str_to_delete = 'Administration|Data_entry|Days_Baseline|EID|Incomplete_Reason|Complete|START_DATE|Season|Site|Study|Year'
# look the columns up on `df` itself; `group` is not defined at notebook level
cols_to_delete = df.filter(regex=f'{str_to_delete}').columns

# cols to keep: identifiers/demographics plus overall/standard/scaled scores
cols_to_keep = 'Identifiers|DX_01|Age|Sex|DX_01_Cat|_T|_Stnd|_Sum|_Score|_Scale|_Standard|_IN|_HY'

# errors='ignore': bookkeeping columns that did not match `cols_to_keep` are already
# gone after the filter, and `drop` would otherwise raise a KeyError for them
df_filter = df.filter(regex=f'{cols_to_keep}').drop(cols_to_delete, axis=1, errors='ignore')

# TODO: group `df_filter` by 'DX_01' (for name, group in df_filter.groupby('DX_01'))


In [79]:
# display the filtered score table (`df_filter` is built in the previous cell)
df_filter


Unnamed: 0,Identifiers,"NIH_Scores,NIH7_Card","NIH_Scores,NIH7_Card_P","NIH_Scores,NIH7_Comp","NIH_Scores,NIH7_Comp_P","NIH_Scores,NIH7_Flanker","NIH_Scores,NIH7_Flanker_P","NIH_Scores,NIH7_List","NIH_Scores,NIH7_List_P","NIH_Scores,NIH7_Pattern","NIH_Scores,NIH7_Pattern_P","NIH_Scores,NIH7_Picture","NIH_Scores,NIH7_Picture_P",DX_01,Age,Sex,DX_01_Cat
0,NDARAA075AMK,132.25,98.4,,,126.7,96.3,,,,,,,No Diagnosis Given,6.728040,female,No Diagnosis Given
1,NDARAA112DMH,146.00,100.0,,,135.0,99.0,109.0,73.0,123.0,94.0,,,ADHD-Combined Type,5.545744,male,Neurodevelopmental Disorders
2,NDARAA117NEJ,93.00,32.0,,,90.0,25.0,91.0,27.0,73.0,4.0,,,ADHD-Combined Type,7.475929,male,Neurodevelopmental Disorders
3,NDARAA504CRN,100.00,50.0,,,78.0,7.0,82.0,12.0,65.0,1.0,,,ADHD-Inattentive Type,9.165297,female,Neurodevelopmental Disorders
4,NDARAA536PTU,,,,,,,,,,,,,ADHD-Inattentive Type,11.998402,male,Neurodevelopmental Disorders
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2273,NDARZY668NMV,73.00,4.0,,,71.0,3.0,77.0,6.0,44.0,0.0,,,ADHD-Combined Type,11.623431,female,Neurodevelopmental Disorders
2274,NDARZZ046HJA,109.00,73.0,,,88.0,21.0,89.0,23.0,117.0,87.0,,,ADHD-Inattentive Type,9.372689,male,Neurodevelopmental Disorders
2275,NDARZZ598MH8,87.00,19.0,,,73.0,4.0,122.0,93.0,54.0,1.0,,,ADHD-Inattentive Type,8.578713,male,Neurodevelopmental Disorders
2276,NDARZZ740MLM,90.00,25.0,,,100.0,50.0,68.0,2.0,98.0,45.0,,,ADHD-Inattentive Type,6.638261,male,Neurodevelopmental Disorders


In [29]:
# get data
from hbn import io

# identifier/demographic columns merged into every features frame
list_of_cols = ['Identifiers', 'DX_01', 'Age', 'Sex', 'DX_01_Cat']

participants = make_dataset.get_participants(
                            split='all', 
                            disorders=['ADHD-Combined_Type', 
                                        'ADHD-Inattentive_Type', 
                                        'ADHD-Hyperactive_Impulsive_Type', 
                                        'Other_Specified_Attention-Deficit_Hyperactivity_Disorder',
                                        'No_Diagnosis_Given']
                                        )
# feature specs belonging to one domain
domain='Cognitive_Testing'
feature_specs = glob.glob(os.path.join(Defaults.FEATURE_DIR, f'*{domain}*'))
if not feature_specs:
    # fail here with a clear message instead of a NameError on `feature_spec`/`df` later
    raise FileNotFoundError(f'no feature specs matching *{domain}* in {Defaults.FEATURE_DIR}')

# loop over feature specs 
df_all = pd.DataFrame()
for feature_spec in feature_specs:

    # get data for `feature_spec`
    df = load_data(participants, 
                 feature_spec=feature_spec,
                 cols_to_keep=list_of_cols
                 )

    # get assessment, domain, measure names - done inside the loop (as in `get_data`)
    # so that each loaded frame is tagged with its own spec
    spec_info = io.read_json(feature_spec)
    for column, key in (('assessment', 'assessment'),
                        ('domain', 'domains'),
                        ('measure', 'measures'),
                        ('mabbrev', 'abbrevs'),
                        ('datadic', 'datadic')):
        df[column] = spec_info[key]

# NOTE(review): after the loop `df` holds only the features of the last spec in
# `feature_specs`, and `df_all` is never filled in this cell - confirm whether the
# frames were meant to be accumulated (see `get_data`)



In [7]:
# long-format scores for every feature spec matching the cognitive-testing domain
# NOTE(review): this rebinds `df`, which other cells use for the wide frame returned by `load_data`
df = get_data(domain='Cognitive_Testing')

/Users/maedbhking/Documents/healthy_brain_network/features/features-Child_Measures-Cognitive_Testing-Adaptive_Cognitive_Evaluation-ACE_Spatial_Span_B-spec.json
/Users/maedbhking/Documents/healthy_brain_network/features/features-Child_Measures-Cognitive_Testing-Wechsler_Adult_Intelligence_Scale-IV-WAIS_Abb-spec.json
/Users/maedbhking/Documents/healthy_brain_network/features/features-Child_Measures-Cognitive_Testing-Temporal_Discounting_Task-temp_disc_final-spec.json
/Users/maedbhking/Documents/healthy_brain_network/features/features-Child_Measures-Cognitive_Testing-Kaufman_Brief_Intelligence_Test-II-KBIT-spec.json
/Users/maedbhking/Documents/healthy_brain_network/features/features-Child_Measures-Cognitive_Testing-Wechsler_Individual_Achievement_Test_-_III-WIAT-spec.json
/Users/maedbhking/Documents/healthy_brain_network/features/features-Child_Measures-Cognitive_Testing-NIH_Toolbox-NIH_final-spec.json
/Users/maedbhking/Documents/healthy_brain_network/features/features-Child_Measures-Cogn