# Main script - health data

## Capstone project


Author: Cornelia Ilin

Email: cilin@wisc.edu

Date created: Oct 14, 2022

### Step 1: Import packages

In [1]:
import pandas as pd
import numpy as np
import os

import warnings
warnings.filterwarnings("ignore")

### Step 2: Define working directories

In [2]:
# Project data directory: keys.csv and data_selection.xlsx are read from here.
in_dir = 'C:/Users/cilin/Research/CA_hospitals_capstone/data/'
# Health data directory: read_health_data() scans it for the PDD/EDD files.
in_dir_h = 'C:/Users/cilin/Research/CA_hospitals/Input/final_data/health/'

# Output directory: the per-disease visit-count CSV files are written here.
out_dir = 'C:/Users/cilin/Research/CA_hospitals_capstone/output/'

### Step 3: Define functions

In [3]:
def read_health_data():
    """Load the PDD and EDD health files found in ``in_dir_h``.

    Only files named 'PDD_final.csv' or 'EDD_final.csv' are read; every other
    entry in the directory is ignored.

    # return: a dict mapping the file-name prefix before the first underscore
    #         ('PDD' / 'EDD') to the DataFrame read from that file
    """
    wanted = ('PDD_final.csv', 'EDD_final.csv')
    frames = {}
    for fname in os.listdir(in_dir_h):
        if fname not in wanted:
            continue
        prefix = fname.partition('_')[0]
        print('Reading:', fname)
        frames[prefix] = pd.read_csv(in_dir_h + fname)

    return frames

In [4]:
def add_outcome(df):
    """Add a string-valued ``outcome`` column to ``df`` (modified in place).

    The column is set to:
      * '1'      if ``diag00`` or ``diag01`` starts with any of the codes returned
                 by ``add_outcome_helper()`` (per that helper's docstring:
                 respiratory/circulatory diagnoses);
      * '0'      otherwise, if ``diag00`` starts with "8", "9", "S" or "T"
                 (described by the original author as injury/poisoning);
      * 'others' in every remaining case.

    The values are strings because NumPy promotes the mixed 1 / 0 / 'others'
    choices to a common string dtype; downstream code compares against
    '0' and '1'.

    # param df: DataFrame with string columns ``diag00`` and ``diag01``
    # return: the same DataFrame, with the ``outcome`` column added
    """
    outcome_is_1 = add_outcome_helper()
    injury_prefixes = ("8", "9", "S", "T")

    # na=False: a missing diagnosis must count as "no match". Without it the
    # NaN produced by str.startswith is truthy inside np.where, so a row with
    # a missing diag00 would be labelled '0' (injury) instead of 'others'.
    matches_outcome = (
        df.diag00.str.startswith(outcome_is_1, na=False)
        | df.diag01.str.startswith(outcome_is_1, na=False)
    )
    is_injury = df.diag00.str.startswith(injury_prefixes, na=False)

    df["outcome"] = np.where(matches_outcome, 1, np.where(is_injury, 0, 'others'))

    return df

In [5]:
def add_outcome_helper():
    """Build the ICD-9/ICD-10 code prefixes for which the outcome variable == 1.

    Reads sheet 'Diag_codes' of 'data_selection.xlsx' in ``in_dir`` (skipping
    the first two rows), keeps the rows flagged with 1 in the column
    "Use to define outcome variable [behrt]", and expands each code range
    into its individual codes:

      * ICD-9-CM cells are read as "<3 digits>-<3 digits>" (characters 0-2 and
        4-6); cells equal to -1 are skipped. Every integer in the range is
        emitted as a string.
      * ICD-10-CM cells are read as "<letter><2 digits>-<letter><2 digits>"
        (letter at position 0, digits at 1-2 and 5-6). The cell
        "J00-J06, J20-J22" is the only one whose second range is also expanded.
        Codes are emitted as letter + two-digit number (zero-padded below 10).

    # return: a tuple of code strings, ICD-9 codes first, then ICD-10 codes
    """
    # read icd codes selection and keep only the rows used in the analysis
    selection = pd.read_excel(
        in_dir + 'data_selection.xlsx',
        'Diag_codes', skiprows = 2, header = 0
    )
    selection = selection[selection["Use to define outcome variable [behrt]"] == 1]

    ## Step1: expand the ICD-9-CM ranges
    icd9_ranges = [
        np.arange(int(entry[0:3]), int(entry[4:7]) + 1, 1)
        for entry in selection["ICD-9-CM"].values.tolist()
        if not entry == -1
    ]
    icd9_codes = tuple(str(code) for span in icd9_ranges for code in span)

    ## Step2: expand the ICD-10-CM ranges
    cleaned = [entry.lstrip() for entry in selection["ICD-10-CM"].values.tolist()]

    icd10_codes = []
    for entry in cleaned:
        # (start offset, stop offset, letter offset) of each range in the cell
        layouts = [(1, 5, 0)]
        if entry == "J00-J06, J20-J22":
            layouts.append((10, 14, 9))

        for start_at, stop_at, letter_at in layouts:
            first = int(entry[start_at:start_at + 2])
            last = int(entry[stop_at:stop_at + 2])
            letter = entry[letter_at]
            for number in range(first, last + 1):
                # add a 0 to numbers that have only one character
                padding = "0" if number < 10 else ""
                icd10_codes.append(letter + padding + str(number))

    ## Step3: combine Step1 and Step2
    return icd9_codes + tuple(icd10_codes)

### Step 4: Read data

``school zip-year-month keys``

In [None]:
# load the keys table from in_dir and preview its first rows
df = pd.read_csv(in_dir + 'keys.csv')
df.head()

``health data``

In [None]:
# load the health files into a dict of DataFrames and list the keys that were found
dict_h = read_health_data()
print('Keys in data:', dict_h.keys())

### Step 5: Preprocess data

``school zip-year-month keys``

In [None]:
# create ziph_year_month
# derive zips, year and month (without a leading zero) from the keys table
df['zips'] = df.school_zips.values
df['year'] = df.year_month.str[:4]
month_text = df.year_month.str[5:].astype(str)
df['month'] = np.where(month_text.str.startswith('0'), month_text.str[1:], month_text)
df.head(20)

In [None]:
# create ziph_year_month
# create ziph_year_month
df['school_zips'] = np.where(df.school_zips.isna(), 'nan', df.school_zips)
df['year'] = df.year_month.str[:4]
df['month'] = df.year_month.str[5:]
# drop the leading zero from the month ('01' -> '1')
df['month'] = np.where(df.month.str.startswith('0'), df.month.str[1:], df.month)
df['year_month'] = df.year + '-' + df.month
df['year_month'] = np.where(df.year_month.isna(), 'nan', df.year_month)

zips = df.school_zips.values
year_month = df.year_month.values

# Cross every zip with every year-month. De-duplicate each column first
# (dict.fromkeys keeps first-seen order): looping over the raw columns builds
# an n x n list full of repeated keys, while the set of distinct keys, which
# is all the later isin() filter uses, is exactly the same.
unique_zips = [z for z in dict.fromkeys(zips) if z != 'nan']
unique_year_month = [val for val in dict.fromkeys(year_month) if val != 'nan']
ziph_year_month = [
    str(z) + '-' + str(val)
    for z in unique_zips
    for val in unique_year_month
]
print('Example data points:')
ziph_year_month[:3]

In [None]:
# pull out icd codes of interest
# pull out icd codes of interest
# regex=False: the dot must be removed as a literal character. Relying on the
# default is fragile because the regex default differs between pandas versions
# and '.' is the regex wildcard.
df['icd_code_2'] = df.icd_code.str.replace('.', '', regex=False)

# add to dictionary: disease group name -> list of its dot-free ICD codes
diseases = {}
for val in ['hematopoietic_cancers', 'type_1_diabetes', 'pediatric_vasculitis']:
    diseases[val] = list(df[df.disease_grp_name.eq(val)].icd_code_2.values)


In [None]:
# inspect the ICD codes collected for type 1 diabetes
diseases['type_1_diabetes']

``health data``

In [None]:
# print shapes
# show (rows, columns) of each health table
for key in ('PDD', 'EDD'):
    print(key)
    table_shape = dict_h[key].shape
    display(table_shape)
    print('---')

In [None]:
# transform cols to strings
# transform cols to strings
for key in ['PDD', 'EDD']:
    for col in ['bthyear', 'ZCTA10P', 'patzip', 'admtyear', 'admtmonth', 'diag00', 'diag01']:
        dict_h[key][col] = dict_h[key][col].astype(str)

    # Strip only a trailing ".0" (left over when a float column is cast to str).
    # The pattern is anchored and escaped on purpose: an unescaped '.0' treated
    # as a regex matches ANY character followed by 0 (turning '90001' into '1'),
    # and as a literal it would also remove a '.0' in the middle of a value.
    for col in ['patzip', 'ZCTA10P', 'admtyear', 'admtmonth']:
        dict_h[key][col] = dict_h[key][col].str.replace(r'\.0$', '', regex=True)

In [None]:
# create key for ZCTA10P_year_month
# create the patzip_year_month and ZCTA10P_year_month keys
for key in ['PDD', 'EDD']:
    frame = dict_h[key]
    # "-<first 4 chars of admtdate>-<admtmonth>", shared by both keys
    admit_suffix = '-' + frame.admtdate.str[:4] + '-' + frame.admtmonth
    frame['patzip_year_month'] = frame.patzip + admit_suffix
    frame['ZCTA10P_year_month'] = frame.ZCTA10P + admit_suffix

In [None]:
# keep only patients born after 2000
# keep only patients born in 2000 or later
for key in ['PDD', 'EDD']:
    # Compare numerically. As strings, 'nan' >= '2000' is True, so rows with a
    # missing birth year would be kept. errors='coerce' turns anything
    # non-numeric into NaN, which fails the >= test and is dropped.
    birth_year = pd.to_numeric(dict_h[key].bthyear, errors='coerce')
    dict_h[key] = dict_h[key][birth_year.ge(2000)]
    # print shapes
    print(key)
    display(dict_h[key].shape)
    print('---')

In [None]:
# keep only if key in schools data
# keep only the visits whose patzip_year_month key appears in ziph_year_month
for key in ['PDD', 'EDD']:
    in_schools = dict_h[key].patzip_year_month.isin(ziph_year_month)
    dict_h[key] = dict_h[key].loc[in_schools]
    # report the shape after filtering
    print(key)
    remaining_shape = dict_h[key].shape
    display(remaining_shape)
    print('---')

In [None]:
# concatenate PDD end EDD
# concatenate PDD and EDD (rows stacked, PDD first)
df_all = pd.concat([dict_h[name] for name in ('PDD', 'EDD')], axis=0)
print('Shape of concat data')
display(df_all.shape)
df_all.head()

In [None]:
# add resp/cardio and injury/accidents disease codes
# add the `outcome` column (resp/cardio vs. injury/accidents vs. others) and preview
df_all = add_outcome(df_all)
df_all.head()

In [None]:
# list the distinct outcome labels present in the data
df_all.outcome.unique()

### Step 6: Compute admission numbers for each disease

In [None]:
# create dummy if visit is for given disease
# create a 0/1 dummy per disease: 1 if diag00 or diag01 starts with one of its codes
for key, codes in diseases.items():
    print('create dummies for: ', key)
    prefixes = tuple(codes)
    has_code = df_all.diag00.str.startswith(prefixes) | df_all.diag01.str.startswith(prefixes)
    df_all[key] = np.where(has_code, 1, 0)

# create 0/1 dummies from the outcome labels: '0' -> injuries_accidents, '1' -> resp_cardio
for idx, key in enumerate(('injuries_accidents', 'resp_cardio')):
    print('create dummies for: ', key)
    label = str(idx)
    df_all[key] = np.where(df_all.outcome.eq(label), 1, 0)

df_all.head()

In [None]:
# summary statistics of the resp_cardio dummy
df_all.resp_cardio.describe()

In [None]:
%matplotlib inline
import seaborn as sns
colors =  sns.color_palette("rocket_r", 5)

# compute visits by patzip_year_month
for idx, key in enumerate(list(diseases.keys()) + ['injuries_accidents', 'resp_cardio']):
    print(key)
    temp = df_all.groupby(['patzip_year_month', key], as_index=False).ZCTA10P.count()
    temp.rename(columns={'ZCTA10P': 'number_of_visits'}, inplace=True)
    # disturn reality (Add noise)
    if idx in [0, 1, 2]:
        temp['number_of_visits'] = temp.number_of_visits + 16
    else:
        temp['number_of_visits'] = temp.number_of_visits + 111
    temp = temp[temp[key].eq(1)]
    temp.reset_index(drop=True, inplace=True)
    temp.to_csv(out_dir + key + '.csv')
    temp.number_of_visits.hist(color=colors[idx], alpha=0.4)