In [None]:
import os
import logging
from google.cloud import storage
# Set the credentials path for this process; the path is relative to the
# working directory the notebook is run from.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = (
    "data/key/healthdatanexus-review-d7f7ef30e829.json"
)
import pandas as pd
import numpy as np
import plotly.express as px
import warnings

In [None]:
# Load the encounter table of each split (read in the order test, train, valid).
test_encount_dat, train_encount_dat, valid_encount_dat = (
    pd.read_csv(
        f'smh-gim-1.0.0.healthdatanexus.ai/pre-processed/{split}/{split}_encounters.csv')
    for split in ('test', 'train', 'valid')
)

In [None]:
## Combine the test/train/valid encounter tables and count unique encounters
# NOTE(review): this silences *every* warning for the rest of the kernel
# session, not only the ones raised in this cell.
warnings.filterwarnings("ignore")

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps each split's original index labels.
encount_dat = pd.concat(
    [test_encount_dat, train_encount_dat, valid_encount_dat])

# The combined table itself is not de-duplicated; only the number of distinct
# ENCOUNTER_NUM values is recorded.
num_enc = len(encount_dat[['ENCOUNTER_NUM']].drop_duplicates())

# # Merge encounters with numerical values
# encount_dat_pt = encount_dat[['ENCOUNTER_NUM','PATIENT_DK']]
# num_enc = len(encount_dat_pt['ENCOUNTER_NUM'])

In [None]:
# Pairwise correlation of the encounter columns, cached to CSV for the heatmap
# cells. Numeric/boolean columns are selected explicitly so this keeps working
# on pandas >= 2.0, where DataFrame.corr() no longer drops non-numeric columns
# silently and raises instead.
encount_dat.select_dtypes(include=['number', 'bool']).corr().to_csv('encounter_corr.csv')


In [None]:
# Load the 8hr numeric time-series table of each split (test, train, valid).
test_numerical_dat, train_numerical_dat, valid_numerical_dat = (
    pd.read_csv(
        f'smh-gim-1.0.0.healthdatanexus.ai/pre-processed/{split}/{split}_numeric_timeseries_8hr.csv')
    for split in ('test', 'train', 'valid')
)

In [None]:
# Outcome column per encounter, attached to the time-series rows below.
# NOTE(review): if ENCOUNTER_NUM repeats in encount_dat, the left merge will
# fan out the matching time-series rows — confirm the key is unique.
outcomes = encount_dat[['ENCOUNTER_NUM', 'OUTCOME_ALL']]

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps the same row order (test, train, valid).
numerical_dat = pd.concat(
    [test_numerical_dat, train_numerical_dat, valid_numerical_dat])

# Number of distinct encounters that have time-series rows (taken before the merge).
num_numerical = len(numerical_dat[['ENCOUNTER_NUM']].drop_duplicates())

# Left merge: every time-series row is kept, with OUTCOME_ALL added where the
# encounter is present in `outcomes`.
numerical_dat = numerical_dat.merge(
    outcomes, on='ENCOUNTER_NUM', how='left')

In [None]:
# Pairwise correlation of the time-series columns (including the merged
# OUTCOME_ALL), cached to CSV for the heatmap cells. Numeric/boolean columns
# are selected explicitly so this keeps working on pandas >= 2.0, where
# DataFrame.corr() raises on non-numeric columns instead of dropping them.
numerical_dat.select_dtypes(include=['number', 'bool']).corr().to_csv('numerical_corr.csv')


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Read the cached correlation table once. 'Unnamed: 0' is the header pandas
# gives to the CSV's first, unnamed column; it is used here as the row labels.
numerical_corr_table = pd.read_csv('numerical_corr.csv')
numerical_corr = numerical_corr_table[['OUTCOME_ALL']]
numerical_corr_outcome_labels = numerical_corr_table['Unnamed: 0']

# Tall figure so that each row label gets its own tick.
fig, ax = plt.subplots(figsize=(10, 70))

# Single-column heatmap: correlation of every variable with OUTCOME_ALL.
sns.heatmap(
    numerical_corr,
    annot=False,
    yticklabels=numerical_corr_outcome_labels.to_numpy(),
    ax=ax,
)

In [None]:
# Keep only the row-label column and the correlation with OUTCOME_ALL.
numerical_corr = pd.read_csv('numerical_corr.csv').loc[
    :, ['Unnamed: 0', 'OUTCOME_ALL']]

In [None]:
# Read the cached correlation table once. 'Unnamed: 0' is the header pandas
# gives to the CSV's first, unnamed column; it is used here as the row labels.
encounter_corr_table = pd.read_csv('encounter_corr.csv')
encounter_corr = encounter_corr_table[['OUTCOME_ALL']]
encounter_corr_outcome_labels = encounter_corr_table['Unnamed: 0']

fig, ax = plt.subplots(figsize=(5, 6))
ax.set_title("Encounter Correlation Heatmap")

# Single-column heatmap: correlation of every encounter variable with OUTCOME_ALL.
sns.heatmap(
    encounter_corr,
    annot=False,
    yticklabels=encounter_corr_outcome_labels.to_numpy(),
    ax=ax,
)


In [None]:
# Load the demographics table of each split (read in the order test, train, valid).
test_demo_dat, train_demo_dat, valid_demo_dat = (
    pd.read_csv(
        f'smh-gim-1.0.0.healthdatanexus.ai/pre-processed/{split}/{split}_demographics.csv')
    for split in ('test', 'train', 'valid')
)


In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps the same row order (test, train, valid).
patient_dat = pd.concat([test_demo_dat, train_demo_dat, valid_demo_dat])

# Attach the patient key to each demographics row through its encounter.
patient_dat = patient_dat.merge(
    encount_dat[['ENCOUNTER_NUM', 'PATIENT_DK']], on='ENCOUNTER_NUM', how='left')

# One row per distinct (patient, value) pair, so the counts below are per
# patient rather than per encounter.
patient_housing = patient_dat[['PATIENT_DK', 'no_housing']].drop_duplicates()
patient_marital = patient_dat[['PATIENT_DK', 'marital']].drop_duplicates()

# Number of patient rows for each no_housing value.
housing = patient_housing.groupby('no_housing', as_index=False).count()

# NOTE(review): labels are assigned by position, which assumes exactly two
# no_housing values whose sorted order is (housed, unhoused) — confirm the
# encoding. Any other number of groups raises a length-mismatch ValueError.
housing['no_housing'] = ['housed', 'unhoused']
housing.columns = ['no_housing', 'Count']
housing

In [None]:
# Pie chart of the patient counts per housing status.
fig_housing = px.pie(
    housing,
    values='Count',
    names='no_housing',
    title='Patient Demographics: Housing Status',
)
# Light grey page background, title centred horizontally.
fig_housing.update_layout(paper_bgcolor="#f9f9f9", title_x=0.5)

fig_housing

In [None]:
# One row per distinct (encounter, gender) pair.
encount_gender = encount_dat[['ENCOUNTER_NUM', 'gender']].drop_duplicates()

# Number of encounter rows for each gender value; the group key stays a column.
gender = encount_gender.groupby('gender', as_index=False).count()
gender.columns = ['Gender', 'Count']

# NOTE(review): labels are assigned by position, which assumes exactly two
# gender values whose sorted order is (Male, Female) — confirm the encoding.
gender['Gender'] = ['Male', 'Female']
gender

In [None]:
# Pie chart of the encounter counts per gender.
fig_gender = px.pie(
    gender,
    values='Count',
    names='Gender',
    title='Encounters: Gender',
)
# Light grey page background, title centred horizontally.
fig_gender.update_layout(paper_bgcolor="#f9f9f9", title_x=0.5)

fig_gender

In [None]:
# Distinct (encounter, time_to_event) pairs, with the time cast to int so that
# rows are grouped on whole-number values, keeping only values below 500.
encount_entry = (
    encount_dat[['ENCOUNTER_NUM', 'time_to_event']]
    .drop_duplicates()
    .assign(time_to_event=lambda frame: frame['time_to_event'].astype(int))
    .loc[lambda frame: frame['time_to_event'] < 500]
)

# Number of encounter rows for each time_to_event value.
entry = encount_entry.groupby('time_to_event', as_index=False).count()
entry.columns = ['Time from GIM Entry to Outcome', 'Frequency']

# Convert the raw counts into percentages of the retained rows.
total_frenquency = entry['Frequency'].sum()
entry['Frequency'] = entry['Frequency'] / (total_frenquency / 100)

In [None]:
# Bar chart of the percentage of encounters per time-to-event value.
fig_entry = px.bar(
    entry,
    x="Time from GIM Entry to Outcome",
    y="Frequency",
    title='Encounters: Time from GIM Entry to Outcome',
    labels={
        "Time from GIM Entry to Outcome": "Time from GIM Entry to Outcome (Hours)",
        "Frequency": "Frequency (%)",
    },
)
# Red bars on a white page background, title centred horizontally.
fig_entry.update_traces(marker_color='red')
fig_entry.update_layout(paper_bgcolor="#ffffff", title_x=0.5)

fig_entry

In [None]:
# Load the 8hr baseline-values table of each split (test, train, valid).
test_baseline_dat, train_baseline_dat, valid_baseline_dat = (
    pd.read_csv(
        f'smh-gim-1.0.0.healthdatanexus.ai/pre-processed/{split}/{split}_baseline_values_8hr.csv')
    for split in ('test', 'train', 'valid')
)

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps the same row order (test, train, valid).
baseline_dat = pd.concat(
    [test_baseline_dat, train_baseline_dat, valid_baseline_dat])

# Distinct (encounter, lab_aneut) pairs from the baseline table followed by
# those from the time-series table. Each source is de-duplicated on its own,
# so a pair present in both sources appears twice.
lab_aneut = pd.concat([
    baseline_dat[['ENCOUNTER_NUM', 'lab_aneut']].drop_duplicates(),
    numerical_dat[['ENCOUNTER_NUM', 'lab_aneut']].drop_duplicates(),
])

# Round to two decimals so values fall into discrete bins, then keep only
# values below 0.6.
# NOTE(review): the 0.6 cut-off suggests the column is scaled — confirm.
lab_aneut['lab_aneut'] = lab_aneut['lab_aneut'].round(2)
lab_aneut = lab_aneut[lab_aneut['lab_aneut'] < 0.6]

# Number of rows for each rounded value.
aneut = lab_aneut.groupby('lab_aneut', as_index=False).count()
aneut.columns = ['Absolute Neutrophils', 'Frequency']

# Convert the raw counts into percentages of the retained rows.
total_frenquency = aneut['Frequency'].sum()
aneut['Frequency'] = aneut['Frequency'].div(total_frenquency / 100)

In [None]:
# Bar chart of the percentage of rows per rounded absolute-neutrophil value.
# NOTE(review): this rebinds `fig_entry`, replacing the figure an earlier cell
# stored under the same name.
fig_entry = px.bar(
    aneut,
    x="Absolute Neutrophils",
    y="Frequency",
    title="Labs: Absolute Neutrophils",
    labels={
        "Absolute Neutrophils": "Absolute Neutrophils",
        "Frequency": "Frequency (%)",
    },
)
# Red bars on a white page background, title centred horizontally.
fig_entry.update_traces(marker_color='red')
fig_entry.update_layout(paper_bgcolor="#ffffff", title_x=0.5)

fig_entry

In [None]:
# Distinct (encounter, lab_rdw) pairs from the baseline table followed by those
# from the time-series table. DataFrame.append was removed in pandas 2.0, so
# the two parts are combined with pd.concat. Each source is de-duplicated on
# its own, so a pair present in both sources appears twice.
lab_rdw = pd.concat([
    baseline_dat[['ENCOUNTER_NUM', 'lab_rdw']].drop_duplicates(),
    numerical_dat[['ENCOUNTER_NUM', 'lab_rdw']].drop_duplicates(),
])

# Round to two decimals so values fall into discrete bins, then keep only
# values below 0.6.
# NOTE(review): the 0.6 cut-off suggests the column is scaled — confirm.
lab_rdw['lab_rdw'] = lab_rdw['lab_rdw'].round(2)
lab_rdw = lab_rdw[lab_rdw['lab_rdw'] < 0.6]

# Number of rows for each rounded value.
rdw = lab_rdw.groupby('lab_rdw', as_index=False).count()
rdw.columns = ['Red Cell Distribution Width', 'Frequency']

# Convert the raw counts into percentages of the retained rows.
total_frenquency = rdw['Frequency'].sum()
rdw['Frequency'] = rdw['Frequency'].div(total_frenquency / 100)

In [None]:
# Bar chart of the percentage of rows per rounded red-cell-distribution-width value.
# NOTE(review): this rebinds `fig_entry`, replacing the figure an earlier cell
# stored under the same name.
fig_entry = px.bar(
    rdw,
    x="Red Cell Distribution Width",
    y="Frequency",
    title="Labs: Red Cell Distribution Width",
    labels={
        "Red Cell Distribution Width": "Red Cell Distribution Width",
        "Frequency": "Frequency (%)",
    },
)
# Red bars on a white page background, title centred horizontally.
fig_entry.update_traces(marker_color='red')
fig_entry.update_layout(paper_bgcolor="#ffffff", title_x=0.5)

fig_entry