In [1]:
# Inject a CSS rule that sets the notebook's `.container` element to 90% width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

- Histogram of how many patients have multiple visits
- Characterize patients' ages, gender, and perhaps race/ethnicity

### Importing useful packages/modules

In [2]:
import numpy as np
import pandas as pd
import sys
import seaborn as sns
import matplotlib.pyplot as plt
import math
from scipy import stats
from scipy.stats import mannwhitneyu, spearmanr
from statsmodels.sandbox.stats.multicomp import multipletests

In [3]:
# Project-local modules are not installed as packages: each source directory is
# appended to sys.path (relative to the notebook's working directory) right before
# the modules that live in it are imported.
sys.path.append('./../flm/src/')
import flm_tools

sys.path.append('./../tst/src/')
import utils
import feature_conversion

sys.path.append('./../general/src')
from access_script_data import patients

### Loading and cleaning tables for downstream analysis

In [4]:
# Load the cleaned cohort table through the project helper.
# NOTE(review): `outcome_encoding='three'` presumably selects the three-level
# discharge outcome (codes 0/1/2) that later cells filter on — confirm in flm_tools.
clean_all_table = flm_tools.get_cleaned_data(outcome_encoding='three')

In [5]:
# Names of the measurement-level columns that are dropped from `clean_all_table`
# in the next step, before de-duplicating rows.
measurement_columns = ['patient_ir_id', 'cohort_patient_id', 'measurement_id',
       'measurement_concept_id', 'measurement_concept_name',
       'measurement_vocabulary_id', 'measurement_concept_class_id',
       'measurement_concept_code', 'measurement_date', 'measurement_time',
       'measurement_datetime', 'measurement_type_concept_id',
       'measurement_type_concept_name', 'measurement_type_vocabulary_id',
       'measurement_type_concept_class_id', 'measurement_type_concept_code',
       'operator_concept_id', 'operator_concept_name',
       'operator_vocabulary_id', 'operator_concept_class_id',
       'operator_concept_code', 'value_as_number', 'value_as_concept_id',
       'value_as_concept_name', 'value_as_vocabulary_id',
       'value_as_concept_class_id', 'value_as_concept_code', 'unit_concept_id',
       'unit_concept_name', 'unit_vocabulary_id', 'unit_concept_class_id',
       'unit_concept_code', 'range_low', 'range_high', 'provider_id',
       'visit_occurrence_id', 'visit_detail_id', 'measurement_source_value',
       'measurement_source_concept_id', 'measurement_source_concept_name',
       'measurement_source_vocabulary_id',
       'measurement_source_concept_class_id',
       'measurement_source_concept_code', 'unit_source_value',
       'value_source_value', 'meta_orignl_load_dts']

In [6]:
# Strip the measurement-level columns, then collapse the rows that become
# identical once those columns are gone.
clean_all_table = (
    clean_all_table
    .drop(columns=measurement_columns)
    .drop_duplicates()
)

In [7]:
# Fetch the pneumonia-episode assessment columns from the latest table revision.
diagnosis_columns = ['case_number', 'clin_cap_viral_npop', 'pneu_assess_dt', 'pt_category']
diagnosis = patients.modified_edw_rc('pneumonia_episode_category_assessment',
                                     revision='latest',
                                     columns=diagnosis_columns)

# Shorten the category names (project helper), discard rows without a category,
# pool HAP and VAP into a single 'HAP/VAP' level, and keep only the last row
# recorded for each case.
pooled_categories = {'VAP': 'HAP/VAP',
                     'HAP': 'HAP/VAP'}
diagnosis = (
    feature_conversion.shorten_pt_category(diagnosis)
    .dropna(subset=['pt_category'])
    .assign(pt_category=lambda frame: frame['pt_category'].replace(pooled_categories))
    .drop_duplicates(subset=['case_number'], keep='last')
)

# No join keys are given, so pandas joins (inner) on every column name the two
# frames have in common.
clean_all_table = clean_all_table.merge(diagnosis)

#### How many ICU cases per patient

In [8]:
# Rows whose outcome code is 0 form the "expired" group; every other row is
# treated as "survived".
is_expired = clean_all_table['discharge_disposition_name_conv'].eq(0)
expired_pt = clean_all_table[is_expired]
survived_pt = clean_all_table[~is_expired]

In [9]:
# Display the rows of the expired group for a visual check.
expired_pt

Unnamed: 0,case_number,discharge_disposition_name_conv,gender_concept_name,race_concept_name,ethnicity_concept_name,admission_datetime,discharge_datetime,hospital_los_days,pt_age,clin_cap_viral_npop,pneu_assess_dt,pt_category
2,1184,0,Female,White,Not Hispanic or Latino,2020-01-12 18:31:00,2020-01-30 19:00:00,18.0,46,,2020-01-17,HAP/VAP
4,1131,0,Male,White,Not Hispanic or Latino,2019-09-03 21:55:00,2019-09-18 13:44:00,15.0,49,,2019-09-04,HAP/VAP
6,1304,0,Male,White,Not Hispanic or Latino,2020-05-06 21:09:00,2020-07-29 11:09:00,84.0,36,,2020-06-15,HAP/VAP
10,1002,0,Female,White,Not Hispanic or Latino,2018-06-21 17:07:00,2018-07-21 10:30:00,30.0,64,,2018-06-26,HAP/VAP
26,1003,0,Male,White,Hispanic or Latino,2018-06-24 09:52:00,2018-07-11 23:57:00,17.0,75,,2018-06-30,HAP/VAP
...,...,...,...,...,...,...,...,...,...,...,...,...
299,1329,0,Male,Asian,Not Hispanic or Latino,2020-06-17 03:16:00,2020-08-18 11:30:00,62.0,51,,2020-07-17,HAP/VAP
302,1365,0,Female,Black or African American,Not Hispanic or Latino,2020-08-21 04:50:00,2020-09-04 15:24:00,14.0,74,,2020-08-24,HAP/VAP
303,1377,0,Male,White,Not Hispanic or Latino,2020-09-02 10:20:00,2020-10-13 20:38:00,41.0,62,,2020-09-08,HAP/VAP
304,1378,0,Female,White,Not Hispanic or Latino,2020-09-04 15:51:00,2020-10-06 16:28:00,32.0,78,,2020-09-09,HAP/VAP


In [None]:
# Summary statistics of the numeric columns for the survived group.
survived_pt.describe()

#### Ages of patients

#### Is there an age difference between patients who die, and those who survive?

In [None]:
# Rebind expired_pt / survived_pt to plain lists of patient ages
# (outcome code 0 = expired, anything else = survived).
died = clean_all_table['discharge_disposition_name_conv'] == 0
patient_ages = clean_all_table['pt_age']
expired_pt = list(patient_ages[died])
survived_pt = list(patient_ages[~died])

In [None]:
# Histogram of hospital length of stay (LOS) for the whole cohort, with the
# median (red) and the 25th/75th percentiles (blue) marked as vertical lines.
los_days = clean_all_table['hospital_los_days']
los_median = np.median(los_days)
los_q1, los_q3 = np.percentile(los_days, [25, 75])

fig, ax = plt.subplots(1,1, figsize=(8,8), constrained_layout=True)
sns.distplot(los_days, bins='fd', kde=False, ax=ax)
ax.set_xlabel('Length of Stay (LOS) (days)',fontsize=22)
ax.set_ylabel('Patient count',fontsize=22)
# LOS is measured in days (see the x-axis label); the legend entries previously
# reported the median and IQR in "years".
# The marker lines span y = 0..55, a hard-coded height chosen for this cohort.
ax.vlines(los_median, 0, 55, color='r', label=f"Median: {los_median:.0f} days")
# Both quartile lines are drawn in one call so they share a single legend entry.
ax.vlines([los_q1, los_q3], 0, 55, color='b', label=f"IQR: {los_q3 - los_q1:.0f} days")
ax.grid(linestyle=':')
ax.tick_params(axis='x', labelsize=22)
ax.tick_params(axis='y', labelsize=22)
ax.legend(loc='best', fontsize=15)
# plt.savefig("los_distribution_210602.png")
plt.show()

In [None]:
# ax1 = sns.boxplot(x="discharge_disposition_name_conv", y="pt_age", data=clean_all_table)
# ax1 = sns.swarmplot(x="discharge_disposition_name_conv", y="pt_age", data=clean_all_table, color=".2")

In [None]:
# Build one curve per outcome group from the age lists via the project helper.
# NOTE(review): the return value is indexed as [0] (x) and [1] (y) by the step
# plot that follows; its exact definition lives in flm_tools — confirm there.
cdf_expired = flm_tools.get_survival_fn(expired_pt)
cdf_survived = flm_tools.get_survival_fn(survived_pt)

In [None]:
# Step curves of the age survival functions, one per binary outcome group.
fig1, eje = plt.subplots(1, 1, figsize=(8,8), constrained_layout=True)

binary_curves = ((cdf_expired, 'r', 'Expired'),
                 (cdf_survived, 'limegreen', "Survived"))
for curve, curve_color, curve_label in binary_curves:
    eje.step(curve[0], curve[1], color=curve_color, label=curve_label)

eje.set_xlabel('Patient age at study enrollment (years)', fontsize=20)
eje.set_ylabel('Fraction of patients remaining', fontsize=20)
eje.tick_params(axis='both', labelsize=20)
eje.grid(linestyle=':')
eje.legend(loc='best', fontsize=15)
# plt.savefig("age_CDF_210602.png")
plt.show()

In [None]:
# Mann-Whitney U test comparing the age distributions of expired vs. surviving
# patients. The alternative is stated explicitly: the default of `alternative`
# differs between SciPy versions (older releases do not default to a two-sided
# test), so relying on it makes the reported p-value version-dependent.
stat, pvalue = stats.mannwhitneyu(expired_pt, survived_pt, alternative='two-sided')

summary = {'Statistics': ['test statistic','p-value'],
           'Age difference': [stat,pvalue]}

pd.DataFrame(summary)

#### And what about ternary outcomes?

In [None]:
# One boolean mask per discharge outcome code (0, 1, 2), followed by the
# matching list of patient ages for each outcome.
outcome_code = clean_all_table['discharge_disposition_name_conv']
exp, trans, home = (outcome_code == code for code in (0, 1, 2))
expired_pt, trans_pt, home_pt = (list(clean_all_table.loc[mask, 'pt_age'])
                                 for mask in (exp, trans, home))

In [None]:
# Overlaid age histograms, one per discharge outcome.
fig2, ax1 = plt.subplots(1,1, figsize=(8,7))

age_groups = ((expired_pt, 'red', 'Expired'),
              (trans_pt, 'orange', 'Transferred'),
              (home_pt, 'green', 'Sent Home'))
for group_ages, group_color, group_label in age_groups:
    sns.distplot(group_ages, bins='fd', kde=False, color=group_color, ax=ax1, label=group_label)

ax1.set_xlabel('Patient age at study enrollment (years)',fontsize=15)
ax1.set_ylabel('# of patients',fontsize=15)
ax1.grid(linestyle=':')
ax1.tick_params(axis='both', labelsize=15)
# The legend is intentionally left disabled, as in the original cell.
# ax1.legend(loc='best', fontsize=12)
# plt.savefig("age_distribution_ternary_210602.png")
plt.show()

In [None]:
# Build one curve per discharge outcome from the age lists via the project helper.
# NOTE(review): the return value is indexed as [0] (x) and [1] (y) by the step
# plot that follows; its exact definition lives in flm_tools — confirm there.
cdf_expired = flm_tools.get_survival_fn(expired_pt)
cdf_trans = flm_tools.get_survival_fn(trans_pt)
cdf_home = flm_tools.get_survival_fn(home_pt)

In [None]:
# Step curves of the age survival functions, one per discharge outcome.
fig3, eje1 = plt.subplots(1, 1, figsize=(8,8), constrained_layout=True)

ternary_curves = ((cdf_expired, 'red', 'Expired'),
                  (cdf_trans, 'orange', 'Transferred'),
                  (cdf_home, 'green', "Sent home"))
for curve, curve_color, curve_label in ternary_curves:
    eje1.step(curve[0], curve[1], color=curve_color, label=curve_label)

eje1.set_xlabel('Patient age at study enrollment (years)', fontsize=20)
eje1.set_ylabel('Fraction of patients remaining', fontsize=20)
eje1.tick_params(axis='both', labelsize=20)
eje1.grid(linestyle=':')
eje1.legend(loc='best', fontsize=15)
# plt.savefig("age_CDF_ternary_210602.png")
plt.show()

In [None]:
# Kruskal-Wallis H-test across the three outcome groups' ages, shown as a
# two-row table (test statistic, p-value).
age_samples = [expired_pt, trans_pt, home_pt]
stat, pvaluek = stats.kruskal(*age_samples)

summary = {'Statistics': ['Test_statistic', 'p-value']}
summary['Age CDFs'] = [stat, pvaluek]

pd.DataFrame(summary)

In [None]:
import scikit_posthocs as skp

# Pairwise Mann-Whitney post-hoc comparison of age between outcome groups,
# with Bonferroni-adjusted p-values.
age_by_outcome = clean_all_table[['discharge_disposition_name_conv', 'pt_age']]
skp.posthoc_mannwhitney(
    age_by_outcome,
    val_col='pt_age',
    group_col='discharge_disposition_name_conv',
    p_adjust='bonferroni',
)

#### Binary Gender

In [None]:
# Cohort breakdown by recorded sex: bar height is the fraction of patients, and
# the number written inside each bar is the corresponding patient count.
sex_counts = clean_all_table['gender_concept_name'].value_counts()

conteo = sex_counts.values
sex = sex_counts.index
n_patients = sum(conteo)

df1 = pd.DataFrame({"Sex": sex, "counts": conteo, "fraction": conteo/n_patients})

plt.figure(figsize=(8, 8), constrained_layout=True)
splot = sns.barplot(x="Sex", y="fraction", data=df1, color='#2CBDFE')
splot.set(xlabel=None, ylabel=None)

for bar in splot.patches:
    bar_top = bar.get_height()
    # Convert the plotted fraction back to a count for the annotation text.
    splot.annotate(f"{bar_top*n_patients:.0f}",
                   xy=(bar.get_x() + bar.get_width() / 2., bar_top),
                   xytext=(0, -12),
                   textcoords='offset points',
                   ha='center', va='center', size=20)

plt.ylabel('Fraction of patients', size=22)
plt.tick_params(axis='both', labelsize=22)
plt.grid(linestyle=':', axis='y')
# plt.savefig("patients_by_sex_210602.png")

In [None]:
# Recorded sex of each patient, split by binary outcome (code 0 = expired),
# plus the per-group tallies.
died_mask = clean_all_table['discharge_disposition_name_conv'] == 0
gender = clean_all_table['gender_concept_name']

expired, survived = gender[died_mask], gender[~died_mask]
e, s = expired.value_counts(), survived.value_counts()

In [None]:
# Bootstrap the male fraction within each binary outcome group.
# A seeded generator is used so the resampled means and intervals are identical
# on every run (the resampling was previously unseeded).
n_boot = 10000
rng = np.random.default_rng(20210602)

# 1 = male, 0 = otherwise, one entry per patient in the group.
male_expired = list((expired == 'Male').astype(int))
boot_male_expired = rng.choice(male_expired, size=(n_boot, len(male_expired)), replace=True)

male_survived = list((survived == 'Male').astype(int))
boot_male_survived = rng.choice(male_survived, size=(n_boot, len(male_survived)), replace=True)

# One male fraction per bootstrap replicate; computed once and reused below.
boot_male_fractions = [boot_male_expired.mean(axis=1),
                       boot_male_survived.mean(axis=1)]

male_means = [np.mean(fractions) for fractions in boot_male_fractions]

# Error-bar lengths for matplotlib's `yerr`: row 0 is the distance from the mean
# down to the 0.5th percentile, row 1 the distance up to the 99.5th percentile.
male_CIs = np.array([[mean - np.percentile(fractions, 0.5)
                      for mean, fractions in zip(male_means, boot_male_fractions)],
                     [np.percentile(fractions, 99.5) - mean
                      for mean, fractions in zip(male_means, boot_male_fractions)]])

In [None]:
# Stacked bars of the male/female share within each binary outcome group.
labels_binary = ['Expired', 'Survived']

# value_counts() orders its result by frequency, so positions 0 and 1 are not
# guaranteed to be Male and Female (and may differ between the two groups).
# Counts are therefore looked up by label.
male_counts = [e.get('Male', 0), s.get('Male', 0)]
female_counts = [e.get('Female', 0), s.get('Female', 0)]

share_men_binary = [men/(men+women) for men, women in zip(male_counts, female_counts)]
share_women_binary = [women/(men+women) for men, women in zip(male_counts, female_counts)]

fig4, ax2 = plt.subplots(1,1,figsize=(8,8))
ax2.bar(labels_binary, share_men_binary, width=0.35, label='Male')
# The female share is stacked on top of the male share.
ax2.bar(labels_binary, share_women_binary, width=0.35, bottom=share_men_binary, label='Female')
ax2.tick_params(axis='x', labelsize=20)
ax2.tick_params(axis='y', labelsize=20)
ax2.legend(loc='best', fontsize=12)
ax2.grid(linestyle=':', axis='y')
ax2.set_ylabel('Sex fraction', size=20)
plt.axis((-0.5, 1.5, 0.0, 1.15))
# plt.savefig("sex_proportions_210602.png")

In [None]:
# Bootstrap mean male fraction per binary outcome, with bootstrap percentile
# error bars and the cohort-wide male fraction as a dashed reference line.
fig5, ax3 = plt.subplots(1, 1, figsize=(8,8), constrained_layout=True)

ax3.bar(labels_binary, male_means, yerr=male_CIs, color=['r','limegreen'])

# df1 is ordered by frequency, so row 0 is the most common sex rather than
# necessarily 'Male'; select the male row by label.
cohort_male_fraction = df1.loc[df1['Sex'] == 'Male', 'fraction'].iloc[0]
x_left, x_right = ax3.get_xlim()
ax3.hlines(cohort_male_fraction, x_left, x_right, color='indigo', linestyles='dashed',
           label=f"Cohort's male fraction ({cohort_male_fraction:.3f})")
ax3.tick_params(axis='x', labelsize=20)
ax3.tick_params(axis='y', labelsize=20)
ax3.set_ylabel('Fraction male', size=20)
ax3.legend(loc='upper left', fontsize=15)

# plt.savefig('asdfasd1.png')

In [None]:
from scipy.stats import chi2_contingency

# 2x2 contingency table: rows are Male / Female, columns are survived / expired.
# Counts are looked up by label because value_counts() orders by frequency, so
# position 0 can be a different sex in `s` than in `e`, which would mix the rows.
gender_by_outcome = np.array([[s.get('Male', 0), e.get('Male', 0)],
                              [s.get('Female', 0), e.get('Female', 0)]])
chi2, pvalue, dof, ex = chi2_contingency(gender_by_outcome)
print(chi2, pvalue, dof)

And by three outcomes

In [None]:
# Recorded sex of each patient split by the three discharge outcome codes
# (0, 1, 2), plus the per-group tallies.
outcome_code = clean_all_table['discharge_disposition_name_conv']
exp, trans, hom = outcome_code == 0, outcome_code == 1, outcome_code == 2

gender = clean_all_table['gender_concept_name']
expired, transferred, home = gender[exp], gender[trans], gender[hom]

e, t, h = (group.value_counts() for group in (expired, transferred, home))

In [None]:
# Tick labels for the three discharge outcomes, in the order expired,
# transferred, sent home.
labels_ternary = ['Expired', 'Transferred', 'Sent Home']
# share_men_ternary = [e[0]/(e[0]+e[1]), t[0]/(t[0]+t[1]), h[0]/(h[0]+h[1])]
# share_women_ternary = [e[1]/(e[0]+e[1]), t[1]/(t[0]+t[1]), h[1]/(h[0]+h[1])]

# fig5, ax3 = plt.subplots(1,1,figsize=(8,8))
# ax3.bar(labels_ternary, share_men_ternary, width=0.35, label='Male')
# ax3.bar(labels_ternary, share_women_ternary, width=0.35, bottom=share_men_ternary, label='Female')
# ax3.tick_params(axis='x', labelsize=20)
# ax3.tick_params(axis='y', labelsize=20)
# ax3.legend(loc='best', fontsize=12)
# ax3.grid(linestyle=':', axis='y')
# ax3.set_ylabel('Sex fraction', size=20)
# plt.axis((-0.5, 2.5, 0.0, 1.15))
# plt.show()
# # plt.savefig("sex_proportions_ternary_210602.png")

In [None]:
# fig6, axes = plt.subplots(ncols=2, nrows=1, figsize=(12,6), constrained_layout=True)
# axes[0].bar(labels_binary, share_men_binary, width=0.35, label='Male')
# axes[0].bar(labels_binary, share_women_binary, width=0.35, bottom=share_men_binary, label='Female')
# axes[0].tick_params(axis='x', labelsize=20)
# axes[0].tick_params(axis='y', labelsize=20)
# axes[0].grid(linestyle=':', axis='y')
# axes[0].set_ylabel('Sex fraction', size=20)
# axes[0].axis((-0.5, 1.5, 0.0, 1.15))
# axes[0].text(0.01, 0.98, 'a', transform=axes[0].transAxes,
#       fontsize=23, va='top')

# axes[1].bar(labels_ternary, share_men_ternary, width=0.35, label='Male')
# axes[1].bar(labels_ternary, share_women_ternary, width=0.35, bottom=share_men_ternary, label='Female')
# axes[1].tick_params(axis='x', labelsize=20)
# axes[1].tick_params(left=False, labelleft=False)
# axes[1].legend(loc='best', fontsize=13)
# axes[1].grid(linestyle=':', axis='y')
# axes[1].axis((-0.5, 2.5, 0.0, 1.15))
# axes[1].text(0.01, 0.98, 'b', transform=axes[1].transAxes,
#       fontsize=23, va='top')

# # plt.savefig("sex_proportions_210602.png")

In [None]:
# from scipy.stats import chi2_contingency
# chi2, pvalue, dof, ex = chi2_contingency(np.array([[e[0], t[0], h[0]], [e[1], t[1], h[1]]]))
# print(chi2, pvalue, dof)

In [None]:
# np.array([[e[0], t[0], h[0]], [e[1], t[1], h[1]]])

Or I can bootstrap gender fractions

In [None]:
# Bootstrap the male fraction within each of the three discharge outcome groups.
# A seeded generator is used so the resampled means and intervals are identical
# on every run (the resampling was previously unseeded).
n_boot = 10000
rng = np.random.default_rng(20210602)

# 1 = male, 0 = otherwise, one entry per patient in the group.
male_expired = list((expired == 'Male').astype(int))
boot_male_expired = rng.choice(male_expired, size=(n_boot, len(male_expired)), replace=True)

male_transferred = list((transferred == 'Male').astype(int))
boot_male_transferred = rng.choice(male_transferred, size=(n_boot, len(male_transferred)), replace=True)

male_home = list((home == 'Male').astype(int))
boot_male_home = rng.choice(male_home, size=(n_boot, len(male_home)), replace=True)

# One male fraction per bootstrap replicate; computed once and reused below.
boot_male_fractions = [boot_male_expired.mean(axis=1),
                       boot_male_transferred.mean(axis=1),
                       boot_male_home.mean(axis=1)]

male_means = [np.mean(fractions) for fractions in boot_male_fractions]

# Error-bar lengths for matplotlib's `yerr`: row 0 is the distance from the mean
# down to the 0.5th percentile, row 1 the distance up to the 99.5th percentile.
male_CIs = np.array([[mean - np.percentile(fractions, 0.5)
                      for mean, fractions in zip(male_means, boot_male_fractions)],
                     [np.percentile(fractions, 99.5) - mean
                      for mean, fractions in zip(male_means, boot_male_fractions)]])

In [None]:
# Spot check: 0.5th percentile of the bootstrapped male fraction among patients
# sent home (the lower end of the interval computed above).
np.percentile(np.mean(boot_male_home, axis=1), 0.5)

In [None]:
# Bootstrap mean male fraction per discharge outcome, with bootstrap percentile
# error bars and the cohort-wide male fraction as a dashed reference line.
fig5, ax3 = plt.subplots(1, 1, figsize=(8,8), constrained_layout=True)

ax3.bar(labels_ternary, male_means, yerr=male_CIs, color=['r','orange','g'])

# df1 is ordered by frequency, so row 0 is the most common sex rather than
# necessarily 'Male'; select the male row by label.
cohort_male_fraction = df1.loc[df1['Sex'] == 'Male', 'fraction'].iloc[0]
x_left, x_right = ax3.get_xlim()
ax3.hlines(cohort_male_fraction, x_left, x_right, color='indigo', linestyles='dashed',
           label=f"Cohort's male fraction ({cohort_male_fraction:.3f})")
ax3.tick_params(axis='x', labelsize=20)
ax3.tick_params(axis='y', labelsize=20)
ax3.set_ylabel('Fraction male', size=20)
ax3.legend(loc='upper left', fontsize=15)

# plt.savefig('asdfasd.png')

#### Some people really want to look at race/ethnicity breakdowns...

In [None]:
# clean_all_table['race_ethnicity'] = clean_all_table['race_concept_name'] + ' ' + clean_all_table['ethnicity_concept_name']

In [None]:
# clean_all_table['race_ethnicity'] = clean_all_table['race_ethnicity'].replace({'Black or African American Not Hispanic or Latino': 'Black',
#                                                                                'No matching concept No matching concept': 'Unknown',
#                                                                               'White Not Hispanic or Latino': 'White',
#                                                                               'White No matching concept': 'White',
#                                                                               'No matching concept Not Hispanic or Latino': 'Unknown',
#                                                                               'White Hispanic or Latino': 'Hispanic White',
#                                                                               'No matching concept Hispanic or Latino': 'Hispanic',
#                                                                               'Asian Not Hispanic or Latino': 'Asian',
#                                                                               'Asian Hispanic or Latino': 'Hispanic Asian',
#                                                                               'American Indian or Alaska Native Not Hispanic or Latino': 'Native'})
# clean_all_table['race_ethnicity'].value_counts()

#### Looking at pneumonia type breakdown

In [None]:
# Cohort breakdown by pneumonia category: bar height is the fraction of
# patients, and the number written inside each bar is the patient count.
category_counts = clean_all_table['pt_category'].value_counts()

count = category_counts.values
category = category_counts.index
n_cases = sum(count)

df2 = pd.DataFrame({"Disease type": category, "counts": count, "fraction": count/n_cases})

plt.figure(figsize=(8, 8), constrained_layout=True)
splot = sns.barplot(x="Disease type", y="fraction", data=df2, color='#2CBDFE')
splot.set(xlabel=None)

for bar in splot.patches:
    bar_top = bar.get_height()
    # Convert the plotted fraction back to a count for the annotation text.
    splot.annotate(f"{bar_top*n_cases:.0f}",
                   xy=(bar.get_x() + bar.get_width() / 2., bar_top),
                   xytext=(0, -12),
                   textcoords='offset points',
                   ha='center', va='center', size=25)

# plt.xlabel('Disease type', size=15)
plt.ylabel('Fraction of patients', size=22)
plt.tick_params(axis='both', labelsize=22)
plt.grid(linestyle=':', axis='y')
# plt.savefig("patient_category_210602.png")

In [None]:
# Pneumonia category of each patient split by binary outcome (code 0 = expired),
# plus the per-group tallies.
exp = clean_all_table['discharge_disposition_name_conv'].eq(0)
pneumonia_category = clean_all_table['pt_category']

expired, survived = pneumonia_category[exp], pneumonia_category[~exp]
e, s = expired.value_counts(), survived.value_counts()

In [None]:
# from scipy.stats import chi2_contingency
# contingency = [[e[0], s[0]], [e[1], s[1]], [e[2], s[2]]]
# chi2, pvalue, dof, ex = chi2_contingency(np.array(contingency))
# print(chi2, pvalue, dof)

In [None]:
# Bootstrap resamples of the per-patient category indicators (1 = patient is in
# the category, 0 = not), separately for expired and surviving patients.
# A seeded generator is used so downstream means and intervals are identical on
# every run (the resampling was previously unseeded).
n_boot = 10000
rng = np.random.default_rng(20210602)

hap_expired = list((expired == 'HAP/VAP').astype(int))
hap_survived = list((survived == 'HAP/VAP').astype(int))

boot_hap_expired = rng.choice(hap_expired, size=(n_boot, len(hap_expired)), replace=True)
boot_hap_survived = rng.choice(hap_survived, size=(n_boot, len(hap_survived)), replace=True)


cap_expired = list((expired == 'CAP').astype(int))
cap_survived = list((survived == 'CAP').astype(int))

boot_cap_expired = rng.choice(cap_expired, size=(n_boot, len(cap_expired)), replace=True)
boot_cap_survived = rng.choice(cap_survived, size=(n_boot, len(cap_survived)), replace=True)


non_expired = list((expired == 'non').astype(int))
non_survived = list((survived == 'non').astype(int))

boot_non_expired = rng.choice(non_expired, size=(n_boot, len(non_expired)), replace=True)
boot_non_survived = rng.choice(non_survived, size=(n_boot, len(non_survived)), replace=True)

In [None]:
# Per-replicate category fractions, ordered HAP/VAP, CAP, non, for each binary
# outcome group. The means are averages over replicates; the CI arrays hold the
# distances from each mean to the 0.5th (row 0) and 99.5th (row 1) percentiles.
expired_fractions = [np.mean(boot, axis=1)
                     for boot in (boot_hap_expired, boot_cap_expired, boot_non_expired)]
expired_means = [np.mean(fractions) for fractions in expired_fractions]
expired_CIs = np.array([
    [mean - np.percentile(fractions, 0.5)
     for mean, fractions in zip(expired_means, expired_fractions)],
    [np.percentile(fractions, 99.5) - mean
     for mean, fractions in zip(expired_means, expired_fractions)],
])


survived_fractions = [np.mean(boot, axis=1)
                      for boot in (boot_hap_survived, boot_cap_survived, boot_non_survived)]
survived_means = [np.mean(fractions) for fractions in survived_fractions]
survived_CIs = np.array([
    [mean - np.percentile(fractions, 0.5)
     for mean, fractions in zip(survived_means, survived_fractions)],
    [np.percentile(fractions, 99.5) - mean
     for mean, fractions in zip(survived_means, survived_fractions)],
])

In [None]:
# Grouped bars: bootstrap mean fraction of each pneumonia category among expired
# vs. surviving patients, with bootstrap percentile error bars and the
# cohort-wide fraction of each category as a dashed reference line.
width=0.25

# expired_means / survived_means are ordered HAP/VAP, CAP, non. `category` is the
# value_counts() index and is ordered by frequency, so using it for the tick
# labels can mislabel the bars; the bar order is spelled out explicitly instead.
category_order = ['HAP/VAP', 'CAP', 'non']
x = np.arange(len(category_order))

fig7, ax4 = plt.subplots(1,1,figsize=(8,8), constrained_layout=True)
ax4.bar(x - width/2, expired_means, width, yerr=expired_CIs, label='Expired', color='r')
ax4.bar(x + width/2, survived_means, width, yerr=survived_CIs, label='Survived', color='limegreen')

ax4.axis((-0.7, 2.7, 0.0, 0.9))

# Cohort-wide fraction per category, looked up by category name.
cohort_fractions = df2.set_index('Disease type')['counts'] / df2['counts'].sum()
for position, name in zip(x, category_order):
    # Only the first reference line carries a legend label (one legend entry).
    ax4.hlines(cohort_fractions.get(name, 0.0),
               position - 1.5*width, position + 1.5*width,
               color='indigo', linestyles='dashed',
               label='Cohort category fractions' if position == 0 else '')

ax4.tick_params(axis='x', labelsize=20)
ax4.tick_params(axis='y', labelsize=20)
ax4.set_xticks(x)
ax4.set_xticklabels(category_order)
ax4.set_ylabel('Pneumonia category fraction', size=20)
ax4.legend(loc='best', fontsize=15)

# plt.savefig("categorias.png")

In [None]:
# Pneumonia category of each patient split by the three discharge outcome codes
# (0, 1, 2), plus the per-group tallies.
outcome_code = clean_all_table['discharge_disposition_name_conv']
exp, trans, hom = outcome_code == 0, outcome_code == 1, outcome_code == 2

pneumonia_category = clean_all_table['pt_category']
expired, transferred, home = (pneumonia_category[exp],
                              pneumonia_category[trans],
                              pneumonia_category[hom])

e, t, h = (group.value_counts() for group in (expired, transferred, home))

In [None]:
# Bootstrap resamples of the per-patient category indicators (1 = patient is in
# the category, 0 = not), separately for each of the three discharge outcomes.
# A seeded generator is used so downstream means and intervals are identical on
# every run (the resampling was previously unseeded).
n_boot = 10000
rng = np.random.default_rng(20210602)

hap_expired = list((expired == 'HAP/VAP').astype(int))
hap_transferred = list((transferred == 'HAP/VAP').astype(int))
hap_home = list((home == 'HAP/VAP').astype(int))

boot_hap_expired = rng.choice(hap_expired, size=(n_boot, len(hap_expired)), replace=True)
boot_hap_transferred = rng.choice(hap_transferred, size=(n_boot, len(hap_transferred)), replace=True)
boot_hap_home = rng.choice(hap_home, size=(n_boot, len(hap_home)), replace=True)


cap_expired = list((expired == 'CAP').astype(int))
cap_transferred = list((transferred == 'CAP').astype(int))
cap_home = list((home == 'CAP').astype(int))

boot_cap_expired = rng.choice(cap_expired, size=(n_boot, len(cap_expired)), replace=True)
boot_cap_transferred = rng.choice(cap_transferred, size=(n_boot, len(cap_transferred)), replace=True)
boot_cap_home = rng.choice(cap_home, size=(n_boot, len(cap_home)), replace=True)


non_expired = list((expired == 'non').astype(int))
non_transferred = list((transferred == 'non').astype(int))
non_home = list((home == 'non').astype(int))

boot_non_expired = rng.choice(non_expired, size=(n_boot, len(non_expired)), replace=True)
boot_non_transferred = rng.choice(non_transferred, size=(n_boot, len(non_transferred)), replace=True)
boot_non_home = rng.choice(non_home, size=(n_boot, len(non_home)), replace=True)

If you want to do bar plots where the xticks are "expired, transferred and sent home", organize means and CIs by pneumonia category

In [None]:
# For each pneumonia category: per-replicate fractions ordered expired,
# transferred, sent home. The means are averages over replicates; the CI arrays
# hold the distances from each mean to the 0.5th (row 0) and 99.5th (row 1)
# percentiles.
hap_fractions = [np.mean(boot, axis=1)
                 for boot in (boot_hap_expired, boot_hap_transferred, boot_hap_home)]
hap_means = [np.mean(fractions) for fractions in hap_fractions]
hap_CIs = np.array([
    [mean - np.percentile(fractions, 0.5) for mean, fractions in zip(hap_means, hap_fractions)],
    [np.percentile(fractions, 99.5) - mean for mean, fractions in zip(hap_means, hap_fractions)],
])


cap_fractions = [np.mean(boot, axis=1)
                 for boot in (boot_cap_expired, boot_cap_transferred, boot_cap_home)]
cap_means = [np.mean(fractions) for fractions in cap_fractions]
cap_CIs = np.array([
    [mean - np.percentile(fractions, 0.5) for mean, fractions in zip(cap_means, cap_fractions)],
    [np.percentile(fractions, 99.5) - mean for mean, fractions in zip(cap_means, cap_fractions)],
])


non_fractions = [np.mean(boot, axis=1)
                 for boot in (boot_non_expired, boot_non_transferred, boot_non_home)]
non_means = [np.mean(fractions) for fractions in non_fractions]
non_CIs = np.array([
    [mean - np.percentile(fractions, 0.5) for mean, fractions in zip(non_means, non_fractions)],
    [np.percentile(fractions, 99.5) - mean for mean, fractions in zip(non_means, non_fractions)],
])

However, if you want to do bar plots where the xticks are "HAP/VAP, CAP, Non-Pneumonia", organize means and CIs by discharges

In [None]:
# For each discharge outcome: per-replicate fractions ordered HAP/VAP, CAP, non.
# The means are averages over replicates; the CI arrays hold the distances from
# each mean to the 0.5th (row 0) and 99.5th (row 1) percentiles.
expired_fractions = [np.mean(boot, axis=1)
                     for boot in (boot_hap_expired, boot_cap_expired, boot_non_expired)]
expired_means = [np.mean(fractions) for fractions in expired_fractions]
expired_CIs = np.array([
    [mean - np.percentile(fractions, 0.5)
     for mean, fractions in zip(expired_means, expired_fractions)],
    [np.percentile(fractions, 99.5) - mean
     for mean, fractions in zip(expired_means, expired_fractions)],
])


transferred_fractions = [np.mean(boot, axis=1)
                         for boot in (boot_hap_transferred, boot_cap_transferred, boot_non_transferred)]
transferred_means = [np.mean(fractions) for fractions in transferred_fractions]
transferred_CIs = np.array([
    [mean - np.percentile(fractions, 0.5)
     for mean, fractions in zip(transferred_means, transferred_fractions)],
    [np.percentile(fractions, 99.5) - mean
     for mean, fractions in zip(transferred_means, transferred_fractions)],
])


home_fractions = [np.mean(boot, axis=1)
                  for boot in (boot_hap_home, boot_cap_home, boot_non_home)]
home_means = [np.mean(fractions) for fractions in home_fractions]
home_CIs = np.array([
    [mean - np.percentile(fractions, 0.5)
     for mean, fractions in zip(home_means, home_fractions)],
    [np.percentile(fractions, 99.5) - mean
     for mean, fractions in zip(home_means, home_fractions)],
])

In [None]:
# pneum_breakdown = []

# discharges = [0, 1, 2]
# categories = list(clean_all_table['pt_category'].drop_duplicates())

# for outcome in discharges:
#     f = clean_all_table['discharge_disposition_name_conv'] == outcome
#     tmp_df = clean_all_table.loc[f]
    
#     for pneumonia_episode in categories:
#         g = tmp_df['pt_category'] == pneumonia_episode
#         filtered_df = tmp_df.loc[g]
        
#         if outcome == 0:
#             pneum_breakdown.append({'Counts': len(filtered_df),
#                                     'Fraction': len(filtered_df)/len(expired),
#                                     'Category': pneumonia_episode,
#                                     'Discharge': outcome})
#         elif outcome == 1:
#             pneum_breakdown.append({'Counts': len(filtered_df),
#                                     'Fraction': len(filtered_df)/len(transferred),
#                                     'Category': pneumonia_episode,
#                                     'Discharge': outcome})
#         else:
#             pneum_breakdown.append({'Counts': len(filtered_df),
#                                     'Fraction': len(filtered_df)/len(home),
#                                     'Category': pneumonia_episode,
#                                     'Discharge': outcome})
        
# pneum_category = pd.DataFrame(pneum_breakdown).replace(['non', 0, 1, 2],
#                                                        ['Non-Pneumonia', 'Expired', 'Transferred', 'Sent Home'])

In [None]:
# Grouped bars: bootstrap mean fraction of each pneumonia category among
# expired, transferred and sent-home patients, with bootstrap percentile error
# bars and the cohort-wide fraction of each category as a dashed reference line.
width=0.25

# expired_means / transferred_means / home_means are ordered HAP/VAP, CAP, non.
# `category` is the value_counts() index and is ordered by frequency, so using it
# for the tick labels can mislabel the bars; the bar order is spelled out
# explicitly instead.
category_order = ['HAP/VAP', 'CAP', 'non']
x = np.arange(len(category_order))

fig7, ax4 = plt.subplots(1,1,figsize=(8,8), constrained_layout=True)
ax4.bar(x-width, expired_means, width, yerr=expired_CIs, label='Expired', color='r')
ax4.bar(x, transferred_means, width, yerr=transferred_CIs, label='Transferred', color='orange')
ax4.bar(x + width, home_means, width, yerr=home_CIs, label='Sent home', color='g')

ax4.axis((-0.7, 2.7, 0.0, 0.9))

# Cohort-wide fraction per category, looked up by category name.
cohort_fractions = df2.set_index('Disease type')['counts'] / df2['counts'].sum()
for position, name in zip(x, category_order):
    # Only the first reference line carries a legend label (one legend entry).
    ax4.hlines(cohort_fractions.get(name, 0.0),
               position - 1.5*width, position + 1.5*width,
               color='indigo', linestyles='dashed',
               label='Cohort category fractions' if position == 0 else '')

ax4.tick_params(axis='x', labelsize=20)
ax4.tick_params(axis='y', labelsize=20)
ax4.set_xticks(x)
ax4.set_xticklabels(category_order)
ax4.set_ylabel('Pneumonia category fraction', size=20)
ax4.legend(loc='best', fontsize=15)

# plt.savefig("patient_type_breakdown_210602.png")

In [None]:
# DISABLED CELL: every line below is commented out, so nothing here executes.
# When enabled, it would draw a two-panel figure: (a) a seaborn bar plot of `df2`
# counts per "Disease type", annotated with each bar's height, and (b) stacked bars
# of share_HAP / share_CAP / share_non over `labels`.
# NOTE(review): `labels`, `share_HAP`, `share_CAP`, `share_non` and `height3` are not
# defined in this cell; `height3` in particular is presumably share_HAP + share_CAP —
# confirm before re-enabling.
# fig8, axes1 = plt.subplots(ncols=2, nrows=1, figsize=(14, 6), constrained_layout=True)
# splot=sns.barplot(x="Disease type", y="counts", data=df2, ax=axes1[0])
# splot.set(xlabel=None)

# for p in splot.patches:
#     splot.annotate(format(p.get_height(), '.0f'), 
#                    (p.get_x() + p.get_width() / 2., p.get_height()), 
#                    ha = 'center', va = 'center', size=25, 
#                    xytext = (0, -12), 
#                    textcoords = 'offset points')
# axes1[0].set_ylabel('Number of patients', size=22)
# axes1[0].tick_params(axis='x', labelsize=22)
# axes1[0].tick_params(left=False, labelleft=False)
# axes1[0].text(0.0, 1.06, 'a', transform=axes1[0].transAxes,
#       fontsize=23, va='top')

# axes1[1].bar(labels, share_HAP, width=0.35, label='HAP/VAP')
# axes1[1].bar(labels, share_CAP, width=0.35, bottom=share_HAP, label='CAP')
# axes1[1].bar(labels, share_non, width=0.35, bottom=height3, label='Non-Pneumonia')
# axes1[1].tick_params(axis='x', labelsize=22)
# axes1[1].tick_params(axis='y', labelsize=22)
# axes1[1].legend(loc='best', fontsize=11)
# axes1[1].grid(linestyle=':', axis='y')
# axes1[1].set_ylabel('Category fraction', size=22)
# axes1[1].axis((-0.5, 2.5, 0.0, 1.25))
# axes1[1].text(0, 1.06, 'b', transform=axes1[1].transAxes,
#       fontsize=23, va='top')

# # plt.savefig("patient_type_breakdown_210602.png")
# plt.show()

In [None]:
# Remove every pandas display limit (rows, columns, total width, per-cell width) by
# setting each option to None. This is a global setting: it affects every DataFrame
# rendered after this cell runs.
for display_option in ('display.max_rows',
                       'display.max_columns',
                       'display.width',
                       'display.max_colwidth'):
    pd.set_option(display_option, None)

In [None]:
# Collect the 'hospital_los_days' values of `clean_all_table` for each 'pt_category'
# label ('HAP', 'CAP', 'non') as plain lists, in the table's row order.
# The original reused a scratch variable named `re` (which shadows the standard-library
# module name) for three different subsets; selecting with boolean masks avoids it.
pt_category = clean_all_table['pt_category']
hospital_los = clean_all_table['hospital_los_days']

HAP_LOS = list(hospital_los[pt_category == 'HAP'])
CAP_LOS = list(hospital_los[pt_category == 'CAP'])
non_LOS = list(hospital_los[pt_category == 'non'])

In [None]:
# Run flm_tools.get_cdfs on the length-of-stay list of each pneumonia category.
# The fig9 cell (in its commented-out panel) indexes the results as [0] and [1]
# for the x and y arguments of a step plot.
cdf_HAP, cdf_CAP, cdf_non = (
    flm_tools.get_cdfs(los_values) for los_values in (HAP_LOS, CAP_LOS, non_LOS)
)

In [None]:
# Reload the cleaned table from scratch so this cell can be re-run safely.
clean_all_table = flm_tools.get_cleaned_data(outcome_encoding='three')

# Build one pneumonia-category assessment per case:
#   1. fetch the latest revision of the assessment table (four columns only),
#   2. pass it through feature_conversion.shorten_pt_category,
#   3. drop rows without a category,
#   4. relabel 'VAP' as 'HAP',
#   5. keep only the last row for each case_number.
diagnosis = (
    feature_conversion.shorten_pt_category(
        patients.modified_edw_rc(
            'pneumonia_episode_category_assessment',
            revision='latest',
            columns=['case_number', 'clin_cap_viral_npop', 'pneu_assess_dt', 'pt_category'],
        )
    )
    .dropna(subset=['pt_category'])
    .assign(pt_category=lambda frame: frame['pt_category'].replace({'VAP': 'HAP'}))
    .drop_duplicates(subset=['case_number'], keep='last')
)

# NOTE(review): no `on=`/`how=` is given, so pandas inner-joins on every column name
# the two frames share. Presumably that is just 'case_number' — confirm, because any
# other shared column would silently become part of the join key.
clean_all_table = pd.merge(clean_all_table, diagnosis)

In [None]:
def _summarize_case(case_rows):
    """Summarise all rows belonging to a single case_number.

    Parameters
    ----------
    case_rows : pandas.DataFrame
        The rows of the cleaned table for one case (one or more rows).

    Returns
    -------
    dict
        '#_unique_tests'         : number of distinct 'measurement_concept_name' values,
        '#_unique_tests_per_day' : that count divided by the case's length of stay,
        'outcome'                : first 'discharge_disposition_name_conv' value,
        'LOS'                    : first 'hospital_los_days' value.
    """
    length_of_stay = case_rows['hospital_los_days'].unique()[0]
    n_unique_tests = len(case_rows['measurement_concept_name'].unique())
    outcome = case_rows['discharge_disposition_name_conv'].unique()[0]
    return {'#_unique_tests': n_unique_tests,
            # No guard against a zero length of stay here, matching the original.
            '#_unique_tests_per_day': n_unique_tests / length_of_stay,
            'outcome': outcome,
            'LOS': length_of_stay}


measurements_during_encounter = clean_all_table.set_index('case_number')

# `.loc[[case]]` (list indexer) always returns a DataFrame, even when the case has a
# single row. The original used `.loc[case, col]`, which returns a scalar in that
# situation, and relied on catching AttributeError to tell the two apart — a pattern
# that could also hide unrelated AttributeErrors. One lookup per case replaces three.
alt_table = [
    _summarize_case(measurements_during_encounter.loc[[case]])
    for case in measurements_during_encounter.index.unique()
]

# One row per case, in order of first appearance of each case_number.
third = pd.DataFrame(alt_table)

In [None]:
# Split the per-case summary into one list per discharge outcome.
# Later cells index these lists positionally (third_LOS[0] is plotted as 'Expired',
# [1] as 'Transferred', [2] as 'Sent home'), so the order must be deterministic.
# The original iterated over `set(third['outcome'])`, whose order is an implementation
# detail; groupby sorts the outcome codes ascending and skips missing outcomes.
third_tests = []
third_tests_norm = []
third_LOS = []
for _, outcome_rows in third.groupby('outcome', sort=True):
    third_tests.append(list(outcome_rows['#_unique_tests']))
    third_tests_norm.append(list(outcome_rows['#_unique_tests_per_day']))
    third_LOS.append(list(outcome_rows['LOS']))

In [None]:
# Apply flm_tools.get_survival_fn to the length-of-stay list of each outcome group.
# Despite the `cdf_` prefix these hold the survival-function output; the plotting
# cells use element [0] as x and element [1] as y.
# Positions 0 / 1 / 2 of third_LOS are taken to be expired / transferred / sent home.
cdf_expired, cdf_transferred, cdf_home = (
    flm_tools.get_survival_fn(third_LOS[group]) for group in (0, 1, 2)
)

In [None]:
# Step plot of the length-of-stay survival curves, one line per discharge outcome.
fig9, ejes = plt.subplots(ncols=1, nrows=1, figsize=(8, 8), constrained_layout=True)

# (curve, line colour, legend label); each curve supplies x as [0] and y as [1].
los_curves = (
    (cdf_expired, 'r', 'Expired'),
    (cdf_transferred, 'orange', "Transferred"),
    (cdf_home, 'green', "Sent home"),
)
for curve, colour, outcome_label in los_curves:
    ejes.step(curve[0], curve[1], color=colour, label=outcome_label)

ejes.set_xlabel('Length of Stay (days)', fontsize=20)
ejes.set_ylabel('Fraction of patients remaining', fontsize=20)
for axis_name in ('x', 'y'):
    ejes.tick_params(axis=axis_name, labelsize=20)
ejes.grid(linestyle=':')
ejes.legend(loc='best', fontsize=15)

# Disabled second panel (length of stay by pneumonia category), kept for reference:
# ejes[1].step(cdf_HAP[0],cdf_HAP[1], label='HAP/VAP')
# ejes[1].step(cdf_CAP[0],cdf_CAP[1], label="CAP")
# ejes[1].step(cdf_non[0],cdf_non[1], label="Non-pneumonia")
# ejes[1].set_xlabel('Length of stay (days)', fontsize=20)
# ejes[1].tick_params(axis='x', labelsize=20)
# ejes[1].tick_params(left=False, labelleft=False)
# ejes[1].grid(linestyle=':')
# ejes[1].legend(loc='best', fontsize=14)

# plt.savefig("LOS_comparison_210602.png")

In [None]:
# Four-panel summary figure, one panel per patient characteristic, each broken down
# by discharge outcome (Expired / Transferred / Sent Home):
#   [0,0] survival curves of patient age      [0,1] fraction male
#   [1,0] survival curves of length of stay   [1,1] pneumonia category fractions
# Inputs (expired_pt, trans_pt, home_pt, labels_ternary, male_means, male_CIs, df1,
# df2, category, third_LOS and the *_means / *_CIs arrays) come from earlier cells.
fig10, ejes1 = plt.subplots(ncols=2, nrows=2, figsize=(14, 14))
(age_ax, male_ax), (los_ax, category_ax) = ejes1

outcome_styles = (('red', 'Expired'), ('orange', 'Transferred'), ('green', "Sent Home"))


def _draw_survival_panel(ax, curves, xlabel):
    """Draw one step line per outcome on `ax` and apply the shared panel styling.

    `curves` holds three survival-function results in the order expired, transferred,
    sent home; each is indexed as [0] for x and [1] for y.
    """
    for curve, (colour, outcome_label) in zip(curves, outcome_styles):
        ax.step(curve[0], curve[1], color=colour, label=outcome_label)
    ax.set_xlabel(xlabel, fontsize=20)
    ax.set_ylabel('Fraction of patients remaining', fontsize=20)
    ax.tick_params(axis='x', labelsize=20)
    ax.tick_params(axis='y', labelsize=20)
    ax.grid(linestyle=':')
    ax.legend(loc='best', fontsize=15)


# --- [0,0] age -------------------------------------------------------------------
# The age curves get their own names: the original temporarily rebound cdf_expired /
# cdf_home (the length-of-stay curves used by fig9) to age data, which left those names
# holding the wrong quantity if the cell stopped part-way.
age_curves = [flm_tools.get_survival_fn(ages) for ages in (expired_pt, trans_pt, home_pt)]
_draw_survival_panel(age_ax, age_curves, 'Patient age at study enrollment (years)')

# --- [0,1] fraction male -----------------------------------------------------------
male_ax.bar(labels_ternary, male_means, yerr=male_CIs, color=['r', 'orange', 'g'])
male_ax.axis((-0.7, 2.7, 0.0, 0.9))
# The reference line is drawn at df1.loc[0, 'fraction'] (the cohort's male fraction,
# per the legend text). The legend value is now formatted from that same number instead
# of the hard-coded "0.593", so the label cannot drift from the plotted line.
male_fraction = df1.loc[0, 'fraction']
male_ax.hlines(male_fraction, male_ax.get_xlim()[0], male_ax.get_xlim()[-1],
               color='indigo',
               linestyles='dashed',
               label="Cohort's male fraction ({:.3f})".format(male_fraction))
male_ax.tick_params(axis='x', labelsize=20)
male_ax.tick_params(axis='y', labelsize=20)
male_ax.set_ylabel('Fraction male', size=20)
male_ax.legend(loc='upper left', fontsize=15)

# --- [1,0] length of stay ----------------------------------------------------------
# Positions 0 / 1 / 2 of third_LOS are taken to be expired / transferred / sent home.
cdf_expired = flm_tools.get_survival_fn(third_LOS[0])
cdf_transferred = flm_tools.get_survival_fn(third_LOS[1])
cdf_home = flm_tools.get_survival_fn(third_LOS[2])
_draw_survival_panel(los_ax, (cdf_expired, cdf_transferred, cdf_home), 'Length of Stay (days)')

# --- [1,1] pneumonia category fractions --------------------------------------------
width = 0.25
x = np.arange(len(category))

# (horizontal offset, bar heights, error bars, legend label, colour) per outcome.
outcome_bars = (
    (-width, expired_means, expired_CIs, 'Expired', 'r'),
    (0, transferred_means, transferred_CIs, 'Transferred', 'orange'),
    (width, home_means, home_CIs, 'Sent Home', 'g'),
)
for offset, heights, errors, outcome_label, colour in outcome_bars:
    category_ax.bar(x + offset, heights, width, yerr=errors, label=outcome_label, color=colour)

category_ax.axis((-0.7, 2.7, 0.0, 0.9))

# Dashed line at the cohort-wide fraction of each category, spanning that category's
# three bars; only the first line is labelled so the legend gets a single entry.
cohort_total = df2['counts'].sum()
for position, disease_type in enumerate(('HAP/VAP', 'CAP', 'non')):
    legend_kwargs = {'label': 'Cohort category fractions'} if position == 0 else {}
    category_ax.hlines(df2.loc[df2['Disease type'] == disease_type, 'counts'] / cohort_total,
                       x[position] - 1.5*width, x[position] + 1.5*width,
                       color='indigo', linestyles='dashed', **legend_kwargs)

category_ax.tick_params(axis='x', labelsize=20)
category_ax.tick_params(axis='y', labelsize=20)
category_ax.set_xticks(x)
category_ax.set_xticklabels(category)
category_ax.set_ylabel('Pneumonia category fraction', size=20)
category_ax.legend(loc='best', fontsize=15)

fig10.tight_layout()
# plt.savefig("fig_patients_outcomes.png")