In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Directory holding the MIMIC-IV-ED train/test CSV splits.
# The location can be overridden through the MED_LEAK_CSV_ROOT environment
# variable so the notebook also runs on machines other than the original
# author's; when the variable is unset the original path is used unchanged.
csv_root = Path(
    os.environ.get(
        "MED_LEAK_CSV_ROOT",
        "/home/moritz/repositories/med_leak/data/csv",
    )
)
train_df = pd.read_csv(csv_root / "mimic-iv-ed_train.csv")
test_df = pd.read_csv(csv_root / "mimic-iv-ed_test.csv")
# Stack both splits into one frame with a fresh 0..n-1 index; the cells below
# describe this combined frame.
df = pd.concat([train_df, test_df], ignore_index=True)

In [3]:
# Number of rows in the combined frame versus number of distinct subject_id values.
df.shape[0], df["subject_id"].nunique()

(418007, 201213)

In [4]:
# Show every column name of the combined frame as a plain Python list.
df.columns.tolist()

['index',
 'subject_id',
 'hadm_id',
 'stay_id',
 'intime',
 'outtime',
 'gender',
 'race',
 'arrival_transport',
 'disposition',
 'anchor_age',
 'anchor_year',
 'dod',
 'admittime',
 'dischtime',
 'deathtime',
 'ethnicity',
 'edregtime',
 'edouttime',
 'insurance',
 'in_year',
 'age',
 'outcome_inhospital_mortality',
 'ed_los',
 'intime_icu',
 'time_to_icu_transfer',
 'outcome_icu_transfer_12h',
 'outcome_hospitalization',
 'outcome_critical',
 'n_ed_30d',
 'n_ed_90d',
 'n_ed_365d',
 'next_ed_visit_time',
 'next_ed_visit_time_diff',
 'outcome_ed_revisit_3d',
 'n_hosp_30d',
 'n_hosp_90d',
 'n_hosp_365d',
 'n_icu_30d',
 'n_icu_90d',
 'n_icu_365d',
 'ed_los_hours',
 'time_to_icu_transfer_hours',
 'next_ed_visit_time_diff_days',
 'triage_temperature',
 'triage_heartrate',
 'triage_resprate',
 'triage_o2sat',
 'triage_sbp',
 'triage_dbp',
 'triage_pain',
 'triage_acuity',
 'chiefcomplaint',
 'chiefcom_chest_pain',
 'chiefcom_abdominal_pain',
 'chiefcom_headache',
 'chiefcom_shortness_of_br

In [5]:
# Collapse the detailed race strings into four reporting groups.
#
# Matching is case-insensitive on purpose: this cell overwrites df["race"]
# with the title-case labels below, so a case-sensitive search for "BLACK" /
# "WHITE" / "ASIAN" would find nothing on a second execution and relabel every
# row as "Other/Unknown". With case=False a re-run reproduces the same labels.
white = 'White'
asian = 'Asian'
black = 'Black'
other = "Other/Unknown"

# All masks are computed before any assignment so that each one is evaluated
# against the same column state. na=False sends missing race values to the
# "Other/Unknown" group.
black_mask = df.race.str.contains("BLACK", case=False, na=False)
white_mask = df.race.str.contains("WHITE", case=False, na=False)
asian_mask = df.race.str.contains("ASIAN", case=False, na=False)
# Everything that matched none of the three patterns.
other_mask = ~(black_mask | white_mask | asian_mask)

# Assignment order matters only if a value matches several patterns: the
# later assignment wins (asian over white over black).
df.loc[black_mask, "race"] = black
df.loc[white_mask, "race"] = white
df.loc[asian_mask, "race"] = asian
df.loc[other_mask, "race"] = other

In [14]:
# Spell out the single-letter gender codes; values other than "M"/"F" are left as they are.
gender_labels = {"M": "Male", "F": "Female"}
df["gender"] = df["gender"].replace(gender_labels)

# Rows without an insurance entry are assigned the label "Other".
df["insurance"] = df["insurance"].fillna("Other")

In [15]:
from utils import cohort_table

In [16]:
# Columns to report in the cohort table in addition to the patient/record counts.
other_vars = [
    'gender',
    "eci_Obesity",
    "cci_Cancer1",
    "outcome_hospitalization",
    "insurance",
]

# Build the cohort table stratified by the collapsed race groups, counting
# patients via subject_id and leaving age out.
cohort_df = cohort_table(
    df,
    strat_var='race',
    strat_var_vals=['Asian', 'Black', 'White', 'Other/Unknown'],
    other_vars=other_vars,
    patient_id_col="subject_id",
    include_age=False,
)
cohort_df

OrderedDict([('Variable', ['Patients', 'Records']), ('All', ['201213', '418007']), ('Asian', ['11373', '18319']), ('Black', ['32742', '92151']), ('White', ['126273', '242617']), ('Other/Unknown', ['33970', '64920'])])


Unnamed: 0,Variable,All,Asian,Black,White,Other/Unknown
0,Patients,201213,11373,32742,126273,33970
1,Records,418007,18319,92151,242617,64920
2,Male (\%),191039 (45.7),8072 (44.1),36461 (39.6),117293 (48.3),29213 (45.0)
3,Female (\%),226968 (54.3),10247 (55.9),55690 (60.4),125324 (51.7),35707 (55.0)
4,1 (\%),36850 (8.8),181 (1.0),11262 (12.2),20540 (8.5),4867 (7.5)
5,0 (\%),381157 (91.2),18138 (99.0),80889 (87.8),222077 (91.5),60053 (92.5)
6,0 (\%),391014 (93.5),17139 (93.6),87187 (94.6),224520 (92.5),62168 (95.8)
7,1 (\%),26993 (6.5),1180 (6.4),4964 (5.4),18097 (7.5),2752 (4.2)
8,False (\%),220233 (52.7),11197 (61.1),55931 (60.7),113428 (46.8),39677 (61.1)
9,True (\%),197774 (47.3),7122 (38.9),36220 (39.3),129189 (53.2),25243 (38.9)


In [17]:
# Export the cohort table as LaTeX without the row index.
# The output directory is created first (no-op if it already exists) so the
# export also works on a fresh checkout where ./tables is missing.
Path("./tables").mkdir(parents=True, exist_ok=True)
cohort_df.to_latex("./tables/mimic-iv.tex", index=False)

In [18]:
# Frequency of each insurance label; dropna=False would list missing values as their own row.
df["insurance"].value_counts(dropna=False)

insurance
Other       323265
Medicare     77251
Medicaid     17491
Name: count, dtype: int64