In [1]:
library(RPostgreSQL)
library(twang)
library(Matching)
library(tidyverse)

Loading required package: DBI
Loading required package: gbm
Loading required package: survival
Loading required package: lattice
Loading required package: splines
Loading required package: parallel
Loaded gbm 2.1.3
Loading required package: survey
Loading required package: grid
Loading required package: Matrix

Attaching package: ‘survey’

The following object is masked from ‘package:graphics’:

    dotchart

Loading required package: xtable
Loading required package: latticeExtra
Loading required package: RColorBrewer
Loading required package: MASS
## 
##  Matching (Version 4.9-2, Build Date: 2015-12-25)
##  See http://sekhon.berkeley.edu/matching for additional documentation.
##  Please cite software as:
##   Jasjeet S. Sekhon. 2011. ``Multivariate and Propensity Score Matching
##   Software with Automated Balance Optimization: The Matching package for R.''
##   Journal of Statistical Software, 42(7): 1-52. 
##

── Attaching packages ─────────────────────────────────────── tidyverse 1

In [20]:
data_dir <- file.path("..", "data")
sql_dir <- file.path("..", "sql")

In [2]:
drv <- dbDriver("PostgreSQL")
con <- dbConnect(drv, dbname = "mimic")
dbSendQuery(con, "set search_path=echo,public,mimiciii;")

<PostgreSQLResult>

In [3]:
full_data <- dbGetQuery(con, "select * from merged_data")

In [4]:
dbDisconnect(con)
dbUnloadDriver(drv)

In [5]:
to_factor <- function(x) {
    res <- (x %>% as.factor %>% as.integer) - 1
    if(length(na.omit(unique(res))) <= 1) return(factor(res, levels = c(0, 1)))
    return(factor(res))
}

In [6]:
factor_vars <- full_data %>%
    names %>%
    grep("flag|abnormal|icd|sedative", ., value = TRUE) %>%
    c("gender", "first_careunit", "echo", "vent", "vaso",
      "icu_adm_weekday", "icu_adm_hour", "mort_28_day")
factor_vars

In [7]:
full_data <- full_data %>%
    mutate(echo_int = as.integer(echo)) %>%
    mutate(mort_28_day_int = as.integer(mort_28_day)) %>%
    mutate_at(factor_vars, to_factor)
full_data %>% pull(echo) %>% head

In [8]:
feature_names <- full_data %>%
    names %>%
    keep(grepl("vs|lab|icd|age|gender|weight|saps|sofa|elix_score|vent|vaso|icu_adm|sedative", .)) %>%
    discard(grepl("vs|lab", .) & grepl("flag", .) & !grepl("bnp|troponin|kinase", .)) %>%
    discard(grepl("bnp|troponin|kinase", .) & !grepl("flag", .)) %>%
    discard(grepl("min|max", .)) %>%
    discard(grepl("abnormal", .))
feature_names
length(feature_names)

In [9]:
features <- full_data %>%
    select(!!!rlang::syms(feature_names))
head(features)

age,gender,weight,saps,sofa,elix_score,vent,vaso,icu_adm_weekday,icu_adm_hour,⋯,lab_creatinine_first,lab_pco2_first,lab_bnp_flag,lab_bicarbonate_first,lab_bun_first,lab_platelet_first,lab_sodium_first,lab_chloride_first,lab_ph_first,sedative
62.67646,1,74.3,25,5,5,1,0,2,1,⋯,3.7,32.0,0,22,208,313,160,123,7.45,1
86.76186,0,,13,1,10,0,0,4,1,⋯,0.9,,0,27,17,189,139,105,,0
56.08904,1,65.0,18,5,14,1,0,6,1,⋯,0.9,32.0,0,24,15,231,144,108,7.49,0
45.91093,1,,16,9,13,0,0,3,14,⋯,0.8,30.0,0,20,19,28,134,100,7.44,0
59.38693,1,91.4,13,3,22,0,0,3,14,⋯,0.7,,0,26,7,40,138,103,,0
300.00345,0,55.0,25,5,0,0,0,6,20,⋯,0.8,,0,20,19,249,147,118,,0


In [10]:
label_name <- "echo"

In [11]:
label <- full_data %>% pull(echo)
head(label)

In [12]:
fml <- feature_names %>%
    c("echo", .) %>%
    paste(collapse = " + ") %>%
    sprintf("mort_28_day ~ %s", .)
fml

In [13]:
unweighted <- glm(as.formula(fml), data = full_data, family = binomial, na.action = na.exclude)
summary(unweighted)
exp(cbind(OR = coef(unweighted), confint(unweighted)))


Call:
glm(formula = as.formula(fml), family = binomial, data = full_data, 
    na.action = na.exclude)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1335  -0.7365  -0.4171   0.7583   2.8996  

Coefficients:
                              Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  5.4443648  8.7492730   0.622 0.533768    
echo1                       -0.3311306  0.1483746  -2.232 0.025633 *  
age                          0.0041990  0.0012056   3.483 0.000496 ***
gender1                      0.2742175  0.1435749   1.910 0.056143 .  
weight                      -0.0088982  0.0031475  -2.827 0.004698 ** 
saps                         0.0800412  0.0172166   4.649 3.33e-06 ***
sofa                         0.1605250  0.0259602   6.184 6.27e-10 ***
elix_score                   0.0295440  0.0114913   2.571 0.010141 *  
vent1                        0.2625683  0.2672348   0.983 0.325835    
vaso1                        0.0260530  0.1752765   0.149

Waiting for profiling to be done...


Unnamed: 0,OR,2.5 %,97.5 %
(Intercept),231.4502180,7.333704e-06,5.945866e+09
echo1,0.7181113,5.366233e-01,9.603771e-01
age,1.0042078,1.001840e+00,1.006595e+00
gender1,1.3155009,9.932485e-01,1.744381e+00
weight,0.9911413,9.849634e-01,9.972043e-01
saps,1.0833317,1.047576e+00,1.120777e+00
sofa,1.1741271,1.116359e+00,1.236045e+00
elix_score,1.0299848,1.007020e+00,1.053466e+00
vent1,1.3002652,7.700479e-01,2.197377e+00
vaso1,1.0263953,7.283564e-01,1.448777e+00


In [14]:
fml <- feature_names %>%
    paste(collapse = " + ") %>%
    sprintf("echo_int ~ %s", .)
fml

In [15]:
echo_ps_ate <- ps(as.formula(fml),
                  data = full_data,
                  interaction.depth = 2,
                  shrinkage = 0.01,
                  perm.test.iters = 0,
                  estimand = "ATE",
                  verbose = FALSE,
                  stop.method = c("es.mean", "es.max", "ks.mean", "ks.max"),
                  n.trees = 10000,
                  train.fraction = 0.8,
                  cv.folds = 3,
                  n.cores = 8)

In [16]:
pred <- echo_ps_ate$ps$es.mean.ATE
full_data <- full_data %>% mutate(ps = pred)
ROCR::performance(ROCR::prediction(pred, label), "auc")@y.values %>% first

In [17]:
ft_importance <- summary(echo_ps_ate$gbm.obj,
                         n.trees = echo_ps_ate$desc$es.mean.ATE$n.trees,
                         plot = FALSE)

In [18]:
full_data <- full_data %>%
    mutate(weight = get.weights(echo_ps_ate, stop.method = "es.mean"))

In [21]:
saveRDS(full_data, file = file.path(data_dir, "full_data_ps.rds"))

In [22]:
saveRDS(ft_importance, file = file.path(data_dir, "feature_importance.rds"))

In [25]:
primary_ipw <- glm(mort_28_day ~ echo, data = full_data,
                   weights = full_data$weight, family = binomial)
summary(primary_ipw)
exp(cbind(OR = coef(primary_ipw), confint(primary_ipw)))

“non-integer #successes in a binomial glm!”


Call:
glm(formula = mort_28_day ~ echo, family = binomial, data = full_data, 
    weights = full_data$weight)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2760  -1.0703  -0.9453   1.7287   4.5574  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -0.89603    0.02950 -30.375  < 2e-16 ***
echo1       -0.18066    0.04251  -4.249 2.14e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 13120  on 6161  degrees of freedom
Residual deviance: 13102  on 6160  degrees of freedom
AIC: 12757

Number of Fisher Scoring iterations: 4


Waiting for profiling to be done...
“non-integer #successes in a binomial glm!”

Unnamed: 0,OR,2.5 %,97.5 %
(Intercept),0.4081856,0.3851623,0.4323822
echo1,0.834715,0.7679424,0.9072201


In [26]:
design_echo_ps_ate <- svydesign(ids = ~ icustay_id, weights = ~ weight, data = full_data)
# design_echo_ps_ate <- svydesign(ids = ~ icustay_id, data = full_data)

In [27]:
fml <- feature_names %>%
    c(label_name, .) %>%
    paste(collapse = " + ") %>%
    sprintf("mort_28_day ~ %s", .)
fml

In [28]:
logi <- svyglm(as.formula(fml),
               family = quasibinomial,
               design = design_echo_ps_ate)

In [29]:
exp(cbind(OR = coef(logi), confint(logi)))

Unnamed: 0,OR,2.5 %,97.5 %
(Intercept),99.4463928,3.678073e-06,2.688795e+09
echo1,1.0214515,7.356940e-01,1.418202e+00
age,1.0052462,1.002777e+00,1.007721e+00
gender1,1.2392670,9.273556e-01,1.656088e+00
weight,1.3429823,1.138526e+00,1.584155e+00
saps,1.0879072,1.050794e+00,1.126332e+00
sofa,1.1817521,1.124176e+00,1.242277e+00
elix_score,1.0222765,9.980912e-01,1.047048e+00
vent1,1.5006549,8.765147e-01,2.569227e+00
vaso1,1.0233523,7.227760e-01,1.448928e+00


In [30]:
ps_matches <- Match(Y = NULL, Tr = full_data$echo_int, X = full_data$ps, M = 1,
                    estimand = "ATT", caliper = 0.1,
                    exact = FALSE, replace = FALSE)

In [31]:
summary(ps_matches)


Estimate...  0 
SE.........  0 
T-stat.....  NaN 
p.val......  NA 

Original number of observations..............  6162 
Original number of treated obs...............  3063 
Matched number of observations...............  1587 
Matched number of observations  (unweighted).  1587 

Number of obs dropped by 'exact' or 'caliper'  1476 



In [32]:
tab <- table(full_data$mort_28_day[ps_matches$index.treated],
             full_data$mort_28_day[ps_matches$index.control],
             dnn = c("Echo", "Control"))
tab

    Control
Echo   0   1
   0 834 360
   1 273 120

In [33]:
fisher.test(tab)


	Fisher's Exact Test for Count Data

data:  tab
p-value = 0.8993
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.7873103 1.3126438
sample estimates:
odds ratio 
  1.018314 


In [34]:
mcnemar.test(tab)


	McNemar's Chi-squared test with continuity correction

data:  tab
McNemar's chi-squared = 11.684, df = 1, p-value = 0.0006304
