In [1]:
library(RPostgreSQL)
library(twang)
library(Matching)
library(tidyverse)

Loading required package: DBI
Loading required package: gbm
Loading required package: survival
Loading required package: lattice
Loading required package: splines
Loading required package: parallel
Loaded gbm 2.1.3
Loading required package: survey
Loading required package: grid
Loading required package: Matrix

Attaching package: ‘survey’

The following object is masked from ‘package:graphics’:

    dotchart

Loading required package: xtable
Loading required package: latticeExtra
Loading required package: RColorBrewer
Loading required package: MASS
## 
##  Matching (Version 4.9-2, Build Date: 2015-12-25)
##  See http://sekhon.berkeley.edu/matching for additional documentation.
##  Please cite software as:
##   Jasjeet S. Sekhon. 2011. ``Multivariate and Propensity Score Matching
##   Software with Automated Balance Optimization: The Matching package for R.''
##   Journal of Statistical Software, 42(7): 1-52. 
##

── Attaching packages ─────────────────────────────────────── tidyverse 1

In [2]:
data_dir <- file.path("..", "data")
sql_dir <- file.path("..", "sql")

In [3]:
drv <- dbDriver("PostgreSQL")
con <- dbConnect(drv, dbname = "mimic")
dbSendQuery(con, "set search_path=echo,public,mimiciii;")

<PostgreSQLResult>

In [4]:
full_data <- dbGetQuery(con, "select * from merged_data")

In [5]:
dbDisconnect(con)
dbUnloadDriver(drv)

In [6]:
to_factor <- function(x) {
    res <- (x %>% as.factor %>% as.integer) - 1
    if(length(na.omit(unique(res))) <= 1) return(factor(res, levels = c(0, 1)))
    return(factor(res))
}

In [7]:
factor_vars <- full_data %>%
    names %>%
    grep("flag|abnormal|icd|sedative", ., value = TRUE) %>%
    c("gender", "first_careunit", "echo", "vent", "vaso",
      "icu_adm_weekday", "icu_adm_hour", "mort_28_day")
factor_vars

In [8]:
full_data <- full_data %>%
    mutate(echo_int = as.integer(echo)) %>%
    mutate(mort_28_day_int = as.integer(mort_28_day)) %>%
    mutate_at(factor_vars, to_factor)
full_data %>% pull(echo) %>% head

In [9]:
feature_names <- full_data %>%
    names %>%
    keep(grepl("vs|lab|icd|age|gender|weight|saps|sofa|elix_score|vent|vaso|icu_adm|sedative", .)) %>%
    discard(grepl("vs|lab", .) & grepl("flag", .) & !grepl("bnp|troponin|kinase", .)) %>%
    discard(grepl("bnp|troponin|kinase", .) & !grepl("flag", .)) %>%
    discard(grepl("min|max", .)) %>%
    discard(grepl("abnormal", .))
feature_names
length(feature_names)

In [10]:
features <- full_data %>%
    select(!!!rlang::syms(feature_names))
head(features)

age,gender,weight,saps,sofa,elix_score,vent,vaso,icu_adm_weekday,icu_adm_hour,⋯,lab_creatinine_first,lab_pco2_first,lab_bnp_flag,lab_bicarbonate_first,lab_bun_first,lab_platelet_first,lab_sodium_first,lab_chloride_first,lab_ph_first,sedative
62.67646,1,74.3,25,5,5,1,0,2,1,⋯,3.7,32.0,0,22,208,313,160,123,7.45,1
86.76186,0,,13,1,10,0,0,4,1,⋯,0.9,,0,27,17,189,139,105,,0
56.08904,1,65.0,18,5,14,1,0,6,1,⋯,0.9,32.0,0,24,15,231,144,108,7.49,0
45.91093,1,,16,9,13,0,0,3,14,⋯,0.8,30.0,0,20,19,28,134,100,7.44,0
59.38693,1,91.4,13,3,22,0,0,3,14,⋯,0.7,,0,26,7,40,138,103,,0
91.5,0,55.0,25,5,0,0,0,6,20,⋯,0.8,,0,20,19,249,147,118,,0


In [11]:
label_name <- "echo"

In [12]:
label <- full_data %>% pull(echo)
head(label)

In [13]:
fml <- feature_names %>%
    c("echo", .) %>%
    paste(collapse = " + ") %>%
    sprintf("mort_28_day ~ %s", .)
fml

In [14]:
unweighted <- glm(as.formula(fml), data = full_data, family = binomial, na.action = na.exclude)
summary(unweighted)
exp(cbind(OR = coef(unweighted), confint(unweighted)))


Call:
glm(formula = as.formula(fml), family = binomial, data = full_data, 
    na.action = na.exclude)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.1250  -0.7347  -0.4077   0.7494   2.8886  

Coefficients:
                              Estimate Std. Error z value Pr(>|z|)    
(Intercept)                  8.2731508  8.8177268   0.938 0.348121    
echo1                       -0.2987650  0.1489311  -2.006 0.044850 *  
age                          0.0241281  0.0056105   4.301  1.7e-05 ***
gender1                      0.2641650  0.1438090   1.837 0.066222 .  
weight                      -0.0080908  0.0031720  -2.551 0.010751 *  
saps                         0.0632920  0.0179328   3.529 0.000417 ***
sofa                         0.1708259  0.0263574   6.481  9.1e-11 ***
elix_score                   0.0293990  0.0115937   2.536 0.011220 *  
vent1                        0.3336840  0.2675541   1.247 0.212337    
vaso1                       -0.0079091  0.1762429  -0.045

Waiting for profiling to be done...


Unnamed: 0,OR,2.5 %,97.5 %
(Intercept),3917.2721313,0.0001096094,1.162089e+11
echo1,0.7417337,0.5537001114,9.931019e-01
age,1.0244215,1.0133309067,1.035883e+00
gender1,1.3023431,0.9828208020,1.727652e+00
weight,0.9919419,0.9857148521,9.980611e-01
saps,1.0653379,1.0286705483,1.103648e+00
sofa,1.1862842,1.1270799972,1.249862e+00
elix_score,1.0298355,1.0066711605,1.053518e+00
vent1,1.3961019,0.8261955832,2.360505e+00
vaso1,0.9921221,0.7025945135,1.402833e+00


In [15]:
fml <- feature_names %>%
    paste(collapse = " + ") %>%
    sprintf("echo_int ~ %s", .)
fml

In [16]:
echo_ps_ate <- ps(as.formula(fml),
                  data = full_data,
                  interaction.depth = 2,
                  shrinkage = 0.01,
                  perm.test.iters = 0,
                  estimand = "ATE",
                  verbose = FALSE,
                  stop.method = c("es.mean", "es.max", "ks.mean", "ks.max"),
                  n.trees = 10000,
                  train.fraction = 0.8,
                  cv.folds = 3,
                  n.cores = 8)

In [17]:
pred <- echo_ps_ate$ps$es.mean.ATE
full_data <- full_data %>% mutate(ps = pred)
ROCR::performance(ROCR::prediction(pred, label), "auc")@y.values %>% first

In [18]:
ft_importance <- summary(echo_ps_ate$gbm.obj,
                         n.trees = echo_ps_ate$desc$es.mean.ATE$n.trees,
                         plot = FALSE)

In [19]:
full_data <- full_data %>%
    mutate(ps_weight = get.weights(echo_ps_ate, stop.method = "es.mean"))

In [20]:
saveRDS(full_data, file = file.path(data_dir, "full_data_ps.rds"))

In [21]:
saveRDS(ft_importance, file = file.path(data_dir, "feature_importance.rds"))

In [22]:
primary_ipw <- glm(mort_28_day ~ echo, data = full_data,
                   weights = full_data$ps_weight, family = binomial)
summary(primary_ipw)
exp(cbind(OR = coef(primary_ipw), confint(primary_ipw)))

“non-integer #successes in a binomial glm!”


Call:
glm(formula = mort_28_day ~ echo, family = binomial, data = full_data, 
    weights = full_data$ps_weight)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.2889  -1.0691  -0.9428   1.7193   4.6141  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -0.89663    0.02956 -30.335  < 2e-16 ***
echo1       -0.18030    0.04259  -4.233 2.31e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 13072  on 6161  degrees of freedom
Residual deviance: 13054  on 6160  degrees of freedom
AIC: 12688

Number of Fisher Scoring iterations: 4


Waiting for profiling to be done...
“non-integer #successes in a binomial glm!”

Unnamed: 0,OR,2.5 %,97.5 %
(Intercept),0.4079409,0.3848867,0.4321723
echo1,0.8350238,0.768107,0.907697


In [23]:
design_echo_ps_ate <- svydesign(ids = ~ icustay_id, weights = ~ ps_weight, data = full_data)
# design_echo_ps_ate <- svydesign(ids = ~ icustay_id, data = full_data)

In [24]:
fml <- feature_names %>%
    c(label_name, .) %>%
    paste(collapse = " + ") %>%
    sprintf("mort_28_day ~ %s", .)
fml

In [25]:
logi <- svyglm(as.formula(fml),
               family = quasibinomial,
               design = design_echo_ps_ate)

In [26]:
summary(logi)


Call:
svyglm(formula = as.formula(fml), family = quasibinomial, design = design_echo_ps_ate)

Survey design:
svydesign(ids = ~icustay_id, weights = ~ps_weight, data = full_data)

Coefficients:
                              Estimate Std. Error t value Pr(>|t|)    
(Intercept)                 13.0277014  9.2480994   1.409 0.159139    
echo1                       -0.3358510  0.1519932  -2.210 0.027284 *  
age                          0.0240723  0.0062279   3.865 0.000116 ***
gender1                      0.2022247  0.1568407   1.289 0.197475    
weight                      -0.0097747  0.0034972  -2.795 0.005257 ** 
saps                         0.0544894  0.0193176   2.821 0.004856 ** 
sofa                         0.1924874  0.0273976   7.026 3.25e-12 ***
elix_score                   0.0255830  0.0123188   2.077 0.037999 *  
vent1                        0.5547693  0.2896602   1.915 0.055656 .  
vaso1                       -0.0448572  0.1860674  -0.241 0.809526    
icu_adm_weekday1         

In [27]:
exp(cbind(OR = coef(logi), confint(logi)))

Unnamed: 0,OR,2.5 %,97.5 %
(Intercept),4.548402e+05,0.006107484,3.387313e+13
echo1,7.147296e-01,0.530597121,9.627613e-01
age,1.024364e+00,1.011936442,1.036945e+00
gender1,1.224123e+00,0.900164795,1.664670e+00
weight,9.902730e-01,0.983508516,9.970839e-01
saps,1.056001e+00,1.016766673,1.096750e+00
sofa,1.212261e+00,1.148881787,1.279137e+00
elix_score,1.025913e+00,1.001439622,1.050985e+00
vent1,1.741539e+00,0.987129314,3.072504e+00
vaso1,9.561340e-01,0.663953880,1.376891e+00


In [28]:
ps_matches <- Match(Y = NULL, Tr = full_data$echo_int, X = full_data$ps, M = 1,
                    estimand = "ATT", caliper = 0.1,
                    exact = FALSE, replace = FALSE)

In [29]:
summary(ps_matches)


Estimate...  0 
SE.........  0 
T-stat.....  NaN 
p.val......  NA 

Original number of observations..............  6162 
Original number of treated obs...............  3063 
Matched number of observations...............  1574 
Matched number of observations  (unweighted).  1574 

Number of obs dropped by 'exact' or 'caliper'  1489 



In [30]:
tab <- table(full_data$mort_28_day[ps_matches$index.treated],
             full_data$mort_28_day[ps_matches$index.control],
             dnn = c("Echo", "Control"))
tab

    Control
Echo   0   1
   0 821 361
   1 282 110

In [31]:
fisher.test(tab)


	Fisher's Exact Test for Count Data

data:  tab
p-value = 0.3732
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.6819901 1.1493002
sample estimates:
odds ratio 
 0.8871504 


In [32]:
mcnemar.test(tab)


	McNemar's Chi-squared test with continuity correction

data:  tab
McNemar's chi-squared = 9.4619, df = 1, p-value = 0.002098
