generated from opensafely/research-template
/
cox_models_day0.R
378 lines (351 loc) · 14.6 KB
/
cox_models_day0.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
######################################
# Cox models (propensity score analysis)
######################################
## Import libraries
library(tidyverse)
library(lubridate)
library(survival)
library(survminer)
library(gridExtra)
library(splines)
library(survey)
library(here)
## Load functions
source(here("lib", "functions", "safely_n_quietly.R"))
## Create tables directory
fs::dir_create(here::here("output", "tables"))
## Load data
data_cohort <-
read_rds(here::here("output", "data", "data_processed_day0.rds"))
## Specify number of PS imputations
imputations_total <- 20
seeds <- runif(imputations_total * 3, min = 0, max = 1e6) %>% floor()
# create data.frame 'estimates' where output is saved
# 'estimates' has 9 columns with the imputation number and logHR and var of logHR and
# n for the number of pt in utilised dataset and how many people assigned 'treated' and
# how many people assigned 'untreated'
# primary outcome x 3 (mol + sot vs none; sot vs none; mol vs none) plus
# secondary outcome x 3
# --> number of rows is iterations x imputations_total
estimates <- matrix(nrow = 6 * imputations_total, ncol = 9) %>% as.data.frame()
# give column names
colnames(estimates) <-
c("comparison", "outcome", "imputation",
"logHR", "var",
"n_after_restriction", "n_assign_trt", "n_assign_untrt",
"seed")
# create data.frame 'log' where errors and warnings are saved
# 'log' has 3 columns: comparison, warning and error
# 'log' has number of rows identical to 'estimates'
log <- matrix(nrow = 6 * imputations_total, ncol = 5) %>% as.data.frame()
# give column names
colnames(log) <-
c("comparison", "outcome","imputation", "warning", "error")
# create data.frame 'outcomes' where number of outcomes are save
# --> number of rows is iterations x imputations_total
outcomes_summary<- matrix(nrow = 6 * imputations_total, ncol = 9) %>% as.data.frame()
# give column names
colnames(outcomes_summary) <-
c("comparison", "outcome", "imputation",
"n_outcome_day0_trt", "n_outcome_day0_untrt",
"n_outcome_day1_trt","n_outcome_day1_untrt",
"n_outcome_day2_trt","n_outcome_day2_untrt")
# Specify treated group for comparison (Treated vs Untreated)
# used to loop trough different analyses
trt_grp <- c("All", "Sotrovimab", "Molnupiravir")
# Specify outcomes
# uesd to loop through different analyses
outcomes <- c("primary", "secondary")
# Specify PS model
# Vector of variables for PS model
# Note: age modelled with cubic spline with 3 knots
vars <-
c(
"ns(age, df=3)",
"ns(study_week, df=3)",
"sex",
"ethnicity",
"imdQ5" ,
"stp",
"rural_urban",
"huntingtons_disease_nhsd" ,
"myasthenia_gravis_nhsd" ,
"motor_neurone_disease_nhsd" ,
"multiple_sclerosis_nhsd" ,
"solid_organ_transplant_nhsd",
"hiv_aids_nhsd" ,
"immunosupression_nhsd" ,
"imid_nhsd" ,
"liver_disease_nhsd",
"ckd_stage_5_nhsd",
"haematological_disease_nhsd" ,
"cancer_opensafely_snomed" ,
"downs_syndrome_nhsd",
"diabetes",
"bmi_group",
"smoking_status",
"copd",
"dialysis",
"cancer",
"lung_cancer",
"haem_cancer",
"vaccination_status",
"pfizer_most_recent_cov_vac",
"az_most_recent_cov_vac",
"moderna_most_recent_cov_vac"
)
# Specify model
psModelFunction <- as.formula(
paste("treatment",
paste(vars, collapse = " + "),
sep = " ~ "))
for (j in seq_along(outcomes)){
outcome_event <- outcomes[j]
cat(paste0("\n#### ", outcome_event, " outcome ####\n"))
# find location in matrix
index <- length(trt_grp) * imputations_total * (j - 1) + 1
# Select treatment var used in analysis ---
if (outcome_event == "primary"){
data_cohort_trt <-
data_cohort %>%
mutate(treatment_strategy_cat = treatment_strategy_cat_day0_prim,
treatment = treatment_day0_prim,
treatment_date = treatment_date_day0_prim)
} else if (outcome_event == "secondary"){
data_cohort_trt <-
data_cohort %>%
mutate(treatment_strategy_cat = treatment_strategy_cat_day0_sec,
treatment = treatment_day0_sec,
treatment_date = treatment_date_day0_sec)
}
for(i in seq_along(trt_grp)) {
# Some preperations ---
# select treatment
t <- trt_grp[i]
# identify location in matrix where estimates are saved
location <- index + imputations_total * (i - 1)
# Select data used in analysis ---
# Create data_cohort_sub (data.frame used for the day 0 analysis)
if (t == "All") {
cat("\n#### All Treated versus Untreated Comparison ####\n")
data_cohort_sub <- data_cohort_trt
} else { # (mol/sot are excluded in single comparison analyses)
cat(paste0("\n#### ", t, " versus Untreated Comparison ####\n"))
# Drop patients treated with molnupiravir or sotrovimab
data_cohort_sub <-
data_cohort_trt %>%
filter(treatment_strategy_cat %in% c("Untreated", t))
}
# Create data_ps_sub (data.frame used to fit propensity score model on)
# --> ps model is fitted in day 5 cohort
# Subset cohort to those who survive until day 5 to be used for PS estimation
data_ps_sub <-
data_cohort_sub %>%
filter(fu_secondary > 4)
# Fit Propensity Score Model ---
# Fit PS model on data_ps_sub
psModel <- glm(psModelFunction,
family = binomial(link = "logit"),
data = data_ps_sub)
summary(psModel) %>% coefficients()
# Add PS to day 0 cohort ---
# Identify patients for who treatment will be imputed
# (identical across all data_cohort_sub as number of untreated people is not
# changing)
patients_imputed <-
data_cohort_sub %>%
filter(treatment == "Untreated" & fu_secondary <= 4)
cat("#### No. of patients for who treatment will be imputed ####\n")
print(nrow(patients_imputed))
# Append patient-level predicted probability of being assigned to cohort
# Predict PS for members of Day 0 cohort from model fitted to those who survived to Day 5
data_cohort_sub$pscore <-
predict(psModel, type = "response", newdata = data_cohort_sub)
# Impute treatment ---
# For 'untreated' patients at time of outcome, assign:
# 'Treated' if pscore > rand
# 'Untreated' if otherwise
for(m in 1:imputations_total) {
# save treatment var and outcome for reference
estimates[location + (m - 1), "comparison"] <- t
estimates[location + (m - 1), "outcome"] <- outcome_event
log[location + (m - 1), "comparison"] <- t
log[location + (m - 1), "outcome"] <- outcome_event
outcomes_summary[location + (m - 1), "comparison"] <- t
outcomes_summary[location + (m - 1), "outcome"] <- outcome_event
# Preperations ---
cat(paste0("#### Imputation ", m, " ####\n"))
estimates[location + (m - 1), "imputation"] <- m
log[location + (m - 1), "imputation"] <- m
outcomes_summary[location + (m - 1), "imputation"] <- m
# Reassign new data_cohort_sub dataset each imputation loop
data_cohort_sub_imp <- data_cohort_sub
# Generate random probability from uniform distribution
seed_index <- (i - 1) * imputations_total + m
set.seed(seeds[seed_index])
data_cohort_sub_imp$rand <- runif(nrow(data_cohort_sub_imp), 0, 1)
estimates[location + (m - 1), "seed"] <- seeds[seed_index]
log[location + (m - 1), "seed"] <- seeds[seed_index]
# Impute "Treated" if pscore > rand ---
data_cohort_sub_imp <-
data_cohort_sub_imp %>%
mutate(treatment2 = case_when(
(treatment == "Treated") |
(treatment == "Untreated" & fu_secondary <= 4 &
pscore > rand) ~ "Treated",
TRUE ~ "Untreated") %>% factor(levels = c("Untreated", "Treated")))
# Identify how many patients are assigned to each treatment group
trt_count <-
data_cohort_sub_imp %>%
arrange(treatment) %>%
count(treatment, treatment2)
# Save count in estimates data.frame
estimates[location + (m - 1), "n_assign_trt"] <-
trt_count[2, 3]
estimates[location + (m - 1), "n_assign_untrt"] <-
nrow(patients_imputed) - trt_count[2, 3]
cat("#### No. of patients for who treatment is imputed ####\n")
print(paste0("Treated: ", trt_count[2, 3],
"; Untreated: ", nrow(patients_imputed) - trt_count[2, 3]))
# Derive inverse probability of treatment weights (IPTW) ---
data_cohort_sub_imp$weights <-
ifelse(data_cohort_sub_imp$treatment2 == "Treated",
1 / data_cohort_sub_imp$pscore,
1 / (1 - data_cohort_sub_imp$pscore))
# Check overlap PS ----
# Identify lowest and highest propensity score in each group
ps_trim <- data_cohort_sub_imp %>%
select(treatment2, pscore) %>%
group_by(treatment2) %>%
summarise(min = min(pscore), max= max(pscore)) %>%
ungroup() %>%
summarise(min = max(min), max = min(max)) # see below for why max of min and min of max is taken
# Restricted to observations within a PS range common to both treated and untreated persons—
# (i.e. exclude all patients in the nonoverlapping parts of the PS distribution)
data_cohort_sub_imp <-
data_cohort_sub_imp %>%
filter(pscore >= ps_trim$min[1] & pscore <= ps_trim$max[1])
estimates[location + (m - 1), "n_after_restriction"] <- nrow(data_cohort_sub_imp)
#save outcomes
if (outcome_event == "primary"){
outcome_dat <- data_cohort_sub_imp %>%
filter(status_primary == "covid_hosp_death") %>%
mutate(fu_primary = ifelse(fu_primary >= 2, 2, fu_primary)) %>%
group_by(treatment2, fu_primary) %>%
count() %>%
ungroup()
} else if (outcome_event == "secondary"){
outcome_dat <- data_cohort_sub_imp %>%
filter(status_primary == "allcause_hosp_death") %>%
mutate(fu_primary = ifelse(fu_primary >= 2, 2, fu_primary)) %>%
group_by(treatment2, fu_primary) %>%
count() %>%
ungroup()
}
outcomes_summary[location + (m - 1), "n_outcome_day0_untrt"] <- outcome_dat %>% filter(treatment2 == "Untreated" & fu_primary == 0) %>% select(n) %>% as.numeric()
outcomes_summary[location + (m - 1), "n_outcome_day1_untrt"] <- outcome_dat %>% filter(treatment2 == "Untreated" & fu_primary == 1) %>% select(n) %>% as.numeric()
outcomes_summary[location + (m - 1), "n_outcome_day2_untrt"] <- outcome_dat %>% filter(treatment2 == "Untreated" & fu_primary == 2) %>% select(n) %>% as.numeric()
outcomes_summary[location + (m - 1), "n_outcome_day0_trt"] <- outcome_dat %>% filter(treatment2 == "Treated" & fu_primary == 0) %>% select(n) %>% as.numeric()
outcomes_summary[location + (m - 1), "n_outcome_day1_trt"] <- outcome_dat %>% filter(treatment2 == "Treated" & fu_primary == 1) %>% select(n) %>% as.numeric()
outcomes_summary[location + (m - 1), "n_outcome_day2_trt"] <- outcome_dat %>% filter(treatment2 == "Treated" & fu_primary == 2) %>% select(n) %>% as.numeric()
# Fit outcome model ---
## Define svy design for IPTW
iptw <- svydesign(ids = ~ 1, data = data_cohort_sub_imp, weights = ~ weights)
# create formula for primary and secondary analysis
if (outcome_event == "primary"){
formula <-
as.formula(
Surv(fu_primary, status_primary == "covid_hosp_death") ~
treatment2)
} else if (outcome_event == "secondary"){
formula <-
as.formula(
Surv(fu_secondary, status_secondary == "allcause_hosp_death") ~
treatment2)
}
# Cox regression
# returns function model_PSw() with components result, output, messages,
# warnings and error
model_PSw <-
safely_n_quietly(
.f = ~ svycoxph(formula,
design = iptw,
data = data_cohort_sub_imp)
)
# save results from model_PSw and save warnings or errors to log file
if (is.null(model_PSw()$error)){
# no error
log[location + (m - 1),"error"] <- NA_character_
if (length(model_PSw()$warnings) != 0){
# save warning in log
log[location + (m - 1), "warning"] <- paste(model_PSw()$warnings, collapse = '; ')
} else log[location + (m - 1), "warning"] <- NA_character_
# select main effects from 'model_PSw'
selection <-
model_PSw()$result$coefficients %>% names %>% startsWith("treatment2")
# save coefficients of model and var in estimates
estimates[location + (m - 1), "logHR"] <-
model_PSw()$result$coefficients[selection]
estimates[location + (m - 1), c("var")] <-
model_PSw()$result$var[selection] # Gives the robust variance
# error
} else log[location + (m - 1), "error"] <- model_PSw()$messages
}
}
}
## Loop over analysis for each treatment comparison
## Rubin's Rules---
# Point estimate mean & within imputation variance
rubins_rules <-
estimates %>%
group_by(comparison, outcome) %>%
summarise(mean = mean(logHR),
var_w = mean(var),
.groups = "keep")
# Between imputation variance
var_b <- estimates %>%
left_join(select(rubins_rules, mean, comparison, outcome),
by = c("comparison", "outcome")) %>%
mutate(diff = (logHR - mean) ^ 2) %>%
group_by(comparison, outcome) %>%
summarise(sum_diff = sum(diff),
.groups = "keep") %>%
mutate(var_b = sum_diff / (imputations_total - 1)) %>%
select(-c(sum_diff))
# Rubins rules + construct CI
rubins_rules <-
rubins_rules %>%
left_join(var_b,
by = c("comparison", "outcome")) %>%
mutate(
total_var = var_w + var_b + (var_b / imputations_total),
se_pooled = sqrt(total_var),
lci = (mean - (1.96 * se_pooled)) %>% exp(),
uci = (mean + (1.96 * se_pooled)) %>% exp(),
hr = exp(mean)
) %>%
select(comparison, outcome, hr, lci, uci)
## Save output ---
write_csv(rubins_rules,
here("output",
"tables",
paste0("cox_models_day0.csv")))
write_rds(rubins_rules,
here("output",
"tables",
paste0("cox_models_day0.rds")))
# save original estimates
write_csv(estimates,
here("output",
"tables",
paste0("estimates_day0.csv")))
# save original outcomes
write_csv(outcomes_summary,
here("output",
"tables",
paste0("outcomes_day0.csv")))
# save log file
write_csv(log,
here("output",
"tables",
paste0("log_cox_models_day0.csv")))