generated from opensafely/research-template
-
Notifications
You must be signed in to change notification settings - Fork 1
/
9_cumulative_incidence_curves.R
146 lines (109 loc) · 4.34 KB
/
9_cumulative_incidence_curves.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
library(tidyverse)
library(data.table)
options(datatable.fread.datatable=FALSE)
# setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
# setwd('../')
# end of study period
eos_date <- as.IDate('2022-10-19')
# Load data
incidence <- fread('output/adjusted_incidence_group.csv')
prevalence <- fread('output/adjusted_prevalence_group.csv')
###############################################################################
# Create new variable with index_date
# index date for the exposed should be the date_positive (earliest date of
# positive covid tests from CIS, HES or T&T)
#
# index date for the control should be visit_date (of the matched exposed as
# we choose people who haven't been infected 14+ or - of visit date)
###############################################################################
incidence <- incidence %>%
mutate(index_date = ifelse(exposed == 1, date_positive, visit_date)) %>%
mutate(index_date = as.IDate(index_date))
prevalence <- prevalence %>%
mutate(index_date = ifelse(exposed == 1, date_positive, visit_date))%>%
mutate(index_date = as.IDate(index_date))
# Derive t for outcomes
derive_t <- function(df, drop_negative = TRUE){
df <- df %>%
mutate(min_outcome_date_mh = pmin(cmd_outcome_date, cmd_outcome_date_hospital,
smi_outcome_date, smi_outcome_date_hospital,
self_harm_outcome_date, self_harm_outcome_date_hospital,
other_mood_disorder_diagnosis_outcome_date,
other_mood_disorder_hospital_outcome_date)) %>%
mutate(min_outcome_date_mh = fifelse(min_outcome_date_mh == '2100-01-01', eos_date, min_outcome_date_mh)) %>%
mutate(t = fifelse(min_outcome_date_mh == eos_date,
end_date - index_date,
min_outcome_date_mh - index_date)) %>%
select(-min_outcome_date_mh)
if (drop_negative) {
df <- filter(df, t > 0)
}
return(df)
}
# Calculate cumulative incidence
cumulative_inc <- function(df, v){
N <- nrow(df)
cu_in <- df %>%
group_by(!!sym(v)) %>%
summarise(incidence = n()) %>%
arrange(v)
cu_sum <- cumsum(cu_in$incidence)
cu_in <- cu_in %>%
mutate(cumulative_incidence = cu_sum,
surv = (N - cumulative_incidence)/N)
return(cu_in)
}
incidence <- derive_t(incidence)
prevalence <- derive_t(prevalence)
# Rename English regions function
rename_regions_function<- function(df){
df <- df %>%
mutate(region = case_when(
gor9d == 'E12000001' ~ 'North East',
gor9d == 'E12000002' ~ 'North West',
gor9d == 'E12000003' ~ 'Yorkshire and The Humber',
gor9d == 'E12000004' ~ 'East Midlands',
gor9d == 'E12000005' ~ 'West Midlands',
gor9d == 'E12000006' ~ 'East of England',
gor9d == 'E12000007' ~ 'London',
gor9d == 'E12000008' ~ 'South East',
gor9d == 'E12000009' ~ 'South West',
TRUE ~ "Missing"))
return(df)
}
# Create age bands
age_bands_function <- function(df){
df <- df %>% mutate(
#create categories
age_groups = case_when(
age >= 16 & age <= 24 ~ "16 to 24",
age >= 25 & age <= 34 ~ "25 to 34",
age >= 35 & age <= 49 ~ "35 to 49",
age >= 50 & age <= 69 ~ "50 to 69",
age >= 70 ~ "70 and over"))
return(df)
}
# apply the region renaming function
incidence <- rename_regions_function(incidence)
prevalence <- rename_regions_function(prevalence)
# apply age band function &
# remove the old region column (gor9d)
incidence <- age_bands_function(incidence) %>% select(-gor9d)
prevalence <- age_bands_function(prevalence) %>% select(-gor9d)
# Save out data with t derived
write_csv(incidence, 'output/incidence_t.csv')
write_csv(prevalence, 'output/prevalence_t.csv')
inc_ci_cmd <- cumulative_inc(incidence, 't')
prev_ci_other <- cumulative_inc(prevalence, 't')
plot_surv <- function(df, x, y, title){
ggplot(df) +
geom_line(aes_string(x = x, y = y)) +
ggtitle(title) +
theme(plot.title = element_text(hjust = 0.5))
}
jpeg('output/incidence_surv.jpg', res = 300, width = 12, height = 10, units = 'cm')
plot_surv(inc_ci_cmd, 't', 'surv', 'Incidence')
dev.off()
jpeg('output/prevalence_surv.jpg', res = 300, width = 12, height = 10, units = 'cm')
plot_surv(prev_ci_other, 't', 'surv', 'Prevalence')
dev.off()