generated from opensafely/research-template
-
Notifications
You must be signed in to change notification settings - Fork 1
/
6_create_controls.R
232 lines (180 loc) · 7.64 KB
/
6_create_controls.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
library(tidyverse)
library(data.table)
options(datatable.fread.datatable=FALSE)
# setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
# setwd('../')
################################################################################
# Load data for exposed and control population
################################################################################
# Read in exposed population
exposed <- fread('output/cis_exposed.csv') %>%
mutate(exposed = 1)
# Read in control population
control <- fread('output/cis_control.csv') %>%
mutate(exposed = 0)
# Temporary rbind() while type of outcome is determined
population <- rbind(exposed, control)
################################################################################
# Create different outcome groups per exposed and control population
# 2 types of outcome
################################################################################
### (1) Incidence group (new onset) ###
# No history of mental illness
incidence <- population %>%
mutate(mh_history = ifelse(cmd_history == 1 | cmd_history_hospital == 1 |
smi_history == 1 | smi_history_hospital == 1 |
other_mood_disorder_diagnosis_history == 1 | other_mood_disorder_hospital_history == 1 |
self_harm_history == 1 | self_harm_history_hospital == 1, 1, 0)) %>%
filter(mh_history == 0) %>%
select(-mh_history)
incidence_exposed <- incidence %>%
filter(exposed == 1)
incidence_control <- incidence %>%
filter(exposed == 0)
### (2) Prevalence group ###
# Needs to have some form of MH history
prevalence <- population %>%
mutate(mh_history = ifelse(cmd_history == 1 | cmd_history_hospital == 1 |
smi_history == 1 | smi_history_hospital == 1 |
other_mood_disorder_diagnosis_history == 1 | other_mood_disorder_hospital_history == 1 |
self_harm_history == 1 | self_harm_history_hospital == 1, 1, 0)) %>%
filter(mh_history == 1) %>%
select(-mh_history)
prevalence_exposed <- prevalence %>%
filter(exposed == 1)
prevalence_control <- prevalence %>%
filter(exposed == 0)
# Clean up memory
rm(exposed, control, population, incidence, prevalence)
gc()
# Print information to log
print('size of pre matched incidence controls')
nrow(incidence_control)
print('size of pre matched incidence exposed')
nrow(incidence_exposed)
print('size of pre matched prevalence controls')
nrow(prevalence_control)
print('size of pre matched prevalence exposed')
nrow(prevalence_exposed)
################################################################################
# FUNCTION TO PERFORM MATCHING
# One to many function (5)
#
################################################################################
# Function to perform matching
match_exposed_to_controls <- function(exposed, control, N){
# Randomly order exposed population and initialise group id
exposed <- slice_sample(exposed, n=nrow(exposed), replace=FALSE) %>%
mutate(group_id = -1)
# Initialise group_id in full control population
control <- control %>%
mutate(group_id = -1,
visit_date_flag = 0)
# Make copy of controls that can be reduced iteratively
# Initialise flag for already used as controls
control_reduced <- control %>%
mutate(already_used = 0)
# Run the matching
for (i in 1:nrow(exposed)){
if (i %% 10 == 0){
print(i)
}
row <- exposed[i, ]
# Get participant id of exposed person
id_pos_exposed <- row$patient_id[1]
# Get end date of exposed person
end_date_exposed <- row$end_date[1]
# Get min and max dates based on date_positive
date_pos_exposed <- row$date_positive[1]
visit_date_min <- as.IDate(as.Date(date_pos_exposed) - 14)
visit_date_max <- as.IDate(as.Date(date_pos_exposed) + 14)
# Filter CIS based on min and max dates,
# negative visit, and whether they are already a
# control for someone else
temp <- control_reduced %>%
filter(result_mk == 0,
visit_date >= visit_date_min,
visit_date <= visit_date_max) %>%
select(-result_mk)
# Add in logic to ensure that no one can be their
# own control case
temp <- temp %>%
filter(patient_id != id_pos_exposed)
# Remove anyone with evidence of infection
# across all sources prior to exposed infection date
temp <- temp %>%
filter(date_positive > date_pos_exposed)
# If no controls, move on to next exposed person
if (nrow(temp) == 0){
# print('No controls found')
next
}
# SELECT N PEOPLE FIRST FROM VISIT LEVEL DATA, THEN GET CLOSEST VISIT
# DATE PER PERSON
control_ids <- unique(temp$patient_id)
if (length(control_ids) < N){
# pass
}
else{
control_ids <- sample(control_ids, N, replace = FALSE)
}
temp <- temp %>%
filter(patient_id %in% control_ids)
# Convert control to person level
# Take visit closest to +ve case in exposed group
# Also handle multiple visits on same day per person
temp <- temp %>%
mutate(row_id = 1:nrow(temp),
t_to_origin = abs(as.numeric(as.Date(date_pos_exposed) - as.Date(visit_date)))) %>%
group_by(patient_id) %>%
filter(t_to_origin == min(t_to_origin)) %>%
filter(row_id == min(row_id)) %>%
select(-t_to_origin, -row_id) %>%
ungroup()
# Assign group id to exposed person
exposed <- exposed %>%
mutate(group_id = ifelse(patient_id %in% id_pos_exposed, i, group_id))
# Remove the selected control(s) from the population
# (cannot be a control for someone else)
control_reduced <- control_reduced %>%
mutate(already_used = ifelse(patient_id %in% control_ids, 1, already_used))
# Assign group id to controls
control <- control %>%
mutate(group_id = ifelse(patient_id %in% control_ids, i, group_id))
# Adjust visit date flag so we know which row to take from visit level controls
for (i in 1:nrow(temp)){
id <- temp$patient_id[i]
v_date <- temp$visit_date[i]
control <- control %>%
mutate(visit_date_flag = ifelse(patient_id == id & visit_date == v_date, 1, visit_date_flag)) %>%
mutate(date_positive = ifelse(patient_id == id & visit_date == v_date, date_pos_exposed, date_positive)) %>%
#mutate(end_date = ifelse(patient_id == id & visit_date == v_date, end_date_exposed, end_date)) %>%
mutate(date_positive = as.IDate(date_positive)) #%>%
#(end_date = as.IDate(end_date))
}
# Remove from visit level population
control_reduced <- control_reduced %>%
filter(already_used == 0)
}
# Assemble final matched dataset
control <- control %>%
filter(visit_date_flag == 1) %>%
select(-visit_date_flag)
groups <- rbind(control, exposed) %>%
filter(group_id != -1)
return(groups)
}
# Run matching
incidence_group <- match_exposed_to_controls(incidence_exposed, incidence_control, 5)
prevalence_group <- match_exposed_to_controls(prevalence_exposed, prevalence_control, 5)
print('total size of post matched incidence population')
nrow(incidence_group)
print('total size of post matched prevalence population')
nrow(prevalence_group)
print('total size of post matched incidence population exposed')
incidence_group %>% filter(exposed== 1) %>% nrow()
print('total size of post matched prevalence population exposed')
prevalence_group %>% filter(exposed== 1) %>% nrow()
# Save groups
write_csv(incidence_group, 'output/incidence_group.csv')
write_csv(prevalence_group, 'output/prevalence_group.csv')