generated from opensafely/research-template
-
Notifications
You must be signed in to change notification settings - Fork 1
/
7_adjust_groups.R
152 lines (128 loc) · 5.83 KB
/
7_adjust_groups.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
library(tidyverse)
library(data.table)
options(datatable.fread.datatable=FALSE)
# setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
# setwd('../')
incidence <- fread('output/incidence_group.csv') %>%
mutate(group = 'incidence')
prevalence <- fread('output/prevalence_group.csv') %>%
mutate(group = 'prevalence')
# Get some summary stats for number of exposed and controls
count_exposed_and_control <- function(df){
print('Number of exposed')
df %>% filter(exposed == 1) %>% nrow() %>% print()
print('Number of controls')
df %>% filter(exposed == 0) %>% nrow() %>% print()
}
print('counts for incidence group')
count_exposed_and_control(incidence)
print('counts for prevalence group')
count_exposed_and_control(prevalence)
# Temporary rbind() together for convenience
matched <- rbind(incidence, prevalence)
# For date outcome variables, add binary flag for convenience
matched <- matched %>%
mutate(cmd_outcome = ifelse(cmd_outcome_date != '2100-01-01', 1, 0),
cmd_outcome_hospital = ifelse(cmd_outcome_date_hospital != '2100-01-01', 1, 0),
smi_outcome = ifelse(smi_outcome_date != '2100-01-01', 1, 0),
smi_outcome_hospital = ifelse(smi_outcome_date_hospital != '2100-01-01', 1, 0),
self_harm_outcome = ifelse(self_harm_outcome_date != '2100-01-01', 1, 0),
self_harm_outcome_hospital = ifelse(self_harm_outcome_date_hospital != '2100-01-01', 1, 0),
other_mood_disorder_hospital_outcome = ifelse(other_mood_disorder_hospital_outcome_date != '2100-01-01', 1, 0),
other_mood_disorder_diagnosis_outcome = ifelse(other_mood_disorder_diagnosis_outcome_date != '2100-01-01', 1, 0))
# Create weights for groups
# weights for controls (1/n) where n controls in matching group
# Convert weight back to 1 for exposed people
matched <- matched %>%
group_by(group, group_id) %>%
mutate(weight = 1/(n()-1)) %>%
ungroup() %>%
mutate(weight = ifelse(exposed == 1, 1, weight))
# Split back out into the three groups
incidence <- matched %>% filter(group == 'incidence')
prevalence <- matched %>% filter(group == 'prevalence')
rm(matched)
count_outcomes <- function(df){
print('Pre-splitting history counts:')
print('cmd outcome')
df %>% pull(cmd_history) %>% table() %>% print()
print('cmd history hospital')
df %>% pull(cmd_history_hospital) %>% table() %>% print()
print('smi history')
df %>% pull(smi_history) %>% table() %>% print()
print('smi history hospital')
df %>% pull(smi_history_hospital) %>% table() %>% print()
print('self harm history')
df %>% pull(self_harm_history) %>% table() %>% print()
print('self harm history hospital')
df %>% pull(self_harm_history_hospital) %>% table() %>% print()
print('other mood disorder hospital history')
df %>% pull(other_mood_disorder_hospital_history) %>% table() %>% print()
print('other mood disorder diagnosis history')
df %>% pull(other_mood_disorder_diagnosis_history) %>% table() %>% print()
print('Pre-splitting outcome counts:')
print('cmd outcome')
df %>% pull(cmd_outcome) %>% table() %>% print()
print('cmd outcome hospital')
df %>% pull(cmd_outcome_hospital) %>% table() %>% print()
print('smi outcome')
df %>% pull(smi_outcome) %>% table() %>% print()
print('smi outcome hospital')
df %>% pull(smi_outcome_hospital) %>% table() %>% print()
print('self harm outcome')
df %>% pull(self_harm_outcome) %>% table() %>% print()
print('self harm outcome hospital')
df %>% pull(self_harm_outcome_hospital) %>% table() %>% print()
print('other mood disorder hospital history')
df %>% pull(other_mood_disorder_hospital_outcome) %>% table() %>% print()
print('other mood disorder diagnosis history')
df %>% pull(other_mood_disorder_diagnosis_outcome) %>% table() %>% print()
}
print('counting history & outcomes for incidence')
count_outcomes(incidence)
print('counting history & outcomes for prevalence')
count_outcomes(prevalence)
# At this point exacerbation group has so few outcome counts of cmd hosp,
# smi or self harm that analysis cannot be performed on this group
# For incidence and prevalence groups - group outcome into mh_outcome
# same for mental health history -> mh_history
group_outcomes_history <- function(df){
df <- df %>%
mutate(mh_outcome = pmax(cmd_outcome,
cmd_outcome_hospital,
smi_outcome,
smi_outcome_hospital,
self_harm_outcome,
self_harm_outcome_hospital,
other_mood_disorder_hospital_outcome,
other_mood_disorder_diagnosis_outcome)) %>%
mutate(mh_history = pmax(cmd_history,
cmd_history_hospital,
smi_history,
smi_history_hospital,
self_harm_history,
self_harm_history_hospital,
other_mood_disorder_hospital_history,
other_mood_disorder_diagnosis_history)) %>%
select(-cmd_outcome,
-cmd_outcome_hospital,
-smi_outcome,
-smi_outcome_hospital,
-self_harm_outcome,
-self_harm_outcome_hospital,
-other_mood_disorder_hospital_outcome,
-other_mood_disorder_diagnosis_outcome,
-cmd_history,
-cmd_history_hospital,
-smi_history,
-smi_history_hospital,
-self_harm_history,
-self_harm_history_hospital,
-other_mood_disorder_hospital_history,
-other_mood_disorder_diagnosis_history)
return(df)
}
incidence <- group_outcomes_history(incidence)
prevalence <- group_outcomes_history(prevalence)
write_csv(incidence, 'output/adjusted_incidence_group.csv')
write_csv(prevalence, 'output/adjusted_prevalence_group.csv')