generated from opensafely/research-template
-
Notifications
You must be signed in to change notification settings - Fork 1
/
6_6_pre_post_match_testing.R
172 lines (117 loc) · 5.16 KB
/
6_6_pre_post_match_testing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# Compare the incidence group before and after matching: find the exposed
# (and, further down, the control) records that did not end up in the matched data
library(tidyverse)
library(data.table)
options(datatable.fread.datatable=FALSE)
# setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
# setwd('../')
################################################################################
# Load data for exposed and control population
################################################################################
# Exposed cohort: read the extract and tag every row with exposed = 1
exposed <- mutate(fread('output/cis_exposed.csv'), exposed = 1)

# Control cohort: read the extract and tag every row with exposed = 0
control <- mutate(fread('output/cis_control.csv'), exposed = 0)

# Temporary rbind() while type of outcome is determined:
# stack both cohorts into a single table
population <- rbind(exposed, control)
################################################################################
# Create different outcome groups per exposed and control population
# 2 types of outcome
################################################################################
### (1) Incidence group (new onset) ###
# Keep only people with no history of mental illness, i.e. none of the eight
# history flags equals 1. Rows for which the combined condition evaluates to NA
# (a missing flag with no flag equal to 1) are dropped as well, because
# filter() keeps only rows where the condition is TRUE.
incidence <- population %>%
  filter(!(
    cmd_history == 1 | cmd_history_hospital == 1 |
      smi_history == 1 | smi_history_hospital == 1 |
      other_mood_disorder_diagnosis_history == 1 |
      other_mood_disorder_hospital_history == 1 |
      self_harm_history == 1 | self_harm_history_hospital == 1
  ))

# Pre-matching exposed group, reduced to the columns used for the comparison
incidence_pre_exposed <- select(
  filter(incidence, exposed == 1),
  patient_id,
  visit_date,
  date_positive,
  end_date,
  exposed,
  result_mk,
  visit_num,
  last_linkage_dt,
  is_opted_out_of_nhs_data_share
)

# Pre-matching control group (all columns kept)
incidence_control <- filter(incidence, exposed == 0)

# The intermediate tables are no longer needed; drop them and release memory
rm(exposed, control, population, incidence)
gc()
# now load post-matching data: the exposed rows of the matched incidence group
incidence_post_exposed <- fread('output/incidence_group.csv') %>%
  select(patient_id,
         visit_date,
         date_positive,
         end_date,
         exposed,
         group_id,
         result_mk,
         visit_num,
         last_linkage_dt,
         is_opted_out_of_nhs_data_share) %>%
  filter(exposed == 1)

# Exposed records present before matching but absent afterwards.
# The comparison is made on patient_id only; result is ordered by date_positive.
unmatched_records <- anti_join(incidence_pre_exposed, incidence_post_exposed,
                               by = "patient_id") %>%
  arrange(date_positive)

# Results are wrapped in print() so they reach the log even when the script is
# run through source(), where top-level values are not auto-printed.

# number of rows
print('Number of unmatched exposed population - incidence')
print(nrow(unmatched_records))

# dates - summary of date_positive
print('Summary of the index date (date_positive) variable')
print(summary(unmatched_records$date_positive))

# The ordered dates are not printed here; they are stored in dates_exposed,
# which is exported to CSV later in the script.
# data.frame() on a `$` expression names the column
# `unmatched_records.date_positive`; that header is kept as is.
print('index dates in the unmatched in order')
dates_exposed <- data.frame(unmatched_records$date_positive)

# NOTE(review): the two tables below are labelled "index dates" but are grouped
# on visit_date, whereas the summary above uses date_positive - confirm which
# column is intended.
# print(n = Inf) shows every row; the default tibble print would cut a table
# of more than 20 rows down to its first 10.
print('Count of index dates by year')
unmatched_records %>%
  group_by(year = year(visit_date)) %>%
  count() %>%
  print(n = Inf)

print('Count of index dates by month and year')
unmatched_records %>%
  group_by(year = year(visit_date), month = month(visit_date)) %>%
  count() %>%
  print(n = Inf)
################################################################################
################################################################################
################################################################################
################################################################################
################################################################################
################################################################################
# check the remaining control population to see if there are no more possible matches
# Post-matching data: the control rows of the matched incidence group
incidence_post_control <- fread('output/incidence_group.csv') %>%
  select(patient_id,
         visit_date,
         date_positive,
         end_date,
         exposed,
         group_id,
         result_mk,
         visit_num,
         last_linkage_dt,
         is_opted_out_of_nhs_data_share) %>%
  filter(exposed == 0)

# Control records present before matching (incidence_control) but absent from
# the matched controls. The comparison is made on patient_id only; result is
# ordered by date_positive. This overwrites the exposed-side unmatched_records.
unmatched_records <- anti_join(incidence_control, incidence_post_control,
                               by = "patient_id") %>%
  arrange(date_positive)

# Results are wrapped in print() so they reach the log even when the script is
# run through source(), where top-level values are not auto-printed.

# number of rows
print('Number of unmatched control population - incidence')
print(nrow(unmatched_records))

# dates - summary of date_positive
print('Summary of the index date (date_positive) variable-control')
print(summary(unmatched_records$date_positive))

# The ordered dates are not printed here; they are stored in dates_control,
# which is exported to CSV later in the script.
# data.frame() on a `$` expression names the column
# `unmatched_records.date_positive`; that header is kept as is.
print('index dates in the unmatched in order-control')
dates_control <- data.frame(unmatched_records$date_positive)

# NOTE(review): the two tables below are labelled "index dates" but are grouped
# on visit_date, whereas the summary above uses date_positive - confirm which
# column is the index date for controls.
# print(n = Inf) shows every row; the default tibble print would cut a table
# of more than 20 rows down to its first 10.
print('Count of index dates by year-control')
unmatched_records %>%
  group_by(year = year(visit_date)) %>%
  count() %>%
  print(n = Inf)

print('Count of index dates by month and year-control')
unmatched_records %>%
  group_by(year = year(visit_date), month = month(visit_date)) %>%
  count() %>%
  print(n = Inf)
# Export the ordered date_positive values of the unmatched records,
# one single-column CSV per group
readr::write_csv(dates_exposed, 'output/dates_order_exposed.csv')
readr::write_csv(dates_control, 'output/dates_order_control.csv')