generated from opensafely/research-template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_selection.R
134 lines (115 loc) · 4.29 KB
/
data_selection.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# # # # # # # # # # # # # # # # # # # # #
# This script:
# imports processed data
# filters out people who are excluded from the main analysis
# outputs inclusion/exclusions flowchart data
# # # # # # # # # # # # # # # # # # # # #
# Import libraries
library(tidyverse)
library(here)
library(glue)
# Select wave based on input arguments
# The first trailing command-line argument names the wave; when no arguments
# are supplied the script falls back to "wave4".
args <- commandArgs(trailingOnly = TRUE)
wave <- if (length(args) > 0) args[[1]] else "wave4"
# Import processed data
# The input for the selected wave lives at output/processed/input_<wave>.rds.
processed_path <- here::here(
  "output", "processed", paste0("input_", wave, ".rds")
)
data_processed <- read_rds(processed_path)

# Load the fct_case_when helper; it is used further down to label the
# flowchart criteria
source(here("analysis", "utils", "fct_case_when.R"))
# Define last exposure
# time_since_last_infection is copied from the wave-specific
# pre_<variant>_infection_days column; wave1 has no such column, so it is set
# to NA for every row. Only the arm matching `wave` is evaluated. An
# unrecognised wave stops the script here with a clear message instead of
# leaving the column undefined and failing later inside the selection criteria.
data_processed <- switch(
  wave,
  wave1 = mutate(
    data_processed,
    time_since_last_infection = NA
  ),
  wave2 = mutate(
    data_processed,
    time_since_last_infection = pre_alpha_infection_days
  ),
  wave3 = mutate(
    data_processed,
    time_since_last_infection = pre_delta_infection_days
  ),
  wave4 = mutate(
    data_processed,
    time_since_last_infection = pre_omicron_infection_days
  ),
  stop(
    "Unrecognised wave '", wave,
    "': expected one of wave1, wave2, wave3, wave4",
    call. = FALSE
  )
)
# Define selection criteria
# Builds one row per input row holding patient_id plus one logical flag per
# inclusion criterion, then combines the flags into `include`.
# NOTE(review): the three date checks compare against omicron_start_date for
# every wave, whereas time_since_last_infection is wave-specific -- confirm
# this is intended.
data_criteria <- data_processed %>%
  transmute(
    patient_id,

    # Made it into the study population with follow-up, valid age and sex
    study_definition = TRUE,
    has_follow_up = has_follow_up == 1,
    has_age = !is.na(age) & age >= 18 & age <= 110,
    has_sex = !is.na(sex),

    # At least 1 ICP flag
    is_ICP = organ_transplant == 1 |
      bone_marrow_transplant == 1 |
      haem_cancer == 1 |
      immunosuppression_diagnosis == 1 |
      immunosuppression_medication == 1 |
      radio_chemo == 1,

    # Demography
    has_imd = !is.na(imd),
    has_ethnicity = !is.na(ethnicity),
    has_region = !is.na(region),

    # Postvax events: each date is either absent or after omicron_start_date
    severe_date_check =
      is.na(covid_severe_date) | covid_severe_date > omicron_start_date,
    death_date_check =
      is.na(covid_death_date) | covid_death_date > omicron_start_date,
    noncoviddeath_date_check =
      is.na(died_any_date) | died_any_date > omicron_start_date,

    # No covid in past 90 days (a missing value counts as no recent covid)
    no_recent_covid =
      is.na(time_since_last_infection) | time_since_last_infection > 90
  ) %>%
  mutate(
    # Define primary outcome study population: every criterion must hold
    include = (
      has_follow_up & has_age & has_sex &
        is_ICP &
        has_imd & has_ethnicity & has_region &
        severe_date_check & death_date_check & noncoviddeath_date_check &
        no_recent_covid
    )
  )
# Create cohort data including patients fulfilling selection criteria
# Keep only the ids flagged for inclusion, then pull their full records back
# in from the processed data.
included_ids <- data_criteria %>%
  filter(include) %>%
  select(patient_id)
data_filtered <- left_join(included_ids, data_processed, by = "patient_id")

# Save data
# The output directory is created first if it does not exist yet.
output_dir <- here("output", "filtered")
fs::dir_create(output_dir)
filtered_path <- here::here(
  "output", "filtered", paste0("input_", wave, ".rds")
)
write_rds(data_filtered, filtered_path, compress = "gz")
#write_csv(data_filtered, here::here("output", "filtered", paste0("input_",wave,".csv")))
# Create flow chart
# c0..c4 are cumulative: each step keeps only rows that passed every earlier
# step and the newly applied criterion, so c4 matches the `include` flag used
# to build the filtered cohort.
data_flowchart <- data_criteria %>%
  transmute(
    c0 = (study_definition & has_follow_up & has_age & has_sex),
    c1 = c0 & is_ICP,
    c2 = c1 & (has_imd & has_ethnicity & has_region),
    c3 = c2 & (severe_date_check & death_date_check & noncoviddeath_date_check),
    c4 = c3 & no_recent_covid
  ) %>%
  summarise(
    # Count the rows that are TRUE at each step. Columns are selected
    # explicitly (across() without .cols is deprecated), and NA flags are
    # skipped so the counts agree with filter(include), which also drops rows
    # whose criterion evaluates to NA, rather than turning the whole column
    # into NA.
    across(everything(), function(x) sum(x, na.rm = TRUE))
  ) %>%
  pivot_longer(
    cols = everything(),
    names_to = "criteria",
    values_to = "n"
  ) %>%
  mutate(
    # Step-to-step attrition; the first row has no predecessor, so its
    # lag-based values are NA
    n_exclude = lag(n) - n,
    pct_exclude = n_exclude / lag(n),
    pct_all = n / first(n),
    pct_step = n / lag(n),
    # Replace the short step code with a readable description
    crit = str_extract(criteria, "^c\\d+"),
    criteria = fct_case_when(
      crit == "c0" ~ "Males and females aged >=18 years on index date with at least 3 months of continuous registration at a single GP",
      crit == "c1" ~ "Falls into at least one immunosuppression subgroup",
      crit == "c2" ~ "No missing demographic information (region, index of multiple deprivation, or ethnicity)",
      crit == "c3" ~ "No outcome or censoring events recorded before start of follow-up",
      crit == "c4" ~ "No evidence of SARS-CoV-2 infection in 90 days before index date",
      TRUE ~ NA_character_
    )
  )

# Save flowchart
# The output directory is created first if it does not exist yet.
output_dir <- here("output", "flowchart")
fs::dir_create(output_dir)
write_csv(
  data_flowchart,
  here::here("output", "flowchart", paste0("flowchart_", wave, ".csv"))
)