generated from opensafely/research-template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_tests.R
105 lines (92 loc) · 3.75 KB
/
process_tests.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
################################################################################
# process tests data
library(tidyverse)
library(lubridate)
library(glue)
################################################################################
## source functions
# source(here::here("analysis", "lib", "data_properties.R"))
################################################################################
## create folders for outputs
# creates the "images" and "tables" sub-directories of output/tests, in that order
for (subdir in c("images", "tables")) {
  fs::dir_create(here::here("output", "tests", subdir))
}
################################################################################
# read the extracted tests data (feather) and the eligible-patients table (csv)
# the log message ends with a newline so consecutive messages do not run together
cat("--- read input_tests.feather ----\n")
data_tests_0 <- arrow::read_feather(
  file = here::here("output", "input_tests.feather")
)
# data_eligible_e is joined to the tests data on patient_id further down,
# which is where the `arm` column used for plotting comes from
data_eligible_e <- readr::read_csv(
  here::here("output", "data", "data_eligible_e.csv")
)
# the log message ends with a newline so consecutive messages do not run together
cat("--- process input_tests.feather ----\n")
# convert a column to Date (parsing strings as %Y-%m-%d) floored to whole days
as_day <- function(x) {
  floor_date(as.Date(x, format = "%Y-%m-%d"), unit = "days")
}
# apply the conversion to every column whose name contains "_date"
data_tests <- data_tests_0 %>%
  mutate(across(contains("_date"), as_day)) #%>%
# mutate(
# covid_test_both_elig_n = covid_test_pre_elig_n + covid_test_post_elig_n
# ) %>%
# mutate(across(starts_with("covid_test"),
# ~ cut(.x,
# breaks = c(-Inf, 0, 4, Inf),
# labels = c("0", "1-4", "5+"),
# right = TRUE,
# include.lowest = TRUE))) %>%
# select(-elig_date)
# cat("--- check categorised variables ----")
# data_tests %>% select(starts_with("covid_test")) %>% summary()
# write the processed tests data as a gzip-compressed rds file
# the log message ends with a newline so consecutive messages do not run together
cat("--- save data_tests.rds ----\n")
data_tests_path <- here::here("output", "data", "data_tests.rds")
readr::write_rds(data_tests, data_tests_path, compress = "gz")
################################################################################
# tabulate all vars
# this causes the R session to abort - investigate
# data_properties(
# data = data_tests,
# path = file.path("output", "tests", "tables")
# )
################################################################################
# plot distribution of covariates
# the log message ends with a newline so consecutive messages do not run together
cat("--- plot covariates ----\n")
# long-format data for the plot: the two test-count columns are stacked into
# `name` / `value` (one row per input row per column), then the columns of
# data_eligible_e (including `arm`) are attached by patient_id
plot_data <- data_tests %>%
  select(patient_id, covid_test_pre_elig_n, covid_test_post_elig_n) %>%
  pivot_longer(cols = -patient_id) %>%
  left_join(data_eligible_e, by = "patient_id") %>%
  # call factor() directly instead of forwarding its arguments through
  # across(..., factor, levels = , labels = ), which dplyr >= 1.1 deprecates;
  # any value not listed in `levels` becomes NA
  mutate(
    arm = factor(
      arm,
      levels = c("unvax", "vax"),
      labels = c("unvaccinated", "vaccinated")
    ),
    name = factor(
      name,
      levels = c("covid_test_pre_elig_n", "covid_test_post_elig_n"),
      labels = c(
        "pre 1st dose eligibility",
        "6 weeks post 1st dose eligibility"
      )
    )
  )
# lower limit for the y axis: the proportion equivalent to 5 rows in an arm
# (the larger of the per-arm values), so bars below it fall under the axis
min_y <- plot_data %>%
  # only count rows that can contribute to a bar:
  # - rows with a missing arm are never plotted, and would otherwise form
  #   their own (possibly tiny) group that inflates the threshold
  # - rows with a missing value are presumably dropped by geom_bar's count
  #   stat (TODO confirm), so they are excluded to keep n in line with the
  #   denominator of the plotted proportions
  filter(!is.na(arm), !is.na(value)) %>%
  # n = rows per arm in the long data, i.e. both test-count variables combined
  count(arm) %>%
  mutate(p = 5 / n) %>%
  summarise(p = max(p))
# upper limit for the x axis (number of tests)
x_trunc <- 10
# y-axis truncation point as a percentage (1 significant figure) for the caption
min_y_pct <- signif(100 * min_y$p, 1)
# overlaid bar charts of the number of tests in each arm, one facet per
# test-count variable. The plot is stored in an object and passed to ggsave()
# explicitly, rather than being auto-printed and picked up via last_plot().
# Each arm is a separate layer so its proportions use that arm's own total.
# NOTE(review): sum(count) is presumably taken over all facets of a layer, so
# an arm's bars sum to 100% across both facets combined - confirm intended.
covariate_plot <- ggplot(NULL, aes(x = value)) +
  geom_bar(
    data = plot_data %>% filter(arm == "vaccinated"),
    aes(fill = arm, y = after_stat(count / sum(count))),
    alpha = 0.5, width = 1
  ) +
  geom_bar(
    data = plot_data %>% filter(arm == "unvaccinated"),
    aes(fill = arm, y = after_stat(count / sum(count))),
    alpha = 0.5, width = 1
  ) +
  scale_y_continuous(labels = scales::percent) +
  facet_wrap(~ name, scales = "free") +
  labs(
    y = "percent",
    x = "number of SARS-CoV-2 tests",
    caption = glue(
      "x-axis truncated at {x_trunc}, y-axis truncated at {min_y_pct}% ",
      "to mask bars corresponding to < 5 individuals"
    )
  ) +
  # coord_cartesian zooms without dropping data, so the proportions are still
  # computed from all rows; only the visible window is restricted
  coord_cartesian(xlim = c(0, x_trunc), ylim = c(min_y$p, NA)) +
  scale_fill_discrete(name = NULL) +
  theme(legend.position = "bottom")
# the log message ends with a newline so consecutive messages do not run together
cat("--- save plot ----\n")
ggsave(
  filename = here::here("output", "tests", "images", "covariate_distribution.png"),
  plot = covariate_plot,
  width = 15, height = 20, units = "cm"
)