generated from opensafely/covid-vaccine-research-template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_properties.R
109 lines (79 loc) · 2.66 KB
/
data_properties.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#################
# This script takes a dataset and summarises its variables using:
# * skimr::skim_without_charts(),
# * class(),
# * and redacted tabulations / summaries of each variable,
# and saves the output to .txt files (one each for skim, column types, and tabulations)
# The script should only be run via an action in the project.yaml
# The script must be accompanied by two arguments
# The first is the dataset, saved as an .rds file, that is to be summarised
# The second is the directory where the .txt output will be saved
#################
# import libraries
library('tidyverse')
source(here::here("lib", "redaction_functions.R"))
# import command-line arguments ----
# Two trailing arguments are expected:
#   1. path to the .rds dataset to summarise (resolved relative to the project root via here::here)
#   2. directory where the .txt outputs are written
args <- commandArgs(trailingOnly = TRUE)

# Fail early with a readable message; without this check a missing argument
# surfaces as an opaque "subscript out of bounds" error from args[[1]] / args[[2]].
stopifnot(
  "must pass two arguments: an .rds file and an output directory" = length(args) >= 2
)

rds_file <- args[[1]]
output_dir <- args[[2]]
# for interactive development, use e.g.:
# rds_file <- "output/data/data_vaccinated.rds"
# output_dir <- "output/data_properties"

stopifnot("must pass an .rds file" = fs::path_ext(rds_file) == "rds")

# file name without directory or extension; used as the prefix of every output file
filenamebase <- fs::path_ext_remove(fs::path_file(rds_file))

# Import processed data ----
data <- readr::read_rds(here::here(rds_file))
# Output summary .txt ----

# use a wide print width for everything captured below
options(width = 200)

# create the output directory (and any parents); no warning if it already exists
dir.create(here::here(output_dir), showWarnings = FALSE, recursive = TRUE)

## high-level variable overview ----
skim_file <- here::here(output_dir, paste0(filenamebase, "_skim", ".txt"))
capture.output(
  skimr::skim_without_charts(data),
  file = skim_file,
  split = FALSE
)

## list of column types ----
coltypes_file <- here::here(output_dir, paste0(filenamebase, "_coltypes", ".txt"))
capture.output(
  lapply(data, class),
  file = coltypes_file
)
## tabulated data ----

# all tabulations (categorical, numeric, dates) go into this one file
tabulate_file <- here::here(output_dir, paste0(filenamebase, "_tabulate", ".txt"))

# start from a clean slate: remove any file left over from a previous run
if (file.exists(tabulate_file)) {
  file.remove(tabulate_file)
}

### categorical and logical ----
# drop identifier columns, then keep character, logical and factor columns (in that order)
# and build one redacted summary per column
sumtabs_cat <-
  data %>%
  select(-ends_with("_id")) %>%
  select(where(is.character), where(is.logical), where(is.factor)) %>%
  map(redacted_summary_cat) %>%
  enframe()

# print each summary together with its variable name; this first section
# overwrites the file (append = FALSE), later sections append to it
capture.output(
  walk2(
    .x = sumtabs_cat$value,
    .y = sumtabs_cat$name,
    .f = print_cat
  ),
  file = tabulate_file,
  append = FALSE
)
### numeric ----
# Summarise every numeric column, excluding identifier columns (names ending in "_id").
# The predicate is evaluated once per column and must yield a single TRUE/FALSE,
# so the scalar, short-circuiting `&&` is used rather than elementwise `&`.
# The logical and Date exclusions are kept explicit so that such columns are
# never routed to the numeric summary.
sumtabs_num <-
  data %>%
  select(-ends_with("_id")) %>%
  select(where(~ is.numeric(.x) && !is.logical(.x) && !is.Date(.x))) %>%
  map(redacted_summary_num) %>%
  enframe()

# append the numeric summaries to the shared tabulation file
capture.output(
  walk2(sumtabs_num$value, sumtabs_num$name, print_num),
  file = here::here(output_dir, paste0(filenamebase, "_tabulate", ".txt")),
  append = TRUE
)
### dates ----
# same tabulation file as the categorical and numeric sections
dates_out_file <- here::here(output_dir, paste0(filenamebase, "_tabulate", ".txt"))

# drop identifier columns, keep Date columns, and build one redacted summary per column
sumtabs_date <-
  data %>%
  select(-ends_with("_id")) %>%
  select(where(is.Date)) %>%
  map(redacted_summary_date) %>%
  enframe()

# NOTE(review): date summaries are printed with print_num (the numeric printer);
# presumably intentional because the summaries share a layout -- confirm against
# lib/redaction_functions.R
capture.output(
  walk2(
    .x = sumtabs_date$value,
    .y = sumtabs_date$name,
    .f = print_num
  ),
  file = dates_out_file,
  append = TRUE
)