generated from opensafely/research-template
/
01_data_properties.R
108 lines (83 loc) · 2.74 KB
/
01_data_properties.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
######################################
# This script:
# - Takes a dataset and summarises the variables using:
# * skimr::skim(),
# * class(),
# * walk2()
# - It saves the output to three .txt files (suffixes _skim, _coltypes and _tabulate)
# - The script should only be run via an action in the project.yaml
# - The script must be accompanied by two arguments
# - The first is the dataset, saved as an .rds file, that is to be summarised
# - The second is the directory where the .txt output will be saved
######################################
# Preliminaries ----
## Import libraries
library('tidyverse')
source(here::here("lib", "redaction_functions.R"))
## Import command-line arguments ----
# Two positional arguments are expected after the script name:
#   1. path to the .rds dataset to summarise
#   2. directory in which the .txt outputs are written
args <- commandArgs(trailingOnly = TRUE)

# Fail early with a readable message; indexing a too-short `args` with `[[`
# would otherwise stop with an opaque "subscript out of bounds" error
stopifnot(
  "must pass two arguments: an .rds file and an output directory" =
    length(args) >= 2
)

rds_file <- args[[1]]
output_dir <- args[[2]]

# Example values (handy when stepping through the script by hand):
# rds_file <- "output/data/data_all.rds"
# output_dir <- "output/data_properties"

stopifnot("must pass an .rds file" = fs::path_ext(rds_file) == "rds")

## Base file name ----
# File name of the dataset with directory and extension stripped; it is the
# prefix of every output file name
filenamebase <- fs::path_ext_remove(fs::path_file(rds_file))

## Import processed data ----
# `rds_file` is resolved relative to the project root located by here::here()
data <- readr::read_rds(here::here(rds_file))
# Output summary .txt ----

# Use a wide print width so the text captured below is laid out for 200
# columns
options(width = 200)

# Create the output directory (and any missing parents); stays silent if it
# is already there
dir.create(here::here(output_dir), recursive = TRUE, showWarnings = FALSE)

# Summaries

## High-level variable overview
# NOTE: `data` is passed directly rather than piped so that the call skimr
# sees is unchanged
skim_path <- here::here(output_dir, paste0(filenamebase, "_skim", ".txt"))
capture.output(skimr::skim_without_charts(data), file = skim_path)

## List of column types
coltypes_path <- here::here(
  output_dir,
  paste0(filenamebase, "_coltypes", ".txt")
)
capture.output(lapply(data, class), file = coltypes_path)
## Tabulated data

# Path of the tabulation file written by this section
tabulate_path <- here::here(
  output_dir,
  paste0(filenamebase, "_tabulate", ".txt")
)

### Remove the tabulation file if one is already present
if (file.exists(tabulate_path)) {
  file.remove(tabulate_path)
}

### Categorical and logical
# Drop columns whose name ends in "_id", then keep character, logical and
# factor columns (selected in that order)
categorical_cols <- data %>%
  select(-ends_with("_id")) %>%
  select(where(is.character), where(is.logical), where(is.factor))

# One summary per column, stored as a two-column tibble (name, value)
sumtabs_cat <- enframe(map(categorical_cols, redacted_summary_cat))

# Print each summary alongside its column name; append = FALSE starts the
# file afresh
capture.output(
  walk2(sumtabs_cat[["value"]], sumtabs_cat[["name"]], print_cat),
  file = tabulate_path,
  append = FALSE
)
### Numeric
# Summarise every numeric column (excluding "_id" columns, logicals and
# dates) with redacted_summary_num() and append the printed result to the
# "_tabulate" file.
sumtabs_num <-
  data %>%
  select(-ends_with("_id")) %>%
  # The predicate runs once per column and must yield a single TRUE/FALSE,
  # hence scalar `&&`. inherits() is base R, so the Date check does not rely
  # on a date package being attached.
  select(
    where(~ !is.logical(.x) && is.numeric(.x) && !inherits(.x, "Date"))
  ) %>%
  map(redacted_summary_num) %>%
  enframe()

capture.output(
  walk2(sumtabs_num$value, sumtabs_num$name, print_num),
  file = here::here(output_dir, paste0(filenamebase, "_tabulate", ".txt")),
  append = TRUE
)
### Dates
# Summarise every Date column (excluding "_id" columns) with
# redacted_summary_date() and append the printed result to the "_tabulate"
# file.
sumtabs_date <-
  data %>%
  select(-ends_with("_id")) %>%
  # inherits() is base R, so the Date check does not rely on a date package
  # being attached
  select(where(~ inherits(.x, "Date"))) %>%
  map(redacted_summary_date) %>%
  enframe()

# NOTE(review): date summaries are printed with print_num(); no date-specific
# printer is visible in this script -- confirm this is intended.
capture.output(
  walk2(sumtabs_date$value, sumtabs_date$name, print_num),
  file = here::here(output_dir, paste0(filenamebase, "_tabulate", ".txt")),
  append = TRUE
)