generated from opensafely/research-template
/
data_properties.R
133 lines (103 loc) · 3.39 KB
/
data_properties.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
################################################################################
#
# Description: This script takes a dataset, summarises the variables using:
# * skimr::skim(),
# * class(),
# * and redacted summary tabulations (categorical, numeric and date variables),
# and saves the output to a .txt file
#
# The script should only be run via an action in the project.yaml
# The script must be accompanied by command-line arguments:
# first the two datasets (.csv files) that are to be summarised,
# then the directory where the txt output will be saved
#
# Input: /output/input_control.csv
#
# Output: /output/data_properties/input_control*.txt
#
# Author(s): W. Hulme (edited by M. Green)
# Date last updated: 04/07/2023
#
################################################################################
# Preliminaries ----
## Import libraries
library('tidyverse')
library('here')
library('lubridate')
## Custom functions
source(here("analysis", "lib", "custom_functions.R"))
## Import command-line arguments

# Split the trailing command-line arguments into input CSV paths and an
# output directory.
#
# args: character vector as returned by commandArgs(trailingOnly = TRUE).
#
# Returns a list with two elements:
#   csv_files  - list of CSV paths (one character scalar per element)
#   output_dir - directory in which the .txt summaries are written
#
# With no arguments the interactive-testing defaults are returned. Otherwise
# every argument except the last is treated as a CSV path and the last one as
# the output directory, so the usual "two CSVs + directory" call is unchanged
# while a single CSV (or more than two) is also accepted. A lone argument is
# rejected with an explicit error instead of an opaque "subscript out of
# bounds".
parse_data_properties_args <- function(args) {
  n_args <- length(args)

  if (n_args == 0) {
    # use for interactive testing
    return(list(
      csv_files = list(
        "output/input_control.csv",
        "output/input_treatment.csv"
      ),
      output_dir = "output/data_properties"
    ))
  }

  if (n_args < 2) {
    stop(
      "expected one or more .csv files followed by an output directory",
      call. = FALSE
    )
  }

  list(
    csv_files = as.list(args[-n_args]),
    output_dir = args[[n_args]]
  )
}

args <- commandArgs(trailingOnly = TRUE)

# Data properties tables ----
parsed_args <- parse_data_properties_args(args)
csv_files <- parsed_args$csv_files
output_dir <- parsed_args$output_dir
# Summarise every input CSV and write three .txt reports per file into
# `output_dir`:
#   <name>_skim.txt      - skimr overview of all variables
#   <name>_coltypes.txt  - class of each column
#   <name>_tabulate.txt  - redacted tabulations (categorical, numeric, dates)
# Relies on `csv_files` and `output_dir` set above, and on the
# redacted_summary_* / print_* helpers, presumably defined in the sourced
# analysis/lib/custom_functions.R (not visible here - confirm).

# Output summary .txt
# Both settings are loop-invariant, so they are applied once up front.
options(width = 200) # set output width for capture.output
dir.create(here(output_dir), showWarnings = FALSE, recursive = TRUE)

# Iterate over the elements directly: `1:length(x)` counts 1, 0 when `x` is
# empty and would index out of bounds instead of skipping the loop.
for (csv_file in csv_files) {

  ## Specify data
  stopifnot("must pass an .csv file" = fs::path_ext(csv_file) == "csv")
  filenamebase <- fs::path_ext_remove(fs::path_file(csv_file))

  ## Import data
  data <- readr::read_csv(here(csv_file))

  ## High-level variable overview ----
  capture.output(
    skimr::skim_without_charts(data),
    file = here(output_dir, paste0(filenamebase, "_skim", ".txt")),
    split = FALSE
  )

  ## list of column types ----
  capture.output(
    lapply(data, class),
    file = here(output_dir, paste0(filenamebase, "_coltypes", ".txt"))
  )

  ## tabulated data ----

  # All three tabulations below go to the same file: the first call creates
  # it (append = FALSE), the following two append to it.
  tabulate_file <- here(output_dir, paste0(filenamebase, "_tabulate", ".txt"))

  # delete file if it exists
  if (file.exists(tabulate_file)) {
    file.remove(tabulate_file)
  }

  # Columns whose name ends in "_id" are excluded from every tabulation.
  data_no_ids <- data %>%
    select(-ends_with("_id"))

  ### categorical and logical ----
  sumtabs_cat <- data_no_ids %>%
    select(where(is.character), where(is.logical), where(is.factor)) %>%
    map(redacted_summary_cat) %>%
    enframe()

  capture.output(
    walk2(sumtabs_cat$value, sumtabs_cat$name, print_cat),
    file = tabulate_file,
    append = FALSE
  )

  ### numeric ----
  # The predicate is evaluated once per column and must return a single
  # TRUE/FALSE, hence the scalar `&&` rather than the elementwise `&`.
  sumtabs_num <- data_no_ids %>%
    select(where(function(x) {
      !is.logical(x) && is.numeric(x) && !is.Date(x)
    })) %>%
    map(redacted_summary_num) %>%
    enframe()

  capture.output(
    walk2(sumtabs_num$value, sumtabs_num$name, print_num),
    file = tabulate_file,
    append = TRUE
  )

  ### dates ----
  sumtabs_date <- data_no_ids %>%
    select(where(is.Date)) %>%
    map(redacted_summary_date) %>%
    enframe()

  # NOTE(review): date summaries are printed with print_num, as in the
  # original script - confirm this is intended rather than a date-specific
  # printer.
  capture.output(
    walk2(sumtabs_date$value, sumtabs_date$name, print_num),
    file = tabulate_file,
    append = TRUE
  )
}