generated from opensafely/research-template
/
020_baseline_characteristics.R
161 lines (128 loc) · 6.1 KB
/
020_baseline_characteristics.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# Program Information ----------------------------------------------------
# Program: 020_baseline_characteristics
# Author: Anna Schultze
# Description: Summarise cleaned study_population file into a table one of
# baseline characteristics
# Input: study_population_[year].csv
# Output: table1_[year].txt
# Edits:
# Housekeeping -----------------------------------------------------------
# load packages
library(tidyverse)
library(data.table)
library(janitor)
library(lubridate)
# make sure my favoured output folder exists
mainDir <- getwd()
subDir <- "./analysis/outfiles"
if (file.exists(subDir)){
print("Out directory exists")
} else {
dir.create(file.path(mainDir, subDir))
print("Out directory didn't exist, but I created it")
}
# Read in arguments supplied through project.yaml
# this allows the script to be run for several study populations at different times
args = commandArgs(trailingOnly=TRUE)
print("These are my input arguments")
print(args[1])
print(args[2])
inputdata <- toString(args[1])
outputdata <- toString(args[2])
# Read in Data ------------------------------------------------------------
study_population <- fread(inputdata, data.table = FALSE, na.strings = "")
# Data Management ----------------------------------------------------------
# table of baseline characteristics by care home vs. not care home and overall
# create row grouping for overall estimates
overall_summary <-study_population %>%
mutate(care_home_group = "Overall")
# create row grouping for care gome status and add overall grouping
summary <- study_population %>%
mutate(care_home_group = ifelse(care_home, "Care_or_Nursing_Home", "Private_Home")) %>%
bind_rows(overall_summary)
# create variable always == 1 for a total row
summary <- summary %>%
mutate(total = 1)
tabyl(summary$care_home_group)
# Define Functions --------------------------------------------------------
# these are not particularly generalised at the moment
# I wanted to write small discrete functions that summarised and then reformatted a single variable, which can then be applied many times to build a table one
# suggestions for improvements v welcome, particularly around how to get these to save what they are doing to the dataset, current solution a little hacky
# function for summarising categorical variable in table format
#-- the input values are:
#-- x = the variable (binary or factor)
#-- y = A "display name" for the table, as I can't get R to like labels
#-- the function 1) calculates counts & percentages by care home,
#-- 2) reorders the columns to be in a logical order
#-- 3) adds some descriptive text for printing (variable and level names)
tabulate_me <- function(x, y) {
table <- summary %>%
group_by(care_home_group, {{x}}) %>%
summarise(Count = n()) %>%
mutate(Percentage = round((Count/sum(Count)),4)*100) %>%
pivot_wider(names_from = c(care_home_group), values_from=c(Count, Percentage),
names_glue = "{care_home_group}_{.value}") %>%
rename(varlevel = {{x}}) %>%
mutate(varlevel = as.character(varlevel)) %>%
select(varlevel, (matches("Over*")), (matches("Care*")), (matches("Priv*"))) %>%
mutate(row = row_number()) %>%
mutate(varname = case_when(
row == 1 ~ as_label(enquo(y))
)) %>%
select(varname, everything()) %>%
select(-c(row))
bind_rows(table_1, table)
}
# function for summarising continous variables
#-- the input values are:
#-- x = the variable (numeric)
#-- y = display name
#-- the function is as above, but presents mean and SD
summarise_me <- function(x, y) {
table <- summary %>%
group_by(care_home_group) %>%
summarise(Mean = round(mean({{x}}),0), SD = round(sd({{x}}),0)) %>%
pivot_wider(names_from = c(care_home_group), values_from=c(Mean, SD),
names_glue = "{care_home_group}_{.value}") %>%
rename(Overall_Count = Overall_Mean,
Overall_Percentage = Overall_SD,
Care_or_Nursing_Home_Count = Care_or_Nursing_Home_Mean,
Care_or_Nursing_Home_Percentage = Care_or_Nursing_Home_SD,
Private_Home_Count = Private_Home_Mean,
Private_Home_Percentage = Private_Home_SD) %>%
mutate(varlevel = "Mean, SD",
varlevel = as.character(varlevel)) %>%
select(varlevel, (matches("Over*")), (matches("Care*")), (matches("Priv*"))) %>%
mutate(row = row_number()) %>%
mutate(varname = case_when(
row == 1 ~ as_label(enquo(y))
)) %>%
select(varname, everything()) %>%
select(-c(row))
bind_rows(table_1, table)
}
# create an empty table structure to append generated tables to
# if someone can think of a neater way to do this, please let me know
# i think I might be approaching it too much like table outputting in SAS
table_1 <- NULL
# tabulate and summarise the variables I want
table_1 <- tabulate_me(x = total, y = Total)
table_1 <- tabulate_me(x = care_home_cat, y = Care_Home_Type)
table_1 <- tabulate_me(x = sex, y = Gender)
table_1 <- summarise_me(x = age, y = Age_in_Years)
table_1 <- tabulate_me(x = ethnicity_cat, y = Self-reported_Ethnicity)
table_1 <- tabulate_me(x = region, y = Geographical_Region)
table_1 <- tabulate_me(x = rural_urban, y = Rural_or_Urban_Area)
table_1 <- tabulate_me(x = imd_cat, y = Quintile_of_Index_of_Multiple_Deprivation)
table_1 <- tabulate_me(x = diabetes, y = Diabetes)
table_1 <- tabulate_me(x = ckd, y = Chronic_Kidney_Disease)
table_1 <- tabulate_me(x = lung_cancer, y = Lung_Cancer)
table_1 <- tabulate_me(x = haem_cancer, y = Haematological_Cancer)
table_1 <- tabulate_me(x = other_cancer, y = Other_Cancer)
table_1 <- tabulate_me(x = chronic_liver_disease, y = Chronic_Liver_Disease)
table_1 <- tabulate_me(x = chronic_cardiac_disease, y = Chronic_Cardiac_Disease)
table_1 <- tabulate_me(x = chronic_respiratory_disease, y = Chronic_Respiratory_Disease)
table_1 <- tabulate_me(x = stroke, y = History_of_Stroke)
table_1 <- tabulate_me(x = dementia, y = Dementia)
# export tbe table as a nice text file
write.table(table_1, file = outputdata, sep = "\t", na = "", row.names=FALSE)