generated from opensafely/research-template
/
preprocess_data.R
141 lines (106 loc) · 5.78 KB
/
preprocess_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
##################################################################################
#
# Description: This script reads in the input data and prepares it for data cleaning.
#
# Input: output/input.feather
# Output: output/input.rds, output/venn.rds,
#         output/not-for-review/describe_input_stage0.txt
#
# Author(s): Rachel Denholm, Kurt Taylor
#
# Date last updated:
#
##################################################################################
# Load libraries ---------------------------------------------------------------
library(magrittr)
library(tidyverse)
library(lubridate)
# Output folders ---------------------------------------------------------------
# Make sure both output sub-folders exist before anything is written to them
purrr::walk(
  c("not-for-review", "review"),
  function(folder) fs::dir_create(here::here("output", folder))
)
# Define parameters ------------------------------------------------------------
## Study start date
## NOTE(review): not referenced again in this script; possibly read by sourced
## code - confirm before removing
study_start <- "2020-01-01"
## Load dataset (path is relative to the working directory)
df <- arrow::read_feather("output/input.feather")
# Create derived variables -------------------------------------------------------
# These could not be created in the common variables file
df <- df %>%
  mutate(
    # total diabetes code counts = SNOMED count + HES count
    tmp_out_count_t2dm = tmp_out_count_t2dm_snomed + tmp_out_count_t2dm_hes,
    tmp_out_count_t1dm = tmp_out_count_t1dm_snomed + tmp_out_count_t1dm_hes,
    # cholesterol ratio: total cholesterol / HDL cholesterol
    cov_num_tc_hdl_ratio = tmp_cov_num_cholesterol / tmp_cov_num_hdl_cholesterol,
    # NaN and +/-Inf (e.g. from division by zero) become NA; existing NAs stay NA
    # (probably only an issue with dummy data)
    cov_num_tc_hdl_ratio = replace(cov_num_tc_hdl_ratio,
                                   !is.finite(cov_num_tc_hdl_ratio),
                                   NA)
  )
print("Diabetes count variables created successfully")
# History of obesity: existing obesity flag OR an "Obese" BMI group --------------
df <- df %>%
  # the logical expression already yields TRUE / FALSE / NA, so it is used directly
  mutate(cov_bin_obesity = (cov_bin_obesity == TRUE | cov_cat_bmi_groups == "Obese")) %>%
  # the numeric BMI column is dropped from here on
  dplyr::select(-cov_num_bmi)
# Coerce column types based on name patterns -------------------------------------
# Order matters: each step sees the output of the previous one, and a column whose
# name matches several patterns is converted by each matching step in turn.
df <- df %>%
  # rename first so these two date columns no longer match the "_num" pattern
  rename(
    tmp_out_max_hba1c_mmol_mol_date = tmp_out_num_max_hba1c_date,
    tmp_out_bmi_date_measured = cov_num_bmi_date_measured
  ) %>%
  # dates
  mutate(across(contains("_date"), function(col) as.Date(as.character(col)))) %>%
  # birth year: keep only the four-digit year (as text at this point)
  mutate(across(contains("_birth_year"), function(col) format(as.Date(col), "%Y"))) %>%
  # numerics
  mutate(across(contains("_num"), as.numeric)) %>%
  # factors
  mutate(across(contains("_cat"), as.factor)) %>%
  # logicals
  mutate(across(contains("_bin"), as.logical))
print("Columns formatted successfully")
# COVID-19 severity subgroup -----------------------------------------------------
# "hospitalised"     : confirmed infection with a hospital date 0 to <29 days after it
# "non_hospitalised" : any other confirmed infection
# "no_infection"     : no confirmed infection date
df <- df %>%
  mutate(sub_cat_covid19_hospital = factor(case_when(
    !is.na(exp_date_covid19_confirmed) &
      !is.na(sub_date_covid19_hospital) &
      sub_date_covid19_hospital - exp_date_covid19_confirmed >= 0 &
      sub_date_covid19_hospital - exp_date_covid19_confirmed < 29 ~ "hospitalised",
    !is.na(exp_date_covid19_confirmed) ~ "non_hospitalised",
    is.na(exp_date_covid19_confirmed) ~ "no_infection"
  )))
# Derive the variables needed by the diabetes algorithm --------------------------
df <- df %>%
  mutate(
    # calendar year of the first diabetes diagnosis, as an integer
    tmp_out_year_first_diabetes_diag =
      as.integer(format(tmp_out_date_first_diabetes_diag, "%Y")),
    # age at first diagnosis = diagnosis year - birth year
    age_1st_diag = tmp_out_year_first_diabetes_diag - qa_num_birth_year
  ) %>%
  # negative ages are implausible: set them to NA
  mutate(age_1st_diag = replace(age_1st_diag, which(age_1st_diag < 0), NA)) %>%
  # "Yes" when aged < 35 with ethnicity code 1, 2 or 5, or aged < 30 (any ethnicity).
  # The whole age test is parenthesised so that the is.na() guard applies to both
  # branches: `&` binds tighter than `|`, so without the parentheses the guard only
  # covered the first branch and a missing age produced NA instead of "No".
  # A known age of 30-34 with missing ethnicity still gives NA.
  mutate(age_under_35_30_1st_diag = ifelse(
    !is.na(age_1st_diag) &
      ((age_1st_diag < 35 &
          (cov_cat_ethnicity == 1 | cov_cat_ethnicity == 2 | cov_cat_ethnicity == 5)) |
         (age_1st_diag < 30)),
    "Yes", "No"
  )) %>%
  mutate(
    # HbA1c date, kept only for those whose max HbA1c is >= 47.5 (NA otherwise)
    hba1c_date_step7 = as_date(case_when(
      tmp_out_num_max_hba1c_mmol_mol >= 47.5 ~ tmp_out_max_hba1c_mmol_mol_date
    )),
    # process-code date, kept only for those with 5 or more process codes (NA otherwise)
    # (the former single-argument pmin(x, na.rm = TRUE) wrappers returned x unchanged)
    over5_pocc_step7 = as_date(case_when(
      tmp_out_count_poccdm_snomed >= 5 ~ out_date_poccdm
    ))
  )
print("COVID-19 and diabetes variables needed for algorithm created successfully")
# Run the diabetes algorithm -----------------------------------------------------
# diabetes_algo() is expected to be defined by the sourced script
scripts_dir <- "analysis/preprocess"
diabetes_algorithm_path <- file.path(scripts_dir, "diabetes_algorithm.R")
source(diabetes_algorithm_path)
df <- df %>% diabetes_algo()
print("Diabetes algorithm run successfully")
# Build the analysis dataset (df1) with a restricted set of columns --------------
df1 <- df %>%
  # drop the JCVI variables
  dplyr::select(-c(vax_jcvi_age_1, vax_jcvi_age_2)) %>%
  # keep patient id, death date and every column whose name contains one of the
  # listed fragments: subgroups, exposures, outcomes, covariates, quality assurance,
  # vaccination, plus the diabetes "step" variables needed for the flowchart
  # (diabetes_flowchart.R)
  dplyr::select(
    patient_id, death_date,
    contains(c("sub_", "exp_", "out_", "cov_", "qa_", "vax_", "step"))
  ) %>%
  # then remove anything containing "df_out_" or "tmp_"
  dplyr::select(-contains(c("df_out_", "tmp_")))
# Describe data --------------------------------------------------------------
# Write the printed Hmisc::describe() summary of df1 to a text file.
# capture.output(file = ) is used instead of a manual sink()/sink() pair because
# it restores normal console output on exit, even if describe() or print() fails.
capture.output(
  print(Hmisc::describe(df1)),
  file = "output/not-for-review/describe_input_stage0.txt"
)
# SAVE
saveRDS(df1, file = "output/input.rds")
print("Dataset saved successfully")
# Save Venn diagram input dataset ------------------------------------------------
# The column restriction and folder creation below are disabled (commented out),
# so the full `df` is written rather than a subset.
# df2 <- df %>%
# dplyr::select(patient_id,
# starts_with(c("out_")))
# SAVE
## create folders for outputs
# fs::dir_create(here::here("output", "venn"))
saveRDS(df, file = "output/venn.rds")
print("Venn dataset saved successfully")
# END