generated from opensafely/research-template
/
had_bmi_regression_2020_multivariate.R
122 lines (82 loc) · 4.29 KB
/
had_bmi_regression_2020_multivariate.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
##########################################
## Author: Miriam Samuel
## Updated: 17th March 2022
## Multivariate analysis of who had BMI measured in 2020
## packages
library(broom)
library(purrr)
library(dplyr)
library(janitor)
library(tidyverse)
library(arrow)
#read in file
BMI_complete_categories <- read_feather (here::here ("output/data", "BMI_complete_median.feather"))
## select the variables needed for analysis
BMI_complete_categories_2020 <- BMI_complete_categories
## Filter by year
BMI_complete_categories_2020 <- BMI_complete_categories_2020 %>%
ungroup %>%
dplyr::filter(year==2020) %>%
dplyr::select(patient_id,
had_bmi,
sex,
age_group,
region,
imd,
ethnic_no_miss,
eth_group_16,
precovid_obese_flag,
starts_with("comorbid_"))
## convert had_bmi to a logical output
BMI_complete_categories_2020 %>%
dplyr::mutate(had_bmi = as.logical(had_bmi))
## Try to change base level >> NOTE: co-efficient for base group = log.odds of event in base group
BMI_complete_categories_2020 <- BMI_complete_categories_2020 %>%
dplyr::mutate(age_group = as.factor(age_group)) %>%
dplyr::mutate(age_group = fct_relevel(age_group, "18-39", after = 0))
## instructions from Epi R handbook
# create a new vector of explanatory variables - exclude age, gender, ethnicity
explanatory_vars_multivariate <- c("region",
"imd",
"precovid_obese_flag",
"comorbid_learning_disability",
"comorbid_depression",
"comorbid_dementia",
"comorbid_psychosis_schiz_bipolar",
"comorbid_diabetes_type",
"comorbid_diabetes_t1",
"comorbid_diabetes_t2",
"comorbid_asthma",
"comorbid_COPD",
"comorbid_stroke_and_TIA",
"comorbid_chronic_cardiac",
"comorbid_hypertension",
"comorbid_all_cancer")
# >> Example model
# Multivariate model with age +gender
# had_bmi_age_sex_m <- glm(had_bmi ~ age_group + sex, data=BMI_complete_categories_2020, family=binomial) %>%
# broom::tidy(exponentiate = TRUE, conf.int = TRUE) %>% # exponentiate and produce CIs
# dplyr::mutate(across(where(is.numeric), round, digits = 2)) # round all numeric columns
#>> Use PURR to loop over the different exposures in a univariate analysis and create a combined table
#>> use stringer to create a vector listing each item to run the logistic regression over
models_age_sex_ethnic_2020 <- explanatory_vars_multivariate %>% # begin with variables of interest
str_c("had_bmi ~ age_group + sex + eth_group_16 + ", .) %>% ## creates a vector of character with terms for age, gender and ethnicity regression
# iterate through each univariate formula ... using map function from purr
map( # Map each element of the preceding vector the following formula
.f = ~glm( # pass the formulas one-by-one to glm()
formula = as.formula(.x), # within glm(), the string formula is .x
family = "binomial", # specify type of glm (logistic)
data = BMI_complete_categories_2020)) %>% # dataset
# tidy up each of the glm regression outputs from above
map(
.f = ~tidy(
.x,
exponentiate = TRUE, # exponentiate
conf.int = TRUE)) %>% # return confidence intervals
# collapse the list of regression outputs in to one data frame
bind_rows() %>%
# round all numeric columns
mutate(across(where(is.numeric), round, digits = 2))
####################################################################################
## OUTPUTS
write.csv (models_age_sex_ethnic_2020, here::here ("output/data","multivariate_regression_had_bmi_2020.csv"))