generated from opensafely/research-template
/
MICE_data_prep2.R
132 lines (76 loc) · 2.96 KB
/
MICE_data_prep2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
## This R script develops the imputed data frame
## Specify libraries
library(pacman)
library(tidyverse)
library(Hmisc)
library(here)
library(arrow)
library(purrr)
library(broom)
library(data.table)
library(forcats)
library(rstatix)
library(janitor)
library(lubridate)
library(skimr)
library(ggplot2)
library(mice)
## Read the imputation input file, then remove patients with cancer and
## patients who were underweight at onset of pandemic.
# dplyr::filter() keeps only rows where every condition is TRUE, so rows with a
# missing all_cancer or precovid_bmi_category are removed as well.
BMI_trajectories <- here::here("output/data", "imputation_data_long.csv") %>%
  read_csv() %>%
  dplyr::filter(
    all_cancer != TRUE,
    precovid_bmi_category != "underweight"
  )
## Sample
# Draw a random subset of at most 250,000 rows. The size is capped at the
# number of rows available because sampling without replacement errors when
# asked for more rows than exist (e.g. when run against small dummy data).
# The draw is seeded so that the sampled rows, and therefore every downstream
# output, are the same on each run; the seed matches the one passed to mice.
set.seed(123)
n_sample <- min(250000, nrow(BMI_trajectories))
BMI_trajectories <- BMI_trajectories[
  sample.int(nrow(BMI_trajectories), n_sample),
]
# Drop the first column by position.
# NOTE(review): presumably the row-index column written alongside the input
# CSV - confirm against the script that produces imputation_data_long.csv.
BMI_trajectories <- BMI_trajectories[-1]
# Recode imd as a factor with levels "1" to "5" (any other value becomes NA),
# then drop the columns that are not wanted in the imputation data: every
# column ending in "_bmi", the base and post-covid BMI categories, and the
# cancer flag that was used for the exclusion.
BMI_trajectories <- BMI_trajectories %>%
  dplyr::mutate(imd = factor(imd, levels = c("1", "2", "3", "4", "5"))) %>%
  dplyr::select(
    -c(
      ends_with("_bmi"),
      "base_bmi_category",
      "postcovid_bmi_category",
      ends_with("all_cancer")
    )
  )
# Proportion of missing values in each column of the sampled data
p_missing <- vapply(
  BMI_trajectories,
  function(column) sum(is.na(column)),
  integer(1)
) / nrow(BMI_trajectories)
# Keep only the columns that have missing values, largest proportion first.
# NOTE(review): as.data.frame() appears to name the single column after the
# expression passed to it, so rewording this call would change the header of
# the CSV written from p_missing - confirm before refactoring.
p_missing <- as.data.frame(sort(p_missing[p_missing > 0], decreasing = TRUE))
p_missing
# Set up the MICE predictor frame
# Dry run with zero iterations: nothing is imputed, but mice returns its
# default predictor matrix and imputation methods for this data.
imp <- mice(BMI_trajectories, maxit = 0, seed = 123)
# Take copies of the defaults so they can be edited
predM <- imp$predictorMatrix
meth <- imp$method
head(predM)
## Remove variables from the predictor matrix
# Setting a variable's column to 0 stops it being used as a predictor for
# any other variable.
# Candidates that are currently left in as predictors:
#   region, imd, hypertension, diabetes_t1, diabetes_t2, chronic_cardiac,
#   COPD, asthma, learning_disability, psychosis_schiz_bipolar, depression,
#   stroke_and_TIA, dementia
predM[, c("patient_id", "precovid_change")] <- 0
## The default imputation methods are kept for most variables.
## For the variables below the method is set to "" so that mice does not
## impute them.
meth[c(
  "age_group_2",
  "sex",
  "region",
  "eth_group_16",
  "imd",
  "smoking_status"
)] <- ""
## Complete the imputation
# Impute BMI_trajectories into m = 5 data sets (the mice default, stated
# explicitly here) with 5 iterations each, using predM as the predictor
# matrix and meth as the per-variable imputation methods.
# printFlag = TRUE prints the iteration history to the console; set it to
# FALSE to run silently. The argument is spelled out in full rather than
# relying on partial matching of `print`.
imp2 <- mice(
  BMI_trajectories,
  m = 5,
  maxit = 5,
  seed = 123,
  predictorMatrix = predM,
  method = meth,
  printFlag = TRUE
)
# Stack the imputed data sets into a single long data frame; include = TRUE
# asks complete() to keep the original, unimputed data in the output as well.
BMI_imp_long <- imp2 %>%
  mice::complete(action = "long", include = TRUE)
# Save the stacked imputations, the sampled data that went into mice, and the
# per-variable missingness table.
write.csv(
  BMI_imp_long,
  file = here::here("output/data", "imputation_dataframe.csv")
)
write.csv(
  BMI_trajectories,
  file = here::here("output/data", "imputation_DF_for_impute.csv")
)
write.csv(
  p_missing,
  file = here::here("output/data", "imputation_sample_missing.csv")
)