-
Notifications
You must be signed in to change notification settings - Fork 54
/
impute-median.R
135 lines (109 loc) · 2.76 KB
/
impute-median.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#' Impute the median value into a vector with missing values
#'
#' @param x vector
#'
#' @return vector with median values replaced
#' @export
#' @name impute_median
#'
#' @examples
#'
#' vec <- rnorm(10)
#'
#' vec[sample(1:10, 3)] <- NA
#'
#' impute_median(vec)
#'
impute_median <- function(x) UseMethod("impute_median")
#' @export
#' @rdname impute_median
impute_median.default <- function(x){
x[is.na(x)] <- median(x, na.rm = TRUE)
x
}
#' @export
#' @rdname impute_median
impute_median.factor <- function(x){
i_mode <- function(x){
tab <- table(x)
max_tab <- max(tab)
if (all(tab == max_tab)) {mod = NA}
if (is.numeric(x)) {
mod <- as.numeric(names(tab)[tab == max_tab])
}
mod <- names(tab)[tab == max_tab]
# randomly break a tie
return(sample(mod, 1))
}
x[is.na(x)] <- i_mode(x)
x
}
#' Scoped variants of `impute_median`
#'
#' `impute_median` imputes the median for a vector. To get it to work on all
#' variables, use `impute_median_all`. To only impute variables
#' that satisfy a specific condition, use the scoped variants,
#' `impute_below_at`, and `impute_below_if`. To use `_at` effectively,
#' you must know that `_at`` affects variables selected with a character
#' vector, or with `vars()`.
#'
#' @param .tbl a data.frame
#' @param .vars variables to impute
#' @param .predicate variables to impute
#' @name scoped-impute_median
#'
#' @return an dataset with values imputed
#' @export
#'
#' @examples
#' # select variables starting with a particular string.
#' library(dplyr)
#' impute_median_all(airquality)
#'
#' impute_median_at(airquality,
#' .vars = c("Ozone", "Solar.R"))
#'
#' impute_median_at(airquality,
#' .vars = vars(Ozone))
#'
#' impute_median_if(airquality,
#' .predicate = is.numeric)
#'
#' \dontrun{
#' library(ggplot2)
#' airquality %>%
#' bind_shadow() %>%
#' impute_median_all() %>%
#' add_label_shadow() %>%
#' ggplot(aes(x = Ozone,
#' y = Solar.R,
#' colour = any_missing)) +
#' geom_point()
#' }
#'
impute_median_all <- function(.tbl){
test_if_dataframe(.tbl)
test_if_null(.tbl)
dplyr::mutate_all(.tbl = .tbl,
.funs = impute_median)
}
#' @export
#' @rdname scoped-impute_median
impute_median_at <- function(.tbl,
.vars){
test_if_dataframe(.tbl)
test_if_null(.tbl)
dplyr::mutate_at(.tbl = .tbl,
.vars = .vars,
.funs = impute_median)
}
#' @export
#' @rdname scoped-impute_median
impute_median_if <- function(.tbl,
.predicate){
test_if_dataframe(.tbl)
test_if_null(.tbl)
dplyr::mutate_if(.tbl = .tbl,
.predicate = .predicate,
.funs = impute_median)
}