/
add-cols.R
384 lines (329 loc) · 13.4 KB
/
add-cols.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
#' Add a shadow column to dataframe
#'
#' As an alternative to `bind_shadow()`, you can add specific individual shadow
#' columns to a dataset. These also respect the dplyr verbs
#' `starts_with`, `contains`, `ends_with`, etc.
#'
#' @param data data.frame
#' @param ... One or more unquoted variable names, separated by commas. These also
#' respect the dplyr verbs `starts_with`, `contains`, `ends_with`, etc.
#'
#' @return data.frame
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' airquality %>% add_shadow(Ozone)
#' airquality %>% add_shadow(Ozone, Solar.R)
#'
add_shadow <- function(data, ...){
test_if_dots_missing(
dots_empty = missing(...),
msg = "{.fun add_shadow} requires variables to be selected"
)
shadow_df <- dplyr::select(data, ...) %>% as_shadow()
data <- tibble::as_tibble(data)
shadow_df <- tibble::as_tibble(shadow_df)
dplyr::bind_cols(data, shadow_df)
}
#' Add a shadow shifted column to a dataset
#'
#' Shadow shift missing values using only the selected variables in a dataset,
#' by specifying variable names or use dplyr `vars` and dplyr verbs
#' `starts_with`, `contains`, `ends_with`, etc.
#'
#' @param data data.frame
#' @param ... One or more unquoted variable names separated by commas. These also
#' respect the dplyr verbs `starts_with`, `contains`, `ends_with`, etc.
#' @param suffix suffix to add to variable, defaults to "shift"
#'
#' @return data with the added variable shifted named as `var_suffix`
#'
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' airquality %>% add_shadow_shift(Ozone, Solar.R)
#'
add_shadow_shift <- function(data, ..., suffix = "shift"){
# if no variables are selected use all of the variables
if (missing(...)) {
shadow_shifted_df <- purrr::map_dfc(data, impute_below)
# change names
names(shadow_shifted_df) <- paste0(names(shadow_shifted_df), "_", suffix)
data <- tibble::as_tibble(data)
shadow_shifted_df <- tibble::as_tibble(shadow_shifted_df)
return(dplyr::bind_cols(data, shadow_shifted_df))
}
# select variables
shadow_shifted_vars <- dplyr::select(data, ...)
# shadow shift all (using purrr:map_df)
# would be good to have a way of indicating that no shift was taken at all
shadow_shifted_df <- purrr::map_dfc(shadow_shifted_vars, impute_below)
# change names
names(shadow_shifted_df) <- paste0(names(shadow_shifted_df),"_",suffix)
data <- tibble::as_tibble(data)
shadow_shifted_df <- tibble::as_tibble(shadow_shifted_df)
return(dplyr::bind_cols(data, shadow_shifted_df))
}
#' Add a column describing presence of any missing values
#'
#' This adds a column named "any_miss" (by default) that describes whether
#' there are any missings in all of the variables (default), or whether any
#' of the specified columns, specified using variables names or dplyr verbs,
#' `starts_with`, `contains`, `ends_with`, etc. By default the added column
#' will be called "any_miss_all", if no variables are specified, otherwise,
#' if variables are specified, the label will be "any_miss_vars" to indicate
#' that not all variables have been used to create the labels.
#'
#'
#' By default the
#' prefix "any_miss" is used, but this can be changed in the `label` argument.
#'
#' @param data data.frame
#' @param ...
#'
#' Variable names to use instead of the whole dataset. By default this
#' looks at the whole dataset. Otherwise, this is one or more unquoted
#' expressions separated by commas. These also respect the dplyr verbs
#' `starts_with`, `contains`, `ends_with`, etc. By default will add "_all" to
#' the label if left blank, otherwise will add "_vars" to distinguish that it
#' has not been used on all of the variables.
#' @param label label for the column, defaults to "any_miss". By default if no
#' additional variables are listed the label col is "any_miss_all", otherwise
#' it is "any_miss_vars", if variables are specified.
#' @param missing character a label for when values are missing - defaults to "missing"
#' @param complete character character a label for when values are complete - defaults to "complete"
#'
#' @return data.frame with data and the column labelling whether that row (for
#' those variables) has any missing values - indicated by "missing" and
#' "complete".
#'
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' airquality %>% add_any_miss()
#' airquality %>% add_any_miss(Ozone, Solar.R)
#'
add_any_miss <- function(data, ...,
label = "any_miss",
missing = "missing",
complete = "complete"){
# if no variables are specified, do for all, and add the label "all"
if (missing(...)) {
stub_data_label <- data %>%
dplyr::mutate(.temp = any_row_miss(data),
.temp_label = dplyr::if_else(condition = .temp == TRUE,
true = missing,
false = complete)) %>%
dplyr::select(.temp_label) %>%
tibble::as_tibble()
names(stub_data_label) <- paste0(label,"_all")
return(
dplyr::bind_cols(data, stub_data_label) %>% tibble::as_tibble()
)
}
stub_data <- dplyr::select(data, ...)
stub_data_label <- stub_data %>%
dplyr::mutate(.temp = any_row_miss(stub_data),
.temp_label = dplyr::if_else(condition = .temp == TRUE,
true = missing,
false = complete)) %>%
dplyr::select(.temp_label) %>%
tibble::as_tibble()
names(stub_data_label) <- paste0(label,"_vars")
dplyr::bind_cols(data, stub_data_label) %>% tibble::as_tibble()
}
#' Is there a missing value in the row of a dataframe?
#'
#' Creates a character vector describing presence/absence of missing values
#'
#' @param data a dataframe or set of vectors of the same length
#'
#' @return character vector of "Missing" and "Not Missing".
#' @param ... extra variable to label
#' @param missing character a label for when values are missing - defaults to "Missing"
#' @param complete character character a label for when values are complete - defaults to "Not Missing"
#'
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' label_missings(airquality)
#'
#' \dontrun{
#' library(dplyr)
#'
#' airquality %>%
#' mutate(is_missing = label_missings(airquality)) %>%
#' head()
#'
#' airquality %>%
#' mutate(is_missing = label_missings(airquality,
#' missing = "definitely missing",
#' complete = "absolutely complete")) %>%
#' head()
#' }
label_missings <- function(data,
...,
missing = "Missing",
complete = "Not Missing"){
test_if_null(data)
# find which are missing and which are not.
any_row_na <- function(x){
apply(data.frame(x), MARGIN = 1, FUN = function(x) anyNA(x))
}
if (!missing(...)) {
data <- dplyr::select(data, ...)
}
temp <- any_row_na(data)
dplyr::if_else(condition = temp == TRUE, # TRUE means missing
true = missing,
false = complete)
}
#' Add a column describing if there are any missings in the dataset
#'
#' @param data data.frame
#' @param ... extra variable to label
#' @param missing character a label for when values are missing - defaults to "Missing"
#' @param complete character character a label for when values are complete - defaults to "Not Missing"
#'
#' @return data.frame with a column "any_missing" that is either "Not Missing"
#' or "Missing" for the purposes of plotting / exploration / nice print methods
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' airquality %>% add_label_missings()
#' airquality %>% add_label_missings(Ozone, Solar.R)
#' airquality %>% add_label_missings(Ozone, Solar.R, missing = "yes", complete = "no")
#'
add_label_missings <- function(data,
...,
missing = "Missing",
complete = "Not Missing"){
# data %>%
# dplyr::mutate(any_missing = label_missings(.)) %>%
# dplyr::as_tibble()
updated_data <- data %>%
dplyr::mutate(any_missing = label_missings(.,
...,
missing = missing,
complete = complete))
return(tibble::as_tibble(updated_data))
}
#' Label shadow values as missing or not missing
#'
#' Powers `add_label_shadow`. For the moment it is an internal function.
#'
#' @param data data.frame
#' @param ... extra variable to label
#' @param missing character a label for when values are missing - defaults to "Missing"
#' @param complete character character a label for when values are complete - defaults to "Not Missing"
#'
#' @return "Missing" or "Not Missing"
#' @keywords internal
#' @noRd
#'
label_shadow <- function(data,
...,
missing = "Missing",
complete = "Not Missing"){
# any_shade <- function(x) any(grepl("^NA|^NA_", x))
if (!missing(...)) {
shadow_vars <- quo_to_shade(...)
data <- dplyr::select(data, ..., !!!shadow_vars)
}
temp <- any_row_shade(data)
dplyr::if_else(condition = temp == TRUE, # TRUE means missing
true = missing,
false = complete)
}
#' Add a column describing whether there is a shadow
#'
#' Instead of focussing on labelling whether there are missings, we instead
#' focus on whether there have been any shadows created. This can be useful
#' when data has been imputed and you need to determine which rows contained
#' missing values when the shadow was bound to the dataset.
#'
#' @param data data.frame
#' @param ... extra variable to label
#' @param missing character a label for when values are missing - defaults to "Missing"
#' @param complete character character a label for when values are complete - defaults to "Not Missing"
#'
#' @return data.frame with a column, "any_missing", which describes whether or
#' not there are any rows that have a shadow value.
#'
#' @export
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @examples
#'
#' airquality %>%
#' add_shadow(Ozone, Solar.R) %>%
#' add_label_shadow()
#'
add_label_shadow <- function(data,
...,
missing = "Missing",
complete = "Not Missing"){
if (!any_shade(data)) {
rlang::abort("add_label_shadow works with shadow data, which has columns
created by `shade()`, `as_shadow()`, or `bind_shadow()`")
}
updated_data <- dplyr::mutate(data,
any_missing = label_shadow(data,
...,
missing = missing,
complete = complete))
return(updated_data)
}
#' Add a column that tells us which "missingness cluster" a row belongs to
#'
#' A way to extract the cluster of missingness that a group belongs to.
#' For example, if you use `vis_miss(airquality, cluster = TRUE)`, you can
#' see some clustering in the data, but you do not have a way to identify
#' the cluster. Future work will incorporate the `seriation` package to
#' allow for better control over the clustering from the user.
#'
#' @param data a dataframe
#' @param cluster_method character vector of the agglomeration method to use,
#' the default is "mcquitty". Options are taken from `stats::hclust`
#' helpfile, and options include: "ward.D", "ward.D2", "single", "complete",
#' "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or
#' "centroid" (= UPGMC).
#' @param n_clusters numeric the number of clusters you expect. Defaults to 2.
#'
#' @seealso [bind_shadow()] [add_any_miss()] [add_label_missings()] [add_label_shadow()] [add_miss_cluster()] [add_n_miss()] [add_prop_miss()] [add_shadow_shift()] [cast_shadow()]
#'
#' @export
#'
#' @examples
#'
#' add_miss_cluster(airquality)
#' add_miss_cluster(airquality, n_clusters = 3)
#' add_miss_cluster(airquality, cluster_method = "ward.D", n_clusters = 3)
add_miss_cluster <- function(data,
cluster_method = "mcquitty",
n_clusters = 2) {
test_if_null(data)
test_if_dataframe(data)
data_na <- is.na(data)
miss_cluster <- stats::dist(data_na*1) %>%
stats::hclust(method = cluster_method) %>%
stats::cutree(k = n_clusters)
data$miss_cluster <- miss_cluster
tibble::as_tibble(data)
}