Skip to content
Permalink
master
Go to file
 
 
Cannot retrieve contributors at this time
235 lines (202 sloc) 7.6 KB
#' Summarise the missingness in each variable
#'
#' Provide a summary for each variable of the number, percent missings, and
#' cumulative sum of missings of the order of the variables. By default,
#' it orders by the most missings in each variable.
#'
#' @param data a data.frame
#' @param order a logical indicating whether to order the result by `n_miss`.
#' Defaults to TRUE. If FALSE, order of variables is the order input.
#' @param add_cumsum logical indicating whether or not to add the cumulative
#' sum of missings to the data. This can be useful when exploring patterns
#' of nonresponse. These are calculated as the cumulative sum of the missings
#' in the variables as they are first presented to the function.
#' @param ... extra arguments
#'
#' @note `n_miss_cumsum` is calculated as the cumulative sum of missings in the
#' variables in the order that they are given in the data when entering
#' the function
#'
#' @return a tibble of the percent of missing data in each variable
#'
#' @seealso [pct_miss_case()] [prop_miss_case()] [pct_miss_var()] [prop_miss_var()] [pct_complete_case()] [prop_complete_case()] [pct_complete_var()] [prop_complete_var()] [miss_prop_summary()] [miss_case_summary()] [miss_case_table()] [miss_summary()] [miss_var_prop()] [miss_var_run()] [miss_var_span()] [miss_var_summary()] [miss_var_table()] [n_complete()] [n_complete_row()] [n_miss()] [n_miss_row()] [pct_complete()] [pct_miss()] [prop_complete()] [prop_complete_row()] [prop_miss()]
#'
#' @export
#'
#' @examples
#'
#' miss_var_summary(airquality)
#' miss_var_summary(oceanbuoys, order = TRUE)
#'
#' \dontrun{
#' # works with group_by from dplyr
#' library(dplyr)
#' airquality %>%
#' group_by(Month) %>%
#' miss_var_summary()
#' }
#' @export
miss_var_summary <- function(data,
order = FALSE,
add_cumsum = FALSE,
...) {
test_if_null(data)
test_if_dataframe(data)
UseMethod("miss_var_summary")
}
#' @export
miss_var_summary.default <- function(data,
order = TRUE,
add_cumsum = FALSE,
...) {
col_n_miss <- colSums(is.na(data))
col_pct_miss <- colMeans(is.na(data)) * 100
res <- tibble::tibble(variable = names(col_n_miss),
n_miss = as.integer(col_n_miss),
pct_miss = as.numeric(col_pct_miss))
if (add_cumsum) {
res <- res %>% dplyr::mutate(n_miss_cumsum = cumsum(n_miss))
}
if (order) {
return(dplyr::arrange(res, -n_miss))
}
return(res)
}
#' @export
miss_var_summary.grouped_df <- function(data,
order = TRUE,
add_cumsum = FALSE,
...) {
group_by_fun(data,
.fun = miss_var_summary,
order = order,
add_cumsum = add_cumsum)
}
#' Summarise the missingness in each case
#'
#' Provide a summary for each case in the data of the number, percent missings,
#' and cumulative sum of missings of the order of the variables. By default,
#' it orders by the most missings in each variable.
#'
#' @param data a data.frame
#' @param order a logical indicating whether or not to order the result by
#' n_miss. Defaults to TRUE. If FALSE, order of cases is the order input.
#' @param ... extra arguments
#' @param add_cumsum logical indicating whether or not to add the cumulative
#' sum of missings to the data. This can be useful when exploring patterns
#' of nonresponse. These are calculated as the cumulative sum of the missings
#' in the variables as they are first presented to the function.
#'
#' @return a tibble of the percent of missing data in each case.
#'
#' @seealso [pct_miss_case()] [prop_miss_case()] [pct_miss_var()] [prop_miss_var()] [pct_complete_case()] [prop_complete_case()] [pct_complete_var()] [prop_complete_var()] [miss_prop_summary()] [miss_case_summary()] [miss_case_table()] [miss_summary()] [miss_var_prop()] [miss_var_run()] [miss_var_span()] [miss_var_summary()] [miss_var_table()] [n_complete()] [n_complete_row()] [n_miss()] [n_miss_row()] [pct_complete()] [pct_miss()] [prop_complete()] [prop_complete_row()] [prop_miss()]
#'
#' @export
#'
#' @examples
#'
#' miss_case_summary(airquality)
#'
#' \dontrun{
#' # works with group_by from dplyr
#' library(dplyr)
#' airquality %>%
#' group_by(Month) %>%
#' miss_case_summary()
#'}
#'
miss_case_summary <- function(data,
order = TRUE,
add_cumsum = FALSE,
...){
test_if_null(data)
test_if_dataframe(data)
UseMethod("miss_case_summary")
}
#' @export
miss_case_summary.default <- function(data,
order = TRUE,
add_cumsum = FALSE,
...){
res <- data
res[["pct_miss"]] <- rowMeans(is.na(res))*100
res[["n_miss"]] <- as.integer(rowSums(is.na(res)))
res[["case"]] <- seq_len(nrow(res))
if (add_cumsum) {
res[["n_miss_cumsum"]] <- cumsum(res[["n_miss"]])
res <- dplyr::as_tibble(res)
res <- dplyr::select(res,
case,
n_miss,
pct_miss,
n_miss_cumsum)
}
if (!add_cumsum) {
res <- dplyr::as_tibble(res)
res <- dplyr::select(res,
case,
n_miss,
pct_miss)
}
if (order) {
return(dplyr::arrange(res, -n_miss))
}
if (!order) {
return(res)
}
}
#' @export
miss_case_summary.grouped_df <- function(data,
order = TRUE,
add_cumsum = FALSE,
...){
group_by_fun(data,
.fun = miss_case_summary,
order = order,
add_cumsum = add_cumsum)
}
#' Collate summary measures from naniar into one tibble
#'
#' `miss_summary` performs all of the missing data helper summaries and puts
#' them into lists within a tibble
#'
#' @param data a dataframe
#' @param order whether or not to order the result by n_miss
#'
#' @return a tibble of missing data summaries
#'
#' @seealso [pct_miss_case()] [prop_miss_case()] [pct_miss_var()] [prop_miss_var()] [pct_complete_case()] [prop_complete_case()] [pct_complete_var()] [prop_complete_var()] [miss_prop_summary()] [miss_case_summary()] [miss_case_table()] [miss_summary()] [miss_var_prop()] [miss_var_run()] [miss_var_span()] [miss_var_summary()] [miss_var_table()] [n_complete()] [n_complete_row()] [n_miss()] [n_miss_row()] [pct_complete()] [pct_miss()] [prop_complete()] [prop_complete_row()] [prop_miss()]
#'
#' @export
#'
#' @examples
#'
#' s_miss <- miss_summary(airquality)
#' s_miss$miss_df_prop
#' s_miss$miss_case_table
#' s_miss$miss_var_summary
#' # etc, etc, etc.
#'
#' \dontrun{
#' library(dplyr)
#' s_miss_group <- group_by(airquality, Month) %>% miss_summary()
#' s_miss_group$miss_df_prop
#' s_miss_group$miss_case_table
#' # etc, etc, etc.
#' }
#'
miss_summary <- function(data, order = TRUE){
test_if_null(data)
test_if_dataframe(data)
return(
tibble::tibble(
miss_df_prop = prop_miss(data),
miss_var_prop = prop_miss_var(data),
miss_case_prop = prop_miss_case(data),
miss_case_table = list(miss_case_table(data)),
miss_var_table = list(miss_var_table(data)),
miss_var_summary = list(miss_var_summary(data, order)),
miss_case_summary = list(miss_case_summary(data, order))
)
)
}
You can’t perform that action at this time.