Skip to content

Commit

Permalink
feat: Implement pivot_*() functions
Browse files Browse the repository at this point in the history
  • Loading branch information
etiennebacher authored and nathaneastwood committed Jul 31, 2022
1 parent af2940e commit 3fcf047
Show file tree
Hide file tree
Showing 8 changed files with 710 additions and 2 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Package: poorman
Type: Package
Title: A Poor Man's Dependency Free Recreation of 'dplyr'
Version: 0.2.5.6
Version: 0.2.5.7
Authors@R: person("Nathan", "Eastwood", "", "nathan.eastwood@icloud.com",
role = c("aut", "cre"))
Maintainer: Nathan Eastwood <nathan.eastwood@icloud.com>
Expand All @@ -18,7 +18,7 @@ Suggests:
tinytest
License: MIT + file LICENSE
Encoding: UTF-8
RoxygenNote: 7.1.1
RoxygenNote: 7.2.0
Roxygen: list(markdown = TRUE)
VignetteBuilder: knitr
Language: en-GB
2 changes: 2 additions & 0 deletions NAMESPACE
Expand Up @@ -118,6 +118,8 @@ export(ntile)
export(num_range)
export(peek_vars)
export(percent_rank)
export(pivot_longer)
export(pivot_wider)
export(pull)
export(recode)
export(recode_factor)
Expand Down
138 changes: 138 additions & 0 deletions R/pivot_longer.R
@@ -0,0 +1,138 @@
#' Pivot data from wide to long
#'
#' `pivot_longer()` "lengthens" data, increasing the number of rows and decreasing the number of columns. The inverse
#' transformation is [pivot_wider()].
#'
#' @param data `data.frame`. The data to pivot.
#' @param cols <[`poor-select`][select_helpers]>. Columns to pivot into longer format.
#' @param names_to `character(n)`. The name of the new column(s) that will contain the column names.
#' @param names_prefix `character(1)`. A regular expression used to remove matching text from the start of each variable
#' name.
#' @param names_sep,names_pattern `character(1)`. If `names_to` contains multiple values, this argument controls how the
#' column name is broken up. `names_pattern` takes a regular expression containing matching groups (`()`⁠).
#' @param values_to `character(n)`. The name of the new column(s) that will contain the values of the pivoted variables.
#' @param values_drop_na `logical(1)`. If `TRUE`, will drop rows that contain only `NA` in the `values_to` column. This
#' effectively converts explicit missing values to implicit missing values, and should generally be used only when
#' missing values in data were created by its structure.
#' @param ... Additional arguments passed on to methods.
#'
#' @return A `data.frame`.
#'
#' @examples
#' wide_data <- data.frame(replicate(5, rnorm(10)))
#' # Customizing the names
#' pivot_longer(
#' data = wide_data,
#' cols = c(1, 2),
#' names_to = "Column",
#' values_to = "Numbers"
#' )
#'
#' @export
pivot_longer <- function(
data,
cols,
names_to = "name",
names_prefix = NULL,
names_sep = NULL,
names_pattern = NULL,
values_to = "value",
values_drop_na = FALSE,
...
) {

if (missing(cols)) {
stop("`cols` must select at least one column.")
}

cols <- names(eval_select_pos(data, substitute(cols)))

if (any(names_to %in% setdiff(names(data), cols))) {
stop(
paste0(
"Some values of the columns specified in 'names_to' are already present
as column names. Either use another value in `names_to` or rename the
following columns: ",
paste(names_to[which(names_to %in% setdiff(names(data), cols))], sep = ", ")
),
call. = FALSE)
}

# Sanity checks ----------------

# nothing to select?
if (length(cols) == 0L) {
stop("No columns found for reshaping data.", call. = FALSE)
}

# Reshaping ---------------------
# Create Index column as needed by reshape
data[["_Row"]] <- as.numeric(rownames(data))

# Create a new index for cases with length(names_to) > 1
names_to_2 <- paste(names_to, collapse = "_")

# Reshape
long <- stats::reshape(
data,
varying = cols,
idvar = "_Row",
v.names = values_to,
timevar = names_to_2,
direction = "long"
)

# Cleaning --------------------------
# Sort the dataframe (to match pivot_longer's output)
long <- long[do.call(order, long[, c("_Row", names_to_2)]), ]

long[["_Row"]] <- NULL

# Re-insert col names as levels
long[[names_to_2]] <- cols[long[[names_to_2]]]

# if several variable in names_to, split the names either with names_sep or with names_pattern
if (length(names_to) > 1) {
for (i in seq_along(names_to)) {
if (is.null(names_pattern)) {
new_vals <- unlist(lapply(
strsplit(unique(long[[names_to_2]]), names_sep, fixed = TRUE),
function(x) x[i]
))
long[[names_to[i]]] <- new_vals
} else {
colPattern <- regmatches(
x = unique(long[[names_to_2]]),
m = regexec(names_pattern, unique(long[[names_to_2]]))
)
colPattern <- as.data.frame(do.call(rbind, colPattern))[, c(1, i + 1)]
names(colPattern) <- c(names_to_2, names_to[i])
long <- left_join(x = long, y = colPattern, by = names_to_2)
}
}
long[[names_to_2]] <- NULL
}

# reorder
long <- relocate(.data = long, values_to, .after = -1)

# remove names prefix if specified
if (!is.null(names_prefix)) {
if (length(names_to) > 1) {
stop("`names_prefix` only works when `names_to` is of length 1.", call. = FALSE)
}
long[[names_to]] <- gsub(paste0("^", names_prefix), "", long[[names_to]])
}

if (values_drop_na) {
long <- long[!is.na(long[, values_to]), ]
}

# Reset row names
rownames(long) <- NULL

# Remove reshape attributes
attributes(long)$reshapeLong <- NULL

long
}
217 changes: 217 additions & 0 deletions R/pivot_wider.R
@@ -0,0 +1,217 @@
#' Pivot data from long to wide
#'
#' `pivot_wider()` "widens" data, increasing the number of columns and decreasing the number of rows. The inverse
#' transformation is [pivot_longer()].
#'
#' @param data `data.frame`. The data to pivot.
#' @param id_cols `character(1)`. The name of the column that identifies the rows. If `NULL`, it will use all the unique
#' rows.
#' @param names_from `character(n)`. The name of the column(s) that contains the levels to be used as future column
#' names.
#' @param names_prefix `character(1)`. String added to the start of every variable name. This is particularly useful if
#' `names_from` is a numeric vector and you want to create syntactic variable names.
#' @param names_sep `character(1)`. If `names_from` or `values_from` contains multiple variables, this will be used to
#' join their values together into a single string to use as a column name.
#' @param names_glue `character(1)`. Instead of `names_sep` and `names_prefix`, you can supply a
#' [glue specification](https://glue.tidyverse.org/index.html) that uses the `names_from` columns to create custom
#' column names. Note that the only delimiters supported by `names_glue` are curly brackets, `{` and `}`.
#' @param values_from `character(n)`. The name of the column that contains the values to be used as future variable
#' values.
#' @param values_fill `numeric(n)`. Optionally, a (scalar) value that will be used to replace missing values in the new
#' columns created.
#' @param ... Not used for now.
#'
#' @return If a tibble was provided as input, `pivot_wider()` also returns a
#' tibble. Otherwise, it returns a data frame.
#'
#' @examples
#' data_long <- read.table(header = TRUE, text = "
#' subject sex condition measurement
#' 1 M control 7.9
#' 1 M cond1 12.3
#' 1 M cond2 10.7
#' 2 F control 6.3
#' 2 F cond1 10.6
#' 2 F cond2 11.1
#' 3 F control 9.5
#' 3 F cond1 13.1
#' 3 F cond2 13.8
#' 4 M control 11.5
#' 4 M cond1 13.4
#' 4 M cond2 12.9")
#'
#'
#' pivot_wider(
#' data_long,
#' id_cols = "subject",
#' names_from = "condition",
#' values_from = "measurement"
#' )
#'
#' pivot_wider(
#' data_long,
#' id_cols = "subject",
#' names_from = "condition",
#' values_from = "measurement",
#' names_prefix = "Var.",
#' names_sep = "."
#' )
#'
#' production <- expand.grid(
#' product = c("A", "B"),
#' country = c("AI", "EI"),
#' year = 2000:2014
#' )
#' production <- filter(production, (product == "A" & country == "AI") | product == "B")
#'
#' production$production <- rnorm(nrow(production))
#'
#' pivot_wider(
#' production,
#' names_from = c("product", "country"),
#' values_from = "production",
#' names_glue = "prod_{product}_{country}"
#' )
#'
#' @export
pivot_wider <- function(
data,
id_cols = NULL,
values_from = "Value",
names_from = "Name",
names_sep = "_",
names_prefix = "",
names_glue = NULL,
values_fill = NULL,
...
) {

old_names <- names(data)

# Preserve attributes
variable_attr <- lapply(data, attributes)

# Create an id for stats::reshape
if (is.null(id_cols)) {
data[["_Rows"]] <- apply(
X = data[, !names(data) %in% c(values_from, names_from), drop = FALSE],
MARGIN = 1,
FUN = paste,
collapse = "_"
)
id_cols <- "_Rows"
}

# create pattern of column names - stats::reshape renames columns that concatenates "v.names" + values - we only want
# values
current_colnames <- colnames(data)
current_colnames <- current_colnames[current_colnames != "_Rows"]
if (is.null(names_glue)) {
future_colnames <- unique(apply(data, 1, function(x) paste(x[c(names_from)], collapse = names_sep)))
} else {
vars <- regmatches(names_glue, gregexpr("\\{\\K[^{}]+(?=\\})", names_glue, perl = TRUE))[[1]]
tmp_data <- unique(data[, vars])
future_colnames <- unique(apply(tmp_data, 1, function(x) {
tmp_vars <- list()
for (i in seq_along(vars)) {
tmp_vars[[i]] <- x[vars[i]]
}

tmp_colname <- gsub("\\{\\K[^{}]+(?=\\})", "", names_glue, perl = TRUE)
tmp_colname <- gsub("\\{\\}", "%s", tmp_colname)
do.call(sprintf, c(fmt = tmp_colname, tmp_vars))
}))
}

# stop if some column names would be duplicated (follow tidyr workflow)
if (any(future_colnames %in% current_colnames)) {
stop(
paste0(
"Some values of the columns specified in 'names_from' are already present
as column names. Either use `name_prefix` or rename the following columns: ",
paste(current_colnames[which(current_colnames %in% future_colnames)], sep = ", ")
),
call. = FALSE
)
}

# stats::reshape works strangely when several variables are in idvar/timevar so we unite all ids in a single temporary
# column that will be used by stats::reshape
data$new_time <- apply(data, 1, function(x) paste(x[names_from], collapse = "_"))
data[, names_from] <- NULL

wide <- stats::reshape(
data,
v.names = values_from,
idvar = id_cols,
timevar = "new_time",
sep = names_sep,
direction = "wide"
)

# Clean
if ("_Rows" %in% names(wide)) wide[["_Rows"]] <- NULL
rownames(wide) <- NULL

if (length(values_from) == 1) {
to_rename <- which(startsWith(names(wide), paste0(values_from, names_sep)))
names(wide)[to_rename] <- future_colnames
}

# Order columns as in tidyr
if (length(values_from) > 1) {
for (i in values_from) {
tmp1 <- wide[, which(!startsWith(names(wide), i))]
tmp2 <- wide[, which(startsWith(names(wide), i))]
wide <- cbind(tmp1, tmp2)
# doesn't work
# wide <- relocate(wide, starts_with(i), .after = -1)
}
}

new_cols <- setdiff(names(wide), old_names)
names(wide)[which(names(wide) %in% new_cols)] <- paste0(names_prefix, new_cols)

# Fill missing values
if (!is.null(values_fill)) {
if (length(values_fill) == 1) {
if (is.numeric(wide[[new_cols[1]]])) {
if (!is.numeric(values_fill)) {
stop(paste0("`values_fill` must be of type numeric."), call. = FALSE)
} else {
for (i in new_cols) {
wide[[i]] <- replace_na(wide[[i]], replace = values_fill)
}
}
} else if (is.character(wide[[new_cols[1]]])) {
if (!is.character(values_fill)) {
stop(paste0("`values_fill` must be of type character."), call. = FALSE)
} else {
for (i in new_cols) {
wide[[i]] <- replace_na(wide[[i]], replace = values_fill)
}
}
} else if (is.factor(wide[[new_cols[1]]])) {
if (!is.factor(values_fill)) {
stop(paste0("`values_fill` must be of type factor."), call. = FALSE)
} else {
for (i in new_cols) {
wide[[i]] <- replace_na(wide[[i]], replace = values_fill)
}
}
}
} else {
stop("`values_fill` must be of length 1.", call. = FALSE)
}
}

# Remove reshape attributes
attributes(wide)$reshapeWide <- NULL

# add back attributes where possible
for (i in colnames(wide)) {
attributes(wide[[i]]) <- variable_attr[[i]]
}

wide
}

0 comments on commit 3fcf047

Please sign in to comment.