Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

boxcox() possible optimization #10

Closed
kadyb opened this issue Sep 13, 2021 · 3 comments
Closed

boxcox() possible optimization #10

kadyb opened this issue Sep 13, 2021 · 3 comments

Comments

@kadyb
Copy link

kadyb commented Sep 13, 2021

I compared two implementations of the BoxCox transformation from the {recipes} and {bestNormalize} packages. Package {recipes} is faster and allocates less memory (half). Maybe some improvements can be made in {bestNormalize}? Here is the source code for the function from {recipes}: https://github.com/tidymodels/recipes/blob/master/R/BoxCox.R.

# Reproducible benchmark data: 2M gamma-distributed (strictly positive) draws.
set.seed(123)
df <- data.frame(x = rgamma(2e6, 1, 1))

# Fit a Box-Cox transform on a raw numeric vector via {bestNormalize}.
f1 <- function(x) {
  bestNormalize::boxcox(
    x,
    standardize = FALSE,
    lower = -5,
    upper = 5
  )
}

# Fit and apply a recipes-based transform.
# NOTE(review): despite its name, `x` is expected to be a recipe object
# (see the `f2(rec)` call below), not a numeric vector.
f2 <- function(x) {
  fitted <- recipes::prep(x, training = df, retain = TRUE)
  recipes::bake(fitted, new_data = NULL)
}

# Build a one-step Box-Cox recipe for column `x`.
rec <- recipes::recipe(~ ., data = df)
rec <- recipes::step_BoxCox(rec, x)

# Time both implementations; check = FALSE because the two return
# differently-shaped objects.
results <- bench::mark(
  bestNormalize = f1(df$x),
  recipes = f2(rec),
  iterations = 10,
  check = FALSE
)

results
#>   expression         min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 bestNormalize    7.49s    7.66s     0.130    2.04GB    1.65
#> 2 recipes          5.77s    5.98s     0.167 1004.42MB    0.902
@petersonR
Copy link
Owner

Thank you for the suggestion. I've implemented a few improvements to boxcox and yeojohnson, which bring the two packages much closer in both memory use and computation time. bestNormalize::boxcox in fact now uses about half as much memory as recipes — I'm guessing because of the line `n <- length(y)` inside recipes' likelihood function, which is re-evaluated on every optimizer iteration and could be hoisted out.

Code:

# Same reproducible data as the original report: 2M positive gamma draws.
set.seed(123)
df <- data.frame(x = rgamma(2e6, 1, 1))

# Yeo-Johnson via {bestNormalize}, raw (unstandardized) transform.
bn_yj <- function(x) {
  yeojohnson(
    x,
    standardize = FALSE,
    lower = -5,
    upper = 5
  )
}

# Box-Cox via {bestNormalize}, raw (unstandardized) transform.
bn_bc <- function(x) {
  boxcox(
    x,
    standardize = FALSE,
    lower = -5,
    upper = 5
  )
}

# One base recipe, specialized into a Yeo-Johnson and a Box-Cox variant.
rec <- recipes::recipe(~ ., data = df)
rec1 <- recipes::step_YeoJohnson(rec, x)
rec2 <- recipes::step_BoxCox(rec, x)

# Prep + bake the Yeo-Johnson recipe.
# NOTE(review): the `rec` argument is IGNORED — this closes over the global
# `rec1` defined above. The L185-style call `rec_yj(rec)` only works because
# of that; confirm this is intentional before reusing.
rec_yj <- function(rec) {
  fitted <- recipes::prep(rec1, training = df, retain = TRUE)
  recipes::bake(fitted, new_data = NULL)
}

# Prep + bake the Box-Cox recipe.
# NOTE(review): the `rec` argument is IGNORED — this closes over the global
# `rec2` defined above; confirm this is intentional before reusing.
rec_bc <- function(rec) {
  fitted <- recipes::prep(rec2, training = df, retain = TRUE)
  recipes::bake(fitted, new_data = NULL)
}

## New implementation from https://github.com/tidymodels/recipes/issues/782
# Estimate the Yeo-Johnson lambda for a numeric vector by maximizing the
# profile log-likelihood over `limits`. Returns NA when there are too few
# distinct values or when the optimum lands on the search boundary.
new_rec_yj <- function(dat, limits = c(-5, 5), num_unique = 5, 
                        na_rm = TRUE) {
  # Handle missing values up front: drop them, or abort if na_rm = FALSE.
  na_rows <- which(is.na(dat))
  if (length(na_rows) > 0) {
    if (!na_rm) {
      rlang::abort("Missing values in data. See `na_rm` option")
    }
    dat <- dat[-na_rows]
  }

  eps <- 0.001

  # Too few distinct values: no stable lambda estimate is possible.
  if (length(unique(dat)) < num_unique) {
    return(NA)
  }

  # Partition indices by sign once, so the objective does not redo it on
  # every optimizer iteration; `const` is the Jacobian term of the
  # log-likelihood, which does not depend on lambda.
  neg_mask <- dat < 0
  ind_neg <- list(is = which(neg_mask), not = which(!neg_mask))
  const <- sum(sign(dat) * log(abs(dat) + 1))

  opt <- optimize(yj_obj, interval = limits, maximum = TRUE,
                  dat = dat, ind_neg = ind_neg, const = const, tol = 1e-04)
  lam <- opt$maximum

  # A maximum at (or within eps of) the boundary is treated as failure.
  if (abs(limits[1] - lam) <= eps || abs(limits[2] - lam) <= eps) {
    lam <- NA
  }
  lam
}

# Thin adapter handed to optimize(): evaluates the Yeo-Johnson
# log-likelihood at a candidate lambda. NA handling is done by the caller.
yj_obj <- function(lam, dat, ind_neg, const) {
  ll_yj(lambda = lam, y = dat, ind_neg = ind_neg, const = const)
}

# Profile log-likelihood of the Yeo-Johnson transform at `lambda`, up to an
# additive constant. `const` is the lambda-independent Jacobian term
# precomputed by the caller; `ind_neg` caches the sign partition of `y`.
# `eps` is unused here but kept for interface compatibility (it is consumed
# by yj_transform's default).
ll_yj <- function(lambda, y, ind_neg, const, eps = 0.001) {
  n <- length(y)
  y_t <- yj_transform(y, lambda, ind_neg)
  # Biased (maximum-likelihood) variance: sample variance scaled by (n-1)/n.
  # The original also computed mean(y_t) into an unused variable; that dead
  # O(n) pass per optimizer iteration has been removed.
  var_t <- var(y_t) * (n - 1) / n
  -0.5 * n * log(var_t) + (lambda - 1) * const
}

# Apply the Yeo-Johnson transform elementwise for a fixed lambda.
# `ind_neg` is a list with the precomputed indices of negative (`is`) and
# non-negative (`not`) entries, so the sign split is not recomputed on every
# optimizer iteration. An NA lambda returns the input unchanged. Near the
# removable singularities (lambda ~ 0 for non-negatives, lambda ~ 2 for
# negatives, within `eps`) the log-limit form is used.
yj_transform <- function(x, lambda, ind_neg, eps = 0.001) {
  if (is.na(lambda)) {
    return(x)
  }

  # Flatten data-frame-like input down to a bare vector.
  if (!inherits(x, "tbl_df") || is.data.frame(x)) {
    x <- unlist(x, use.names = FALSE)
  } else if (!is.vector(x)) {
    x <- as.vector(x)
  }

  pos_idx <- ind_neg[["not"]]
  neg_idx <- ind_neg[["is"]]

  if (length(pos_idx) > 0) {
    xp <- x[pos_idx]
    x[pos_idx] <- if (abs(lambda) < eps) {
      log(xp + 1)
    } else {
      ((xp + 1)^lambda - 1) / lambda
    }
  }

  if (length(neg_idx) > 0) {
    xn <- x[neg_idx]
    x[neg_idx] <- if (abs(lambda - 2) < eps) {
      -log(-xn + 1)
    } else {
      -((-xn + 1)^(2 - lambda) - 1) / (2 - lambda)
    }
  }

  x
}

# Benchmark every variant side by side; check = FALSE because return
# shapes differ across implementations.
results <- bench::mark(
  bn_yj = bn_yj(df$x),
  rec_yj = rec_yj(rec1),
  new_rec_yj = new_rec_yj(df$x),
  bn_bc = bn_bc(df$x),
  rec_bc = rec_bc(rec2),
  iterations = 5,
  check = FALSE
)

results
#  A tibble: 5 × 13
#   expression   min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory time 
#   <bch:expr> <bch> <bch:>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list> <lis>
# 1 bn_yj      1.35s   1.5s     0.675   747.7MB     3.24     5    24       7.4s <NULL> <Rpro… <ben…
# 2 rec_yj     2.56s  2.66s     0.375    2.56GB     5.85     5    78     13.33s <NULL> <Rpro… <ben…
# 3 new_rec_yj  1.2s  1.21s     0.819  672.57MB     2.13     5    13      6.11s <NULL> <Rpro… <ben…
# 4 bn_bc         1s  1.04s     0.947  419.73MB     1.71     5     9      5.28s <NULL> <Rpro… <ben…
# 5 rec_bc     1.54s  1.58s     0.635 1000.25MB     3.17     5    25      7.88s <NULL> <Rpro… <ben…

# ensure correct
# Fix: the original passed the bare, step-less `rec` here; it only "worked"
# because rec_bc()/rec_yj() ignore their argument and use the globals
# rec2/rec1. Pass the matching stepped recipes so the calls read correctly
# (behavior is identical today, but this survives a future fix of those
# functions).
x1 <- bn_bc(df$x)
x2 <- rec_bc(rec2)

all.equal(x1$x.t, x2$x)
# TRUE
x1 <- bn_yj(df$x)
x2 <- rec_yj(rec1)

all.equal(x1$x.t, x2$x)
# TRUE

@kadyb
Copy link
Author

kadyb commented Sep 13, 2021

Thanks, I can confirm that there is an improvement!

# Baseline: benchmark the CRAN release of {bestNormalize}.
set.seed(123)
df <- data.frame(x = rgamma(2e6, 1, 1))

install.packages("bestNormalize")
library("bestNormalize")

old_version <- bench::mark(
  YJ_old = yeojohnson(df$x, standardize = FALSE, lower = -5, upper = 5),
  BC_old = boxcox(df$x, standardize = FALSE, lower = -5, upper = 5),
  iterations = 10,
  check = FALSE
)

old_version
#>   expression     min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:t> <bch:>     <dbl> <bch:byt>    <dbl>
#> 1 YJ_old       6.53s  6.68s     0.149    1.55GB     1.53
#> 2 BC_old       7.68s  7.88s     0.124    2.03GB     1.81

#####
# devel version: swap in the GitHub build and re-run the same benchmark.
detach("package:bestNormalize", unload = TRUE)
devtools::install_github("petersonR/bestNormalize")
library("bestNormalize")

new_version <- bench::mark(
  YJ_new = yeojohnson(df$x, standardize = FALSE, lower = -5, upper = 5),
  BC_new = boxcox(df$x, standardize = FALSE, lower = -5, upper = 5),
  iterations = 10,
  check = FALSE
)

new_version
#>   expression     min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:t> <bch:>     <dbl> <bch:byt>    <dbl>
#> 1 YJ_new       4.92s  5.02s     0.196     732MB    0.707
#> 2 BC_new        4.7s  4.76s     0.210     420MB    0.503

@kadyb
Copy link
Author

kadyb commented Sep 28, 2021

There was an update in {recipes} and the timings for BoxCox transformation are now as follows:

#>   expression         min   median `itr/sec` mem_alloc `gc/sec`
#> 1 bestNormalize    4.67s    4.82s     0.209     420MB     1.59
#> 2 recipes          4.46s    4.49s     0.222     295MB    0.777

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants