Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

boxcox() possible optimization #10

Closed
kadyb opened this issue Sep 13, 2021 · 3 comments
Closed

boxcox() possible optimization #10

kadyb opened this issue Sep 13, 2021 · 3 comments

Comments

@kadyb
Copy link

kadyb commented Sep 13, 2021

I compared two implementations of the BoxCox transformation from the {recipes} and {bestNormalize} packages. Package {recipes} is faster and allocates less memory (half). Maybe some improvements can be made in {bestNormalize}? Here is the source code for the function from {recipes}: https://github.com/tidymodels/recipes/blob/master/R/BoxCox.R.

# Reproducible benchmark data: 2M gamma-distributed (strictly positive) draws.
set.seed(123)
df <- data.frame(x = rgamma(2e6, 1, 1))

# Fit a Box-Cox transform on a raw numeric vector via {bestNormalize}.
f1 <- function(x) {
  bestNormalize::boxcox(
    x,
    standardize = FALSE,
    lower = -5,
    upper = 5
  )
}

# Fit and apply a recipes-based transform.
# NOTE(review): despite its name, `x` is expected to be a recipe object
# (see the `f2(rec)` call below), not a numeric vector.
f2 <- function(x) {
  fitted <- recipes::prep(x, training = df, retain = TRUE)
  recipes::bake(fitted, new_data = NULL)
}

# Build a one-step Box-Cox recipe for column `x`.
rec <- recipes::recipe(~ ., data = df)
rec <- recipes::step_BoxCox(rec, x)

# Time both implementations; check = FALSE because the two return
# differently-shaped objects.
results <- bench::mark(
  bestNormalize = f1(df$x),
  recipes = f2(rec),
  iterations = 10,
  check = FALSE
)

results
#>   expression         min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>    <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 bestNormalize    7.49s    7.66s     0.130    2.04GB    1.65
#> 2 recipes          5.77s    5.98s     0.167 1004.42MB    0.902
@petersonR
Copy link
Owner

Thank you for the suggestion. I've implemented a few improvements to boxcox and yeojohnson, which bring the two packages much closer in both memory use and computation time. bestNormalize::boxcox in fact now uses about half as much memory as recipes — I'm guessing because of the line `n <- length(y)` inside recipes' likelihood function, which is re-evaluated on every optimizer iteration and could be hoisted out.

Code:

# Same reproducible data as the original report: 2M positive gamma draws.
set.seed(123)
df <- data.frame(x = rgamma(2e6, 1, 1))

# Yeo-Johnson via {bestNormalize}, raw (unstandardized) transform.
bn_yj <- function(x) {
  yeojohnson(
    x,
    standardize = FALSE,
    lower = -5,
    upper = 5
  )
}

# Box-Cox via {bestNormalize}, raw (unstandardized) transform.
bn_bc <- function(x) {
  boxcox(
    x,
    standardize = FALSE,
    lower = -5,
    upper = 5
  )
}

# One base recipe, specialized into a Yeo-Johnson and a Box-Cox variant.
rec <- recipes::recipe(~ ., data = df)
rec1 <- recipes::step_YeoJohnson(rec, x)
rec2 <- recipes::step_BoxCox(rec, x)

# Prep + bake the Yeo-Johnson recipe.
# NOTE(review): the `rec` argument is IGNORED — this closes over the global
# `rec1` defined above. The L185-style call `rec_yj(rec)` only works because
# of that; confirm this is intentional before reusing.
rec_yj <- function(rec) {
  fitted <- recipes::prep(rec1, training = df, retain = TRUE)
  recipes::bake(fitted, new_data = NULL)
}

# Prep + bake the Box-Cox recipe.
# NOTE(review): the `rec` argument is IGNORED — this closes over the global
# `rec2` defined above; confirm this is intentional before reusing.
rec_bc <- function(rec) {
  fitted <- recipes::prep(rec2, training = df, retain = TRUE)
  recipes::bake(fitted, new_data = NULL)
}

## New implementation from https://github.com/tidymodels/recipes/issues/782
# Estimate the Yeo-Johnson lambda for a numeric vector by maximizing the
# profile log-likelihood over `limits`. Returns NA when there are too few
# distinct values or when the optimum lands on the search boundary.
new_rec_yj <- function(dat, limits = c(-5, 5), num_unique = 5, 
                        na_rm = TRUE) {
  # Handle missing values up front: drop them, or abort if na_rm = FALSE.
  na_rows <- which(is.na(dat))
  if (length(na_rows) > 0) {
    if (!na_rm) {
      rlang::abort("Missing values in data. See `na_rm` option")
    }
    dat <- dat[-na_rows]
  }

  eps <- 0.001

  # Too few distinct values: no stable lambda estimate is possible.
  if (length(unique(dat)) < num_unique) {
    return(NA)
  }

  # Partition indices by sign once, so the objective does not redo it on
  # every optimizer iteration; `const` is the Jacobian term of the
  # log-likelihood, which does not depend on lambda.
  neg_mask <- dat < 0
  ind_neg <- list(is = which(neg_mask), not = which(!neg_mask))
  const <- sum(sign(dat) * log(abs(dat) + 1))

  opt <- optimize(yj_obj, interval = limits, maximum = TRUE,
                  dat = dat, ind_neg = ind_neg, const = const, tol = 1e-04)
  lam <- opt$maximum

  # A maximum at (or within eps of) the boundary is treated as failure.
  if (abs(limits[1] - lam) <= eps || abs(limits[2] - lam) <= eps) {
    lam <- NA
  }
  lam
}

# Thin adapter handed to optimize(): evaluates the Yeo-Johnson
# log-likelihood at a candidate lambda. NA handling is done by the caller.
yj_obj <- function(lam, dat, ind_neg, const) {
  ll_yj(lambda = lam, y = dat, ind_neg = ind_neg, const = const)
}

# Profile log-likelihood of the Yeo-Johnson transform at `lambda`, up to an
# additive constant. `const` is the lambda-independent Jacobian term
# precomputed by the caller; `ind_neg` caches the sign partition of `y`.
# `eps` is unused here but kept for interface compatibility (it is consumed
# by yj_transform's default).
ll_yj <- function(lambda, y, ind_neg, const, eps = 0.001) {
  n <- length(y)
  y_t <- yj_transform(y, lambda, ind_neg)
  # Biased (maximum-likelihood) variance: sample variance scaled by (n-1)/n.
  # The original also computed mean(y_t) into an unused variable; that dead
  # O(n) pass per optimizer iteration has been removed.
  var_t <- var(y_t) * (n - 1) / n
  -0.5 * n * log(var_t) + (lambda - 1) * const
}

# Apply the Yeo-Johnson transform elementwise for a fixed lambda.
# `ind_neg` is a list with the precomputed indices of negative (`is`) and
# non-negative (`not`) entries, so the sign split is not recomputed on every
# optimizer iteration. An NA lambda returns the input unchanged. Near the
# removable singularities (lambda ~ 0 for non-negatives, lambda ~ 2 for
# negatives, within `eps`) the log-limit form is used.
yj_transform <- function(x, lambda, ind_neg, eps = 0.001) {
  if (is.na(lambda)) {
    return(x)
  }

  # Flatten data-frame-like input down to a bare vector.
  if (!inherits(x, "tbl_df") || is.data.frame(x)) {
    x <- unlist(x, use.names = FALSE)
  } else if (!is.vector(x)) {
    x <- as.vector(x)
  }

  pos_idx <- ind_neg[["not"]]
  neg_idx <- ind_neg[["is"]]

  if (length(pos_idx) > 0) {
    xp <- x[pos_idx]
    x[pos_idx] <- if (abs(lambda) < eps) {
      log(xp + 1)
    } else {
      ((xp + 1)^lambda - 1) / lambda
    }
  }

  if (length(neg_idx) > 0) {
    xn <- x[neg_idx]
    x[neg_idx] <- if (abs(lambda - 2) < eps) {
      -log(-xn + 1)
    } else {
      -((-xn + 1)^(2 - lambda) - 1) / (2 - lambda)
    }
  }

  x
}

# Benchmark every variant side by side; check = FALSE because return
# shapes differ across implementations.
results <- bench::mark(
  bn_yj = bn_yj(df$x),
  rec_yj = rec_yj(rec1),
  new_rec_yj = new_rec_yj(df$x),
  bn_bc = bn_bc(df$x),
  rec_bc = rec_bc(rec2),
  iterations = 5,
  check = FALSE
)

results
#  A tibble: 5 × 13
#   expression   min median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result memory time 
#   <bch:expr> <bch> <bch:>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list> <list> <lis>
# 1 bn_yj      1.35s   1.5s     0.675   747.7MB     3.24     5    24       7.4s <NULL> <Rpro… <ben…
# 2 rec_yj     2.56s  2.66s     0.375    2.56GB     5.85     5    78     13.33s <NULL> <Rpro… <ben…
# 3 new_rec_yj  1.2s  1.21s     0.819  672.57MB     2.13     5    13      6.11s <NULL> <Rpro… <ben…
# 4 bn_bc         1s  1.04s     0.947  419.73MB     1.71     5     9      5.28s <NULL> <Rpro… <ben…
# 5 rec_bc     1.54s  1.58s     0.635 1000.25MB     3.17     5    25      7.88s <NULL> <Rpro… <ben…

# ensure correct
# Fix: the original passed the bare, step-less `rec` here; it only "worked"
# because rec_bc()/rec_yj() ignore their argument and use the globals
# rec2/rec1. Pass the matching stepped recipes so the calls read correctly
# (behavior is identical today, but this survives a future fix of those
# functions).
x1 <- bn_bc(df$x)
x2 <- rec_bc(rec2)

all.equal(x1$x.t, x2$x)
# TRUE
x1 <- bn_yj(df$x)
x2 <- rec_yj(rec1)

all.equal(x1$x.t, x2$x)
# TRUE

@kadyb
Copy link
Author

kadyb commented Sep 13, 2021

Thanks, I can confirm that there is an improvement!

# Baseline: benchmark the CRAN release of {bestNormalize}.
set.seed(123)
df <- data.frame(x = rgamma(2e6, 1, 1))

install.packages("bestNormalize")
library("bestNormalize")

old_version <- bench::mark(
  YJ_old = yeojohnson(df$x, standardize = FALSE, lower = -5, upper = 5),
  BC_old = boxcox(df$x, standardize = FALSE, lower = -5, upper = 5),
  iterations = 10,
  check = FALSE
)

old_version
#>   expression     min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:t> <bch:>     <dbl> <bch:byt>    <dbl>
#> 1 YJ_old       6.53s  6.68s     0.149    1.55GB     1.53
#> 2 BC_old       7.68s  7.88s     0.124    2.03GB     1.81

#####
# devel version: swap in the GitHub build and re-run the same benchmark.
detach("package:bestNormalize", unload = TRUE)
devtools::install_github("petersonR/bestNormalize")
library("bestNormalize")

new_version <- bench::mark(
  YJ_new = yeojohnson(df$x, standardize = FALSE, lower = -5, upper = 5),
  BC_new = boxcox(df$x, standardize = FALSE, lower = -5, upper = 5),
  iterations = 10,
  check = FALSE
)

new_version
#>   expression     min median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:t> <bch:>     <dbl> <bch:byt>    <dbl>
#> 1 YJ_new       4.92s  5.02s     0.196     732MB    0.707
#> 2 BC_new        4.7s  4.76s     0.210     420MB    0.503

@kadyb
Copy link
Author

kadyb commented Sep 28, 2021

There was an update in {recipes} and the timings for BoxCox transformation are now as follows:

#>   expression         min   median `itr/sec` mem_alloc `gc/sec`
#> 1 bestNormalize    4.67s    4.82s     0.209     420MB     1.59
#> 2 recipes          4.46s    4.49s     0.222     295MB    0.777

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants