-
Notifications
You must be signed in to change notification settings - Fork 7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
boxcox()
possible optimization
#10
Comments
Thank you for the suggestion. I've implemented a few improvements. Code:

set.seed(123)
# Benchmark fixture: 2M draws from Gamma(1, 1), plus thin wrappers around
# the {bestNormalize} transformations with a fixed lambda search window
# and standardization disabled.
df <- data.frame(x = rgamma(2e6, 1, 1))

# Yeo-Johnson via {bestNormalize}, raw transform only (no standardization).
bn_yj <- function(x) {
  yeojohnson(x, lower = -5, upper = 5, standardize = FALSE)
}

# Box-Cox via {bestNormalize}, raw transform only (no standardization).
bn_bc <- function(x) {
  boxcox(x, lower = -5, upper = 5, standardize = FALSE)
}
# Recipe objects: a base recipe on df, plus one variant per transformation
# step for the {recipes} benchmarks below.
rec <- recipes::recipe(~ ., data = df)
rec1 <- recipes::step_YeoJohnson(rec, x)
rec2 <- recipes::step_BoxCox(rec, x)
# Prep-and-bake helper for the Yeo-Johnson recipe.
#
# FIX: the original ignored its `rec` argument and always prepped the
# global `rec1`. The parameter is now actually used, so the intended call
# rec_yj(rec1) behaves identically while other recipes are no longer
# silently swapped for rec1.
#
# NOTE(review): `df` is still read from the calling environment, as in
# the original script.
rec_yj <- function(rec) {
  estimates <- recipes::prep(rec, training = df, retain = TRUE)
  recipes::bake(estimates, new_data = NULL)
}
# Prep-and-bake helper for the Box-Cox recipe.
#
# FIX: the original ignored its `rec` argument and always prepped the
# global `rec2`. The parameter is now actually used, so the intended call
# rec_bc(rec2) behaves identically while other recipes are no longer
# silently swapped for rec2.
#
# NOTE(review): `df` is still read from the calling environment, as in
# the original script.
rec_bc <- function(rec) {
  estimates <- recipes::prep(rec, training = df, retain = TRUE)
  recipes::bake(estimates, new_data = NULL)
}
## New implementation from https://github.com/tidymodels/recipes/issues/782
# Estimate the Yeo-Johnson lambda for a numeric vector by maximizing the
# profile log-likelihood over `limits` (implementation proposed in
# tidymodels/recipes#782).
#
# dat        numeric vector to estimate lambda for.
# limits     search interval for lambda; an estimate that lands on either
#            boundary (within eps) is treated as a failure and returns NA.
# num_unique minimum number of distinct values required; fewer returns NA.
# na_rm      if TRUE drop missing values, otherwise abort.
#
# Returns the estimated lambda, or NA when estimation is degenerate.
#
# FIXES vs. original: scalar `if` conditions now use the short-circuit
# `||` instead of the vectorized `|`, and all multi-line `if` bodies are
# braced.
new_rec_yj <- function(dat, limits = c(-5, 5), num_unique = 5,
                       na_rm = TRUE) {
  na_rows <- which(is.na(dat))
  if (length(na_rows) > 0) {
    if (na_rm) {
      dat <- dat[-na_rows]
    } else {
      rlang::abort("Missing values in data. See `na_rm` option")
    }
  }
  eps <- 0.001
  # Too few distinct values: the likelihood surface is degenerate.
  if (length(unique(dat)) < num_unique) {
    return(NA)
  }
  # Split sign indices once so the objective does not recompute the mask
  # on every optimizer iteration.
  dat_neg <- dat < 0
  ind_neg <- list(is = which(dat_neg), not = which(!dat_neg))
  # Lambda-independent part of the log-likelihood, hoisted out of the
  # optimizer loop.
  const <- sum(sign(dat) * log(abs(dat) + 1))
  res <- optimize(yj_obj, interval = limits, maximum = TRUE,
                  dat = dat, ind_neg = ind_neg, const = const, tol = 1e-04)
  lam <- res$maximum
  # A maximum pinned to either boundary means the search failed.
  if (abs(limits[1] - lam) <= eps || abs(limits[2] - lam) <= eps) {
    lam <- NA
  }
  lam
}
# Objective handed to optimize(): the Yeo-Johnson profile log-likelihood
# at a candidate lambda. NA filtering is the caller's responsibility
# (new_rec_yj already removed missing values).
yj_obj <- function(lam, dat, ind_neg, const) {
  ll_yj(lambda = lam, y = dat, ind_neg = ind_neg, const = const)
}
# Profile log-likelihood of the Yeo-Johnson transform at `lambda` (up to
# additive constants): -n/2 * log(sigma^2_MLE) + (lambda - 1) * const,
# where const = sum(sign(y) * log(|y| + 1)) is precomputed by the caller.
#
# FIX: the original computed `mu_t <- mean(y_t)` but never used it; that
# dead assignment (an extra O(n) pass over the data) is removed.
ll_yj <- function(lambda, y, ind_neg, const, eps = 0.001) {
  n <- length(y)
  y_t <- yj_transform(y, lambda, ind_neg)
  # MLE variance uses the 1/n denominator, hence the (n - 1)/n rescale
  # of var()'s unbiased estimate.
  var_t <- var(y_t) * (n - 1) / n
  -0.5 * n * log(var_t) + (lambda - 1) * const
}
# Apply the Yeo-Johnson transformation to `x` at a given `lambda`.
#
# x       numeric vector (data-frame-like input is flattened first).
# lambda  transformation parameter; NA means "estimation failed", and the
#         data are passed through unchanged.
# ind_neg list with components `is` (indices of negative values) and
#         `not` (indices of non-negative values), precomputed by the caller.
# eps     tolerance around the singular points lambda = 0 and lambda = 2,
#         where the log-limit form is used instead.
yj_transform <- function(x, lambda, ind_neg, eps = 0.001) {
  # An NA lambda signals a failed estimate: return the input untouched.
  if (is.na(lambda)) {
    return(x)
  }
  # Flatten data-frame-like input to a plain unnamed vector.
  if (!inherits(x, "tbl_df") || is.data.frame(x)) {
    x <- unlist(x, use.names = FALSE)
  } else if (!is.vector(x)) {
    x <- as.vector(x)
  }
  pos <- ind_neg[["not"]]
  neg <- ind_neg[["is"]]
  # Non-negative branch: ((x + 1)^lambda - 1) / lambda, with the
  # log(x + 1) limit near lambda = 0.
  if (length(pos) > 0) {
    xp <- x[pos]
    x[pos] <- if (abs(lambda) < eps) {
      log(xp + 1)
    } else {
      ((xp + 1)^lambda - 1) / lambda
    }
  }
  # Negative branch: -((-x + 1)^(2 - lambda) - 1) / (2 - lambda), with
  # the -log(-x + 1) limit near lambda = 2.
  if (length(neg) > 0) {
    xn <- x[neg]
    x[neg] <- if (abs(lambda - 2) < eps) {
      -log(-xn + 1)
    } else {
      -((-xn + 1)^(2 - lambda) - 1) / (2 - lambda)
    }
  }
  x
}
# Compare all implementations ----
# check = FALSE because each implementation returns a different object
# type; numerical agreement is verified separately below.
results <- bench::mark(
  iterations = 5, check = FALSE,
  bn_yj = bn_yj(df$x),
  rec_yj = rec_yj(rec1),
  new_rec_yj = new_rec_yj(df$x),
  bn_bc = bn_bc(df$x),
  rec_bc = rec_bc(rec2)
)
results
# A tibble: 5 × 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time
# <bch:expr> <bch> <bch:> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <lis>
# 1 bn_yj 1.35s 1.5s 0.675 747.7MB 3.24 5 24 7.4s <NULL> <Rpro… <ben…
# 2 rec_yj 2.56s 2.66s 0.375 2.56GB 5.85 5 78 13.33s <NULL> <Rpro… <ben…
# 3 new_rec_yj 1.2s 1.21s 0.819 672.57MB 2.13 5 13 6.11s <NULL> <Rpro… <ben…
# 4 bn_bc 1s 1.04s 0.947 419.73MB 1.71 5 9 5.28s <NULL> <Rpro… <ben…
# 5 rec_bc 1.54s 1.58s 0.635 1000.25MB 3.17 5 25 7.88s <NULL> <Rpro… <ben…
# Ensure the {recipes} paths match the bestNormalize output ----
#
# FIX: the original passed the bare recipe `rec` to rec_bc()/rec_yj();
# that only "worked" because those helpers ignored their argument and
# used the globals rec2/rec1 anyway. Pass the step recipes explicitly so
# the comparison is meaningful regardless of the helper implementation.
x1 <- bn_bc(df$x)
x2 <- rec_bc(rec2)
all.equal(x1$x.t, x2$x)
# TRUE
x1 <- bn_yj(df$x)
x2 <- rec_yj(rec1)
all.equal(x1$x.t, x2$x)
# TRUE
Thanks, I can confirm that there is an improvement!

set.seed(123)
# CRAN (released) version benchmark ----
df <- data.frame(x = rgamma(2e6, 1, 1))
# NOTE: installing inside a script is kept as in the original reproducer;
# it re-fetches the released build before timing it.
install.packages("bestNormalize")
library("bestNormalize")
old_version <- bench::mark(
  iterations = 10, check = FALSE,
  YJ_old = yeojohnson(df$x, standardize = FALSE, lower = -5, upper = 5),
  BC_old = boxcox(df$x, standardize = FALSE, lower = -5, upper = 5)
)
old_version
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:t> <bch:> <dbl> <bch:byt> <dbl>
#> 1 YJ_old 6.53s 6.68s 0.149 1.55GB 1.53
#> 2 BC_old 7.68s 7.88s 0.124 2.03GB 1.81
#####
# Development version benchmark: swap in the GitHub build and re-run the
# same timings with identical settings.
detach("package:bestNormalize", unload = TRUE)
devtools::install_github("petersonR/bestNormalize")
library("bestNormalize")
new_version <- bench::mark(
  iterations = 10, check = FALSE,
  YJ_new = yeojohnson(df$x, standardize = FALSE, lower = -5, upper = 5),
  BC_new = boxcox(df$x, standardize = FALSE, lower = -5, upper = 5)
)
new_version
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:t> <bch:> <dbl> <bch:byt> <dbl>
#> 1 YJ_new 4.92s 5.02s 0.196 732MB 0.707
#> 2 BC_new       4.7s   4.76s     0.210     420MB    0.503
There was an update in the meantime; the latest comparison gives:

#> expression        min  median `itr/sec` mem_alloc `gc/sec`
#> 1 bestNormalize  4.67s  4.82s     0.209     420MB    1.59
#> 2 recipes        4.46s  4.49s     0.222     295MB    0.777
I compared two implementations of the Box-Cox transformation from the
{recipes} and {bestNormalize} packages. Package {recipes} is faster and
allocates less memory (about half). Maybe some improvements can be made
in {bestNormalize}? Here is the source code for the function from
{recipes}: https://github.com/tidymodels/recipes/blob/master/R/BoxCox.R

The text was updated successfully, but these errors were encountered: