-
Notifications
You must be signed in to change notification settings - Fork 32
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
mutate.
doesn't use lazy evaluation when .by
is used
#166
Comments
Unfortunately this one is going to have to stay as-is. Your first example works because tidytable uses a loop to evaluate each expression in turn. However, when you supply `.by`, the expressions are evaluated in a single grouped call, so later expressions can't refer to newly created columns. That being said, the documentation could definitely be updated to mention that this use case doesn't work. I'll be sure to add something to clarify this.
I think one way to solve this is, instead of looping, we do a chained `[` call.
There are a couple of cases to benchmark:
# Benchmark 1: ungrouped case. Compares a single `:=` call, two chained `:=`
# calls, and tidytable's mutate.() on a 1e6-row table. All three are within
# noise of each other, so chaining costs nothing without grouping.
Without pacman::p_load(tidytable, data.table)
data_size <- 1000000
# Random test data: one grouping-candidate column (a) and two numeric columns.
test_df <- tidytable(
a = sample(letters[1:5], data_size, TRUE),
b = sample(1:100, data_size, TRUE),
c = sample(1:100, data_size, TRUE)
)
test_dt <- as.data.table(test_df)
bench::mark(
dt_one_call = test_dt[, ':='(double_b = b * 2, double_c = c * 2)][],
dt_chained = test_dt[, double_b := b * 2][, double_c := c * 2][],
tidytable = mutate.(test_df, double_b = b * 2, double_c = c * 2),
check = FALSE, iterations = 50
)
#> # A tibble: 3 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dt_one_call 5.33ms 6.61ms 128. 16.9MB 92.7
#> 2 dt_chained 5.18ms 5.83ms 160. 15.3MB 89.9
#> 3 tidytable 5.74ms 6.33ms 146. 15.4MB 56.6 With pacman::p_load(tidytable, data.table)
# Benchmark 2: grouped (`.by` / `by =`) case. Chained `:=` calls must
# recompute the grouping on every call, which is why dt_chained is markedly
# slower here than the single-call form.
data_size <- 1000000
test_df <- tidytable(
a = sample(letters[1:5], data_size, TRUE),
b = sample(1:100, data_size, TRUE),
c = sample(1:100, data_size, TRUE)
)
test_dt <- as.data.table(test_df)
bench::mark(
dt_one_call = test_dt[, ':='(double_b = b * 2, double_c = c * 2), by = a][],
dt_chained = test_dt[, double_b := b * 2, by = a][, double_c := c * 2, by = a][],
tidytable = mutate.(test_df, double_b = b * 2, double_c = c * 2, .by = a),
check = FALSE, iterations = 50
)
#> # A tibble: 3 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dt_one_call 25.5ms 28.4ms 33.1 39MB 49.6
#> 2 dt_chained 39.8ms 44.8ms 21.2 30.6MB 29.2
#> 3 tidytable 29.5ms 33.2ms 29.3 38.1MB 137. So chaining would allow the user to refer to newly created columns, but it would also cost them tons of time. |
I can understand why chaining costs a lot of time when grouping (the groups are recalculated each time) bench::mark(
# Benchmark 3: "lazy" evaluation inside a SINGLE grouped `:=` call, by wrapping
# all expressions in one `{}` block that assigns locals sequentially and then
# returns `.(...)`. This lets later columns see earlier ones without paying the
# regrouping cost of chaining.
# NOTE(review): "test_dft" is presumably a typo for "test_dt" in this
# transcript — confirm before reusing.
dt_one_call_lazy = test_dft[, ':='(c("double_b", "double_c"),{double_b = b * 2; quad_b = double_b * 2; .(double_b, quad_b)}), by = a][],
dt_chained = test_dft[, double_b := b * 2, by = a][, quad_b := double_b * 2, by = a][],
tidytable = mutate.(mutate.(test_df, double_b = b, .by = a), quad_b = b * 2, .by = a),
check = FALSE, iterations = 50
)
#> # A tibble: 3 x 13
#> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#> 1 dt_one_call_with_lazy 42.8ms 47.6ms 21.1 23.7MB 17.9 27 23 1.28s
#> 2 dt_chained 54.7ms 61.2ms 16.3 37.5MB 17.6 24 26 1.47s
#> 3 tidytable 45ms 50.5ms 19.8 31.4MB 19.8 25 25 1.26s Here is my try at implementing this: mutate..fast.frame <- function(.df, ..., .by = NULL) {
# First draft of a mutate.() replacement (S3 method for a "fast.frame" class)
# that builds ONE data.table `:=` call instead of looping per expression, so
# later expressions can refer to columns created earlier in the same call.
# Coerce to tidytable and copy so the caller's table isn't modified by reference.
.df <- as_tidytable(.df)
.df <- data.table::copy(.df)
# Capture the mutate expressions and the grouping spec as quosures.
dots <- enquos(...)
.by <- enquo(.by)
all_names <- names(dots)
# Deparse each expression to text; flag `col = NULL` (column deletions) separately.
dots_text <- sapply(dots, quo_text)
null_flag <- sapply(dots, quo_is_null)
# Pull the first bare identifier (one not followed by "(") out of each
# deparsed expression — a cheap proxy for "which columns does it reference".
# NOTE(review): only the FIRST match per expression is extracted, so an
# expression that references a newly created column after another symbol may
# be missed — confirm before relying on this in general.
str_extract(dots_text, "[\\w.\\d]+(?![\\w.\\d]*\\()", perl=T) -> vars_in
# TRUE for expressions that do NOT reference any column created in this call.
sapply(vars_in, function(.x) all(.x %notin% all_names)) -> vars_not_in
if (quo_is_null(.by)) {
# Faster version if there is no "by" provided
# Evaluation environment: the first quosure's environment plus the table.
data_env <- env(quo_get_env(dots[[1]]), .df = .df)
if(all(vars_not_in)){
# Fast path: no inter-expression dependencies — one `:=` call handles all.
.df <- tidytable:::eval_quo(
.df[, ':='(!!!dots[!null_flag])],
new_data_mask(data_env), env = caller_env()
)
}else{
# Lazy path: paste the expressions into one `{}` block that assigns each
# result to a local, then returns `.(...)` so `:=` receives every column
# from a single evaluation (later expressions see earlier locals).
dots_expr <- parse_expr(paste0("{", paste0(sprintf("%s <- %s", all_names[!null_flag], dots_text[!null_flag]), collapse="\n"),"\n.(",paste0(unique(all_names[!null_flag]), collapse=", "),")}"))
.df <- tidytable:::eval_quo(
.df[, ':='(!!unique(all_names[!null_flag]), !!dots_expr)],
new_data_mask(data_env), env = caller_env()
)
}
} else {
# Faster with "by", since the "by" call isn't looped multiple times for each column added
# Resolve the grouping spec to character column names via tidyselect.
.by <- tidytable:::select_vec_chr(.df, !!.by)
data_env <- env(quo_get_env(dots[[1]]), .df = .df, .by = .by)
if(all(vars_not_in)){
# Grouped fast path: one grouped `:=` call.
.df <- tidytable:::eval_quo(
.df[, ':='(!!!dots[!null_flag]), by= .by],
new_data_mask(data_env), env = caller_env()
)
}else{
# Grouped lazy path: same string-built `{}` trick, evaluated per group.
dots_expr <- parse_expr(paste0("{", paste0(sprintf("%s <- %s", all_names[!null_flag], dots_text[!null_flag]), collapse="\n"),"\n.(",paste0(unique(all_names[!null_flag]), collapse=", "),")}"))
.df <- tidytable:::eval_quo(
.df[, ':='(!!unique(all_names[!null_flag]), !!dots_expr), by= .by],
new_data_mask(data_env), env = caller_env()
)
}
}
# Apply `col = NULL` deletions in a separate, ungrouped call (a grouped
# NULL `:=` errors in data.table — see the reprex later in this thread).
# NOTE(review): repeated names mixing NULL and non-NULL (e.g. z = 1,
# z = NULL, z = .N) are not handled by this draft; the "final version"
# below addresses that.
if(any(null_flag)){
.df <- tidytable:::eval_quo(
.df[, ':='(!!!dots[null_flag])],
new_data_mask(data_env), env = caller_env()
)
}
# Return with data.table's trailing `[]` so the result prints immediately.
.df[]
}
# Benchmark 4: compare the fast.frame method against stock tidytable on a
# small (10 x 20) table, with and without `.by`, with and without expressions
# that depend on columns created earlier in the same call ("lazy").
library(data.table)
test_df <- as_tidytable(matrix(sample(1:10, 200, replace=T), ncol= 20))
# Copy, then tag the copy with the "fast.frame" class so S3 dispatch picks
# mutate..fast.frame for fast_df and the stock method for test_df.
copy(test_df) -> fast_df
class(fast_df) <- c( "fast.frame", class(fast_df))
bench::mark(
fast_lazy = fast_df %>% mutate.(m=cumsum(V1), z=mean(V2)-m, l=sum(V20)-z, z=NULL),
tidytable_lazy = test_df %>% mutate.(m=cumsum(V1), z=mean(V2)-m, l=sum(V20)-z, z=NULL),
fast = fast_df %>% mutate.(m=cumsum(V1), z=mean(V2), l=sum(V20)),
tidytable = test_df %>% mutate.(m=cumsum(V1), z=mean(V2), l=sum(V20)),
fast_lazy_by = fast_df %>% mutate.(m=cumsum(V1), z=mean(V2)-m, l=sum(V20)-z, .by=V10),
tidytable_lazy_by = test_df %>% mutate.(m=cumsum(V1), .by=V10) %>% mutate.(z=mean(V2)-m, .by=V10) %>% mutate.(l=sum(V20)-z, .by=V10),
fast_by = fast_df %>% mutate.(m=cumsum(V1), z=mean(V2), l=sum(V20), .by=V10),
tidytable_by = test_df %>% mutate.(m=cumsum(V1), z=mean(V2), l=sum(V20), .by=V10),
check=F, iterations=100)
#> # A tibble: 8 x 13
#> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#> 1 fast_lazy 2.07ms 2.28ms 410. 91.7KB 8.36 98 2 239ms
#> 2 tidytable_lazy 2.1ms 2.56ms 345. 160.6KB 10.7 97 3 281ms
#> 3 fast 1.4ms 1.78ms 492. 62.2KB 4.97 99 1 201ms
#> 4 tidytable 1.68ms 2.27ms 395. 126.1KB 8.06 98 2 248ms
#> 5 fast_lazy_by 2.83ms 3.07ms 305. 88.9KB 9.43 97 3 318ms
#> 6 tidytable_lazy_by 6.35ms 8.53ms 111. 201.7KB 7.07 94 6 849ms
#> 7 fast_by 2.57ms 3.15ms 281. 85.4KB 5.73 98 2 349ms
#> 8 tidytable_by 2.15ms 2.34ms 410. 61.4KB 8.37 98 2 239ms |
I haven't had a chance to look at this too in depth, but it's worth noting that the call to fast_df %>%
mutate.(V1 = 1) You'll see that fast_df$V1 becomes overwritten as 1 even without assignment. |
# Benchmark 5: same comparison as above, rerun on a larger (1000 x 20) table
# with 1000 iterations, cross-checked with microbenchmark.
I replaced pacman::p_load(tidytable, data.table)
test_df <- as_tidytable(matrix(sample(1:1000, 20000, replace=T), ncol= 20))
# Tag a copy with the "fast.frame" class so dispatch selects the new method.
copy(test_df) -> fast_df
class(fast_df) <- c( "fast.frame", class(fast_df))
bench::mark(
fast_lazy = fast_df %>% mutate.(m=cumsum(V1), z=mean(V2)-m, l=sum(V20)-z, z=NULL),
tidytable_lazy = test_df %>% mutate.(m=cumsum(V1), z=mean(V2)-m, l=sum(V20)-z, z=NULL),
fast = fast_df %>% mutate.(m=cumsum(V1), z=mean(V2), l=sum(V20)),
tidytable = test_df %>% mutate.(m=cumsum(V1), z=mean(V2), l=sum(V20)),
fast_lazy_by = fast_df %>% mutate.(m=cumsum(V1), z=mean(V2)-m, l=sum(V20)-z, .by=V10),
tidytable_lazy_by = test_df %>% mutate.(m=cumsum(V1), .by=V10) %>% mutate.(z=mean(V2)-m, .by=V10) %>% mutate.(l=sum(V20)-z, .by=V10),
fast_by = fast_df %>% mutate.(m=cumsum(V1), z=mean(V2), l=sum(V20), .by=V10),
tidytable_by = test_df %>% mutate.(m=cumsum(V1), z=mean(V2), l=sum(V20), .by=V10),
check=F, iterations=1000)
#> # A tibble: 8 x 13
#> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#> 1 fast_lazy 1.77ms 2.19ms 387. 228.4KB 5.09 987 13 2.55s
#> 2 tidytable_lazy 2.09ms 2.51ms 337. 180KB 5.14 985 15 2.92s
#> 3 fast 1.04ms 1.22ms 723. 159.5KB 5.09 993 7 1.37s
#> 4 tidytable 1.7ms 2.22ms 387. 141.5KB 5.09 987 13 2.55s
#> 5 fast_lazy_by 6.63ms 7.05ms 136. 205.4KB 6.86 952 48 7s
#> 6 tidytable_lazy_by 10.29ms 11.05ms 85.6 260.3KB 6.34 931 69 10.88s
#> 7 fast_by 3.4ms 3.63ms 251. 198KB 5.65 978 22 3.9s
#> 8 tidytable_by 3.32ms 3.5ms 268. 96.1KB 5.76 979 21 3.65s
## same but with microbenchmark
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> fast_lazy 1.792592 1.951818 2.201979 2.029412 2.153349 36.161678 1000 b
#> tidytable_lazy 2.111174 2.257271 2.533366 2.347788 2.479777 13.400589 1000 c
#> fast 1.048814 1.169453 1.319120 1.234040 1.329413 7.828332 1000 a
#> tidytable 1.716752 1.848088 2.265266 1.932467 2.066553 184.878610 1000 bc
#> fast_lazy_by 6.543140 6.813625 7.453086 6.946508 7.336132 16.891461 1000 e
#> tidytable_lazy_by 10.242606 10.657886 11.708278 10.889841 11.734166 40.886979 1000 f
#> fast_by 3.336612 3.522397 3.819963 3.601099 3.748503 12.468878 1000 d
#> tidytable_by 3.277034 3.442115 3.753222 3.527770 3.709806 10.903940 1000 d Note : I reupdated the code above as the datamask didn't contain
test_df %>% tidytable::mutate.(z=1,z=NULL, z=.N, .by=V1)
#> Error in `[.data.table`(.df, , `:=`(z = 1, z = NULL, z = .N), by = .by) :
#> RHS is NULL when grouping :=. Makes no sense to delete a column by group. Perhaps use an empty vector instead. the final versionIt correctly handles all the issues mentioned above the mutate..fast.frame <- function(.df, ..., .by = NULL) {
# Final revision of the single-call mutate.() method. Same strategy as the
# first draft (one `:=` call; a string-built `{}` block when expressions
# depend on columns created earlier in the call), plus two fixes:
#   * duplicate names (x = ..., x = ...) are deduplicated, keeping the LAST
#     occurrence of each name;
#   * `col = NULL` deletions are applied last and only when the final
#     occurrence of a name is NULL, avoiding data.table's grouped-NULL error
#     reproduced just above.
# Copy so the input is not modified by reference (the issue flagged earlier
# in the thread with fast_df$V1 being overwritten).
.df <- as_tidytable(.df)
.df <- data.table::copy(.df)
# Capture expressions and grouping spec as quosures.
dots <- enquos(...)
.by <- enquo(.by)
all_names <- names(dots)
# Deparsed expression text, and a flag per expression for `col = NULL`.
dots_text <- sapply(dots, quo_text)
null_flag <- sapply(dots, quo_is_null)
# First bare identifier referenced by each expression (regex heuristic).
str_extract(dots_text, "[\\w.\\d]+(?![\\w.\\d]*\\()", perl=T) -> vars_in
# TRUE when an expression references no column created in this call.
sapply(vars_in, function(.x) all(.x %notin% all_names)) -> vars_not_in
# non_null: names of the non-NULL expressions; uni_non_null: deduplicated.
uni_non_null <- unique(non_null <- all_names[!null_flag])
if (quo_is_null(.by)) {
# Faster version if there is no "by" provided
data_env <- env(quo_get_env(dots[[1]]), .df = .df)
if(all(vars_not_in)){
#dot <- dots[!null_flag]
#if(length(u_o <- unique(dot))<length(dot)) dot<-dot[length(dot)-match(u_o,rev(dot))+1]rev(dots)[uni_non_null]
# Deduplicate repeated names: reversing then subsetting by name keeps the
# LAST expression supplied for each name (name-subsetting takes the first
# match in the reversed list). NULL entries drop out since their names are
# not in uni_non_null.
if(length(uni_non_null) < length(non_null)) dots <- rev(dots)[uni_non_null]
tidytable:::eval_quo(
.df[, ':='(!!!dots)],
new_data_mask(data_env), env = caller_env()
)
}else{
# Lazy path: one `{}` block assigning each expression to a local, then
# returning `.(...)` so a single `:=` call receives every column.
dots_expr <- parse_expr(paste0("{", paste0(sprintf("%s <- %s", non_null, dots_text[!null_flag]), collapse="\n"),"\n.(",paste0(uni_non_null, collapse=", "),")}"))
tidytable:::eval_quo(
.df[, ':='(!!uni_non_null, !!dots_expr)],
new_data_mask(data_env), env = caller_env()
)
}
} else {
# Faster with "by", since the "by" call isn't looped multiple times for each column added
# Resolve the grouping spec to character column names.
.by <- tidytable:::select_vec_chr(.df, !!.by)
data_env <- env(quo_get_env(dots[[1]]), .df = .df, .by = .by)
if(all(vars_not_in)){
#dot <- dots[!null_flag]
if(length(uni_non_null) < length(non_null)) dots <- rev(dots)[uni_non_null]
#if(length(u_o <- unique(dot))<length(dot)) dot<-dot[length(dot)-match(u_o,rev(dot))+1]
# NOTE(review): `rev(dots)[uni_non_null]` is applied here AGAIN even though
# `dots` may already have been deduplicated (and reversed) on the line
# above — unlike the ungrouped branch, which splices `dots` directly.
# Looks unintentional; confirm the intended order of the spliced
# expressions when duplicates are present.
tidytable:::eval_quo(
.df[, ':='(!!!rev(dots)[uni_non_null]), by= .by],
new_data_mask(data_env), env = caller_env()
)
}else{
# Grouped lazy path: same string-built `{}` block, evaluated per group.
dots_expr <- parse_expr(paste0("{", paste0(sprintf("%s <- %s", non_null, dots_text[!null_flag]), collapse="\n"),"\n.(",paste0(uni_non_null, collapse=", "),")}"))
tidytable:::eval_quo(
.df[, ':='(!!uni_non_null, !!dots_expr), by= .by],
new_data_mask(data_env), env = caller_env()
)
}
}
# Handle `col = NULL`: delete a column only if the LAST occurrence of that
# name among the dots is NULL (e.g. z = 1, z = NULL, z = .N keeps z).
# Deletion is always ungrouped — grouped NULL `:=` errors in data.table.
if(any(null_flag)){
null_flag <- sapply(unique(all_names[null_flag]), function(nm, null_flags=null_flag[all_names == nm]) if(any(!null_flags)) max(which(!null_flags)) < length(null_flags) else T)
if(any(null_flag)){
tidytable:::eval_quo(
.df[, ':='(!!names(null_flag[null_flag]), NULL)],
new_data_mask(data_env), env = caller_env()
)
}
}
# Trailing `[]` so the returned data.table prints immediately.
.df[]
}
## running the same benchmarks as above
#> # A tibble: 8 x 13
#> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#> 1 fast_lazy 1.75ms 2.12ms 426. 244.5KB 3.00 993 7 2.33s
#> 2 tidytable_lazy 2.1ms 2.59ms 326. 180KB 3.30 990 10 3.03s
#> 3 fast 1.07ms 1.32ms 703. 159.5KB 2.82 996 4 1.42s
#> 4 tidytable 1.71ms 2.05ms 452. 141.5KB 3.65 992 8 2.19s
#> 5 fast_lazy_by 6.97ms 7.64ms 126. 205.4KB 3.76 971 29 7.71s
#> 6 tidytable_lazy_by 10.78ms 11.81ms 82.7 260.3KB 3.80 956 44 11.56s
#> 7 fast_by 3.5ms 3.89ms 246. 198KB 3.24 987 13 4.01s
#> 8 tidytable_by 3.42ms 3.87ms 240. 96.1KB 3.41 986 14 4.1s
## same but with microbenchmark
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> fast_lazy 1.787142 2.018074 2.415464 2.172018 2.523350 11.907236 1000 bc
#> tidytable_lazy 2.132988 2.352558 2.815394 2.526966 2.877467 16.380961 1000 c
#> fast 1.065853 1.249510 1.491947 1.373019 1.570780 7.188465 1000 a
#> tidytable 1.737633 1.935615 2.294682 2.069154 2.338886 10.201034 1000 b
#> fast_lazy_by 6.675199 7.140231 8.481409 7.545871 8.640252 158.612191 1000 e
#> tidytable_lazy_by 10.528965 11.404705 13.514930 12.182135 14.033036 203.728897 1000 f
#> fast_by 3.398774 3.726079 4.388323 3.950813 4.610042 17.714789 1000 d
#> tidytable_by 3.307709 3.625800 4.324662 3.863863 4.488226 16.821681 1000 d |
Hmm I'll have to think about this one for a bit. My initial thought is that though it is faster, I'm still trying to avoid the string manipulation workflow wherever possible. |
All set: pacman::p_load(tidytable)
df <- tidytable(x = rep(1, 3), y = c("a", "a", "b"))
df %>%
mutate.(x = x + 1, z = x + 10, .by = y)
#> # tidytable [3 × 3]
#> x y z
#> <dbl> <chr> <dbl>
#> 1 2 a 12
#> 2 2 a 12
#> 3 2 b 12 |
Here is a minimal reprex:
The text was updated successfully, but these errors were encountered: