Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
+ added `groceryElog` dataset + use `groceryElog` dataset for demos that can leverage regularity fixes #38
- Loading branch information
Showing
9 changed files
with
179 additions
and
143 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
#' Event log for customers of an online grocery store. | ||
#' | ||
#' These data came from an online retailer offering a broad range of grocery | ||
#' categories. The original data set spans four years, but lacked the customers' | ||
#' acquisition date. Therefore, we constructed a quasi cohort by limiting the | ||
#' provided data analysis to those customers who haven't purchased at all in the | ||
#' first two years, and had their first purchase in the first quarter of 2006. | ||
#' This resulted in 10483 transactions being recorded for 1525 customers during | ||
#' a period of two years (2006-2007). | ||
#' | ||
#' @references Platzer, Michael, and Thomas Reutterer. 'Ticking Away the | ||
#' Moments: Timing Regularity Helps to Better Predict Customer Activity.' | ||
#' Marketing Science (2016). | ||
#' | ||
#' @format A data frame with 10483 rows and 2 variables: \describe{ | ||
#' \item{cust}{customer ID, factor vector} \item{date}{transaction date, | ||
#' Date vector} } | ||
"groceryElog" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
cdnow Demonstration of fitting various models to the CDNow data set | ||
mbg-cnbd-k Demonstration of MBG/CNBD-k model | ||
pareto-abe Demonstration of Abe's Pareto/NBD variant | ||
pareto-ggg Demonstration of Pareto/NBD (HB) & Pareto/GGG model | ||
cdnow Demonstration of fitting various models to the CDNow dataset | ||
mbg-cnbd-k Demonstration of MBG/CNBD-k model with grocery dataset | ||
pareto-abe Demonstration of Abe's Pareto/NBD variant with CDNow dataset | ||
pareto-ggg Demonstration of Pareto/NBD (HB) & Pareto/GGG model with grocery dataset |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,89 +1,110 @@ | ||
#' Load transaction records of 1525 grocery customers. | ||
data("groceryElog", envir = environment()) | ||
head(groceryElog) | ||
|
||
#' Simulate artificial MBG/CNBD-3 data. | ||
set.seed(1234) | ||
n <- 4000 # number of customers | ||
T.cal <- round(runif(n, 24, 32)/4)*4 # 24-32 weeks of calibration period | ||
T.star <- 32 # 32 weeks of hold-out period | ||
params <- c(k = 3, # regularity in interpurchase-times (Erlang-k) | ||
r = 0.85, alpha = 1.45, # purchase frequency lambda_i ~ Gamma(r, alpha) | ||
a = 0.79, b = 2.42) # dropout probability p_i ~ Beta(a, b) | ||
data <- mbgcnbd.GenerateData(n, T.cal, T.star, params, return.elog = TRUE) | ||
|
||
cbs <- data$cbs # CBS summary - one record per customer | ||
#' Convert from event log to customer-by-sufficient-statistic summary. | ||
#' Split into 52 weeks calibration, and 52 weeks holdout period. | ||
cbs <- elog2cbs(groceryElog, T.cal = "2006-12-31", T.tot = "2007-12-30") | ||
head(cbs) | ||
|
||
elog <- data$elog # Event log - one row per event/purchase | ||
head(elog) | ||
|
||
|
||
x <- readline("Estimate regularity via Wheat/Morrison estimator (press Enter)") | ||
|
||
(k.est <- estimateRegularity(elog)) | ||
#' -> Wheat-Morrison estimator correctly detects Erlang-3. | ||
(k.est <- estimateRegularity(groceryElog)) | ||
#' -> Wheat-Morrison estimator detects Erlang-2. | ||
|
||
#' Plot Timing Patterns of a few sampled customers | ||
plotSampledTimingPatterns(groceryElog, T.cal = "2006-12-31") | ||
|
||
|
||
x <- readline("Estimate MBG/CNBD-k model (press Enter)") | ||
|
||
params.mbgcnbd <- mbgcnbd.EstimateParameters(cbs) | ||
params.pnbd <- BTYD::pnbd.EstimateParameters(cbs) | ||
rbind(`Actual` = params, | ||
`MBG/CNBD-k` = round(params.mbgcnbd, 2), | ||
`Pareto/NBD` = c(1, round(params.pnbd, 2))) | ||
#' -> Underlying parameters are successfully recovered. | ||
# Estimate MBG/CNBD-k model parameters. | ||
(params.mbgcnbd <- mbgcnbd.EstimateParameters(cbs)) | ||
#' k=2 -> regularity also detected via MBG/CNBD-k model | ||
|
||
# Predict transactions at customer level with MBG/CNBD-k model. | ||
cbs$xstar.mbgcnbd <- mbgcnbd.ConditionalExpectedTransactions( | ||
params = params.mbgcnbd, | ||
T.star = cbs$T.star, | ||
x = cbs$x, | ||
t.x = cbs$t.x, | ||
T.cal = cbs$T.cal) | ||
|
||
x <- readline("Estimate transactions during holdout period (press Enter)") | ||
# Estimate total transactions during holdout, based on MBG/CNBD-k model. | ||
sum(cbs$xstar.mbgcnbd) | ||
|
||
cbs$xstar.mbgcnbd <- mbgcnbd.ConditionalExpectedTransactions( | ||
params = params.mbgcnbd, | ||
T.star = cbs$T.star, | ||
x = cbs$x, | ||
t.x = cbs$t.x, | ||
T.cal = cbs$T.cal) | ||
cbs$xstar.pnbd <- BTYD::pnbd.ConditionalExpectedTransactions( | ||
params = params.pnbd, | ||
T.star = cbs$T.star, | ||
x = cbs$x, | ||
t.x = cbs$t.x, | ||
T.cal = cbs$T.cal) | ||
# Estimate probabilty of being still a customer at end of calibration period. | ||
cbs$palive.mbgcnbd <- mbgcnbd.PAlive( | ||
params = params.mbgcnbd, | ||
x = cbs$x, | ||
t.x = cbs$t.x, | ||
T.cal = cbs$T.cal) | ||
|
||
#' compare forecast accuracy to Pareto/NBD | ||
(mae <- c(`MBG/CNBD-k` = mean(abs(cbs$x.star - cbs$xstar.mbgcnbd)), | ||
`Pareto/NBD` = mean(abs(cbs$x.star - cbs$xstar.pnbd)))) | ||
(lift <- 1 - mae[1]/mae[2]) | ||
#' -> 11% lift in customer-level accuracy when taking regularity into account | ||
# Estimate share of retained customers at end of calibration period. | ||
mean(cbs$palive.mbgcnbd) | ||
|
||
|
||
x <- readline("Estimate probabilty of being still alive at end of calibration (press Enter)") | ||
x <- readline("Compare log-likelihoods of various models (press Enter)") | ||
|
||
cbs$palive.mbgcnbd <- mbgcnbd.PAlive( | ||
params = params.mbgcnbd, | ||
x = cbs$x, | ||
t.x = cbs$t.x, | ||
T.cal = cbs$T.cal) | ||
cbs$palive.pnbd <- BTYD::pnbd.PAlive( | ||
params = params.pnbd, | ||
x = cbs$x, | ||
t.x = cbs$t.x, | ||
T.cal = cbs$T.cal) | ||
rbind(`Actual` = mean(cbs$alive), | ||
`MBG/CNBD-k` = mean(cbs$palive.mbgcnbd), | ||
`Pareto/NBD` = mean(cbs$palive.pnbd)) | ||
|
||
|
||
x <- readline("Compare aggregate fit in calibration to Pareto/NBD (press Enter)") | ||
params.nbd <- nbd.EstimateParameters(cbs) # estimate NBD | ||
params.pnbd <- BTYD::pnbd.EstimateParameters(cbs) # estimate Pareto/NBD | ||
params.bgnbd <- BTYD::bgnbd.EstimateParameters(cbs) # estimate BG/NBD | ||
params.mbgnbd <- mbgnbd.EstimateParameters(cbs) # estimate MBG/NBD | ||
|
||
(ll <- c(`NBD` = nbd.cbs.LL(params.nbd, cbs), | ||
`Pareto/NBD` = BTYD::pnbd.cbs.LL(params.pnbd, cbs), | ||
`BG/NBD` = BTYD::bgnbd.cbs.LL(params.bgnbd, cbs), | ||
`MBG/NBD` = mbgcnbd.cbs.LL(params.mbgnbd, cbs), | ||
`MBG/CNBD-k` = mbgcnbd.cbs.LL(params.mbgcnbd, cbs))) | ||
names(which.max(ll)) | ||
# -> MBG/CNBD-k provides best data fit according to log-likelihood | ||
|
||
|
||
x <- readline("Compare forecast accuracies of various models (press Enter)") | ||
|
||
cbs$xstar.nbd <- nbd.ConditionalExpectedTransactions( | ||
params.nbd, cbs$T.star, cbs$x, cbs$T.cal) | ||
cbs$xstar.pnbd <- BTYD::pnbd.ConditionalExpectedTransactions( | ||
params.pnbd, cbs$T.star, cbs$x, cbs$t.x, cbs$T.cal) | ||
cbs$xstar.bgnbd <- BTYD::bgnbd.ConditionalExpectedTransactions( | ||
params.bgnbd, cbs$T.star, cbs$x, cbs$t.x, cbs$T.cal) | ||
cbs$xstar.mbgnbd <- mbgcnbd.ConditionalExpectedTransactions( | ||
params.mbgnbd, cbs$T.star, cbs$x, cbs$t.x, cbs$T.cal) | ||
|
||
measures <- c( | ||
"MAE" = function(a, f) mean(abs(a - f)), | ||
"MSLE" = function(a, f) mean(((log(a + 1) - log(f + 1)))^2), | ||
"BIAS" = function(a, f) sum(f)/sum(a) - 1) | ||
models <- c( | ||
"NBD" = "nbd", | ||
"Pareto/NBD" = "pnbd", | ||
"BG/NBD" = "bgnbd", | ||
"MBG/NBD" = "mbgnbd", | ||
"MBG/CNBD-k" = "mbgcnbd") | ||
|
||
sapply(measures, function(measure) { | ||
sapply(models, function(model) { | ||
err <- do.call(measure, list(a = cbs$x.star, f = cbs[[paste0("xstar.", model)]])) | ||
round(err, 3) | ||
}) | ||
}) | ||
#' -> MBG/CNBD-k provides best customer-level forecast accuracy | ||
|
||
|
||
x <- readline("Plot Frequency in Calibration (press Enter)") | ||
|
||
op <- par(mfrow = c(1, 2)) | ||
nil <- mbgcnbd.PlotFrequencyInCalibration(params.mbgcnbd, cbs, censor = 7, title = "MBG/CNBD-k") | ||
nil <- BTYD::pnbd.PlotFrequencyInCalibration(params.pnbd, cbs, censor = 7, title = "Pareto/NBD") | ||
par(op) | ||
|
||
|
||
x <- readline("Compare incremental transactions in holdout to Pareto/NBD (press Enter)") | ||
x <- readline("Plot Incremental Transactions (press Enter)") | ||
|
||
inc <- elog2inc(groceryElog) | ||
T.tot <- max(cbs$T.cal+cbs$T.star) | ||
op <- par(mfrow = c(1, 2)) | ||
inc <- elog2inc(elog, by = 1) | ||
T.tot <- max(cbs$T.cal + cbs$T.star) | ||
nil <- mbgcnbd.PlotTrackingInc(params.mbgcnbd, cbs$T.cal, T.tot, inc, title = "MBG/CNBD-k") | ||
nil <- BTYD::pnbd.PlotTrackingInc(params.pnbd, cbs$T.cal, T.tot, inc, title = "Pareto/NBD") | ||
nil <- mbgcnbd.PlotTrackingInc(params.mbgcnbd, cbs$T.cal, T.tot = T.tot, inc, title = "MBG/CNBD-k") | ||
nil <- BTYD::pnbd.PlotTrackingInc(params.pnbd, cbs$T.cal, T.tot = T.tot, inc, title = "Pareto/NBD") | ||
par(op) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,95 +1,62 @@ | ||
#' Load transaction records of 1525 grocery customers. | ||
data("groceryElog", envir = environment()) | ||
head(groceryElog) | ||
|
||
x <- readline("Estimate Pareto/GGG for for CDNow (press Enter)") | ||
|
||
#' Load subset of CDNow data | ||
#' Note: we only use a small set of customers, and a short MCMC chain to keep | ||
#' this demo finish in reasonable time. Main purpose of this code is to | ||
#' demonstrate the basic workflow of running Pareto/GGG on top of your data. | ||
cdnow <- cdnow.sample() | ||
elog <- cdnow$elog | ||
cbs <- cdnow$cbs | ||
set.seed(1234) | ||
cust.subset <- sample(cbs$cust, size = 500) | ||
elog <- elog[elog$cust %in% cust.subset, ] | ||
cbs <- cbs[cbs$cust %in% cust.subset, ] | ||
|
||
#' draw Pareto/GGG parameters and report median estimates | ||
param.draws <- pggg.mcmc.DrawParameters( | ||
cal.cbs = cbs, | ||
mcmc = 1500, burnin = 500, chains = 2, thin = 50, | ||
mc.cores = 1) | ||
round(summary(param.draws$level_2)$quantiles[, "50%"], 2) | ||
#' Convert from event log to customer-by-sufficient-statistic summary. | ||
#' Split into 52 weeks calibration, and 52 weeks holdout period. | ||
cbs <- elog2cbs(groceryElog, T.cal = "2006-12-31", T.tot = "2007-12-30") | ||
head(cbs) | ||
|
||
#' estimate distribution for future transactions | ||
xstar.draws <- mcmc.DrawFutureTransactions( | ||
cal.cbs = cbs, | ||
draws = param.draws, | ||
T.star = cbs$T.star) | ||
cbs$xstar.pggg <- apply(xstar.draws, 2, mean) | ||
|
||
#' calculate P(alive) | ||
cbs$palive <- mcmc.PAlive(param.draws) | ||
x <- readline("Estimate Pareto/NBD (HB) and Pareto/GGG (press Enter)") | ||
|
||
#' calculate P(active) | ||
cbs$pactive <- apply(xstar.draws, 2, function(x) mean(x > 0)) | ||
|
||
#mcmc.plotPActiveDiagnostic() | ||
#mcmc.PlotTrackingInc() | ||
#mcmc.PlotFrequencyInCalibration() | ||
|
||
|
||
x <- readline("Estimate Pareto/GGG and Pareto/NBD (HB) for simulated data (press Enter)") | ||
|
||
#' generate artificial Pareto/GGG data | ||
params <- list(t = 4.5, gamma = 1.5, | ||
r = 5, alpha = 30, | ||
s = 0.8, beta = 12) | ||
set.seed(1234) | ||
data <- pggg.GenerateData( | ||
n = 1000, | ||
T.cal = 21, | ||
T.star = 21, | ||
params = params, | ||
return.elog = TRUE) | ||
cbs <- data$cbs | ||
elog <- data$elog | ||
|
||
#' estimate Pareto/NBD (HB) and Pareto/GGG | ||
pnbd.draws <- pnbd.mcmc.DrawParameters(cbs, mcmc = 1500, burnin = 500, chains = 2, thin = 50, mc.cores = 1) | ||
pggg.draws <- pggg.mcmc.DrawParameters(cbs, mcmc = 1500, burnin = 500, chains = 2, thin = 50, mc.cores = 1) | ||
rbind(`Actual` = params, | ||
`Pareto/GGG` = summary(pggg.draws$level_2)$quantiles[, "50%"], | ||
`Pareto/NBD` = c(t=NA, gamma=NA, summary(pnbd.draws$level_2)$quantiles[, "50%"])) | ||
#' Draw Pareto/NBD parameters and compare median estimates with Pareto/NBD | ||
#' implementation from BTYD package. | ||
pnbd.draws <- pnbd.mcmc.DrawParameters(cal.cbs = cbs, mc.cores = 1) | ||
|
||
#' Draw Pareto/GGG parameters and report median estimates. Note, that we only | ||
#' use a relatively short MCMC chain to keep this demo finish reasonably fast. | ||
pggg.draws <- pggg.mcmc.DrawParameters(cal.cbs = cbs, | ||
mcmc = 500, burnin = 500, chains = 2, thin = 20, | ||
mc.cores = 1) | ||
|
||
round(rbind(`Pareto/GGG`= summary(pggg.draws$level_2)$quantiles[, "50%"], | ||
`Pareto/NBD (HB)` = c(NA, NA, summary(pnbd.draws$level_2)$quantiles[, "50%"]), | ||
`Pareto/NBD` = c(NA, NA, BTYD::pnbd.EstimateParameters(cbs))), 2) | ||
|
||
#' plot estimated parameter distributions | ||
plot(param.draws$level_2, density = TRUE, trace = FALSE) | ||
plot(pggg.draws$level_2, density = TRUE, trace = FALSE) | ||
pggg.plotRegularityRateHeterogeneity(pggg.draws) | ||
# -> regularity detected in grocery dataset | ||
|
||
#' check MCMC convergence | ||
plot(param.draws$level_2, density = FALSE, trace = TRUE) | ||
plot(pggg.draws$level_2, density = FALSE, trace = TRUE) | ||
coda::effectiveSize(pggg.draws$level_2) | ||
|
||
|
||
x <- readline("Estimate future transactions (press Enter)") | ||
|
||
#' draw future transaction | ||
pnbd.xstar.draws <- mcmc.DrawFutureTransactions(cbs, pnbd.draws, T.star = cbs$T.star, sample_size = 400) | ||
pggg.xstar.draws <- mcmc.DrawFutureTransactions(cbs, pggg.draws, T.star = cbs$T.star, sample_size = 400) | ||
xstar.pnbd.draws <- mcmc.DrawFutureTransactions(cbs, pnbd.draws, T.star = cbs$T.star, sample_size = 400) | ||
xstar.pggg.draws <- mcmc.DrawFutureTransactions(cbs, pggg.draws, T.star = cbs$T.star, sample_size = 400) | ||
|
||
#' calculate mean over future transaction draws for each customer | ||
cbs$xstar.pnbd <- apply(pnbd.xstar.draws, 2, mean) | ||
cbs$xstar.pggg <- apply(pggg.xstar.draws, 2, mean) | ||
|
||
#' calculate P(alive) | ||
cbs$palive.pnbd <- mcmc.PAlive(pnbd.draws) | ||
cbs$palive.pggg <- mcmc.PAlive(pggg.draws) | ||
|
||
#' calculate P(active) | ||
cbs$pactive.pnbd <- mcmc.PActive(xstar.pnbd.draws) | ||
cbs$pactive.pggg <- mcmc.PActive(xstar.pggg.draws) | ||
|
||
#' compare forecast accuracy to Pareto/NBD | ||
(mae <- c(`Pareto/GGG` = mean(abs(cbs$x.star - cbs$xstar.pggg)), | ||
`Pareto/NBD` = mean(abs(cbs$x.star - cbs$xstar.pnbd)))) | ||
(lift <- 1 - mae[1]/mae[2]) | ||
#' -> 18% lift in customer-level accuracy when taking regularity into account | ||
#' -> 9% lift in customer-level accuracy when taking regularity into account | ||
|
||
# P(active) diagnostic plot | ||
nil <- mcmc.plotPActiveDiagnostic(cbs, pggg.xstar.draws) | ||
|
||
#' Compare incremental transactions in holdout to Pareto/NBD | ||
inc <- elog2inc(elog, by = 1) | ||
T.tot <- max(cbs$T.cal + cbs$T.star) | ||
op <- par(mfrow = c(1, 2)) | ||
nil <- mcmc.PlotTrackingInc(pggg.draws, cbs$T.cal, T.tot, inc, title = "Pareto/GGG", xlab="Days") | ||
nil <- mcmc.PlotTrackingInc(pnbd.draws, cbs$T.cal, T.tot, inc, title = "Pareto/NBD", xlab="Days") | ||
par(op) |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.