add grocery data
+ added `groceryElog` dataset
+ use `groceryElog` dataset for demos that can leverage regularity
fixes #38
mplatzer committed Sep 21, 2016
1 parent fbf7a6d commit 6bcf044
Showing 9 changed files with 179 additions and 143 deletions.
2 changes: 2 additions & 0 deletions NEWS
@@ -1,6 +1,8 @@
0.8.3.9000
- added AppVeyor for automated build tests on Windows
- bugfix for `elog2inc` and `elog2cum` for elogs where dates are missing
- added `groceryElog` dataset
- use `groceryElog` dataset for demos that can leverage regularity

0.8.3
- rewrote demos
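The NEWS entry above mentions a bugfix for `elog2inc` and `elog2cum`. A minimal sketch (not part of this commit) of calling both helpers on the new dataset; the `by = 7` argument is an assumption, mirroring the `by` argument passed to `elog2inc` in the demos below:

```
library(BTYDplus)
data("groceryElog")
cum <- elog2cum(groceryElog, by = 7)  # cumulative transaction counts per period
inc <- elog2inc(groceryElog, by = 7)  # incremental transaction counts per period
plot(cum, type = "l", xlab = "Period", ylab = "Cumulative Transactions")
```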
18 changes: 18 additions & 0 deletions R/data.R
@@ -0,0 +1,18 @@
#' Event log for customers of an online grocery store.
#'
#' These data came from an online retailer offering a broad range of grocery
#' categories. The original data set spanned four years but lacked the customers'
#' acquisition dates. Therefore, we constructed a quasi-cohort by limiting the
#' analysis to those customers who had not purchased at all during the first two
#' years and made their first purchase in the first quarter of 2006.
#' This resulted in 10483 transactions being recorded for 1525 customers during
#' a period of two years (2006-2007).
#'
#' @references Platzer, Michael, and Thomas Reutterer. 'Ticking Away the
#' Moments: Timing Regularity Helps to Better Predict Customer Activity.'
#' Marketing Science (2016).
#'
#' @format A data frame with 10483 rows and 2 variables: \describe{
#' \item{cust}{customer ID, factor vector} \item{date}{transaction date,
#' Date vector} }
"groceryElog"
8 changes: 4 additions & 4 deletions README.md
@@ -21,10 +21,10 @@ library(BTYDplus)
## Getting Started

```
demo("cdnow") # Demonstration of fitting various models to the CDNow data set
demo("mbg-cnbd-k") # Demonstration of MBG/CNBD-k model
demo("pareto-abe") # Demonstration of Abe's Pareto/NBD variant
demo("pareto-ggg") # Demonstration of Pareto/NBD (HB) & Pareto/GGG model
demo("cdnow") # Demonstration of fitting various models to the CDNow dataset
demo("mbg-cnbd-k") # Demonstration of MBG/CNBD-k model with grocery dataset
demo("pareto-abe") # Demonstration of Abe's Pareto/NBD variant with CDNow dataset
demo("pareto-ggg") # Demonstration of Pareto/NBD (HB) & Pareto/GGG model with grocery dataset
```
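The available demos can also be listed from an interactive session via base R's `demo()` interface (a sketch, not part of the committed README):

```
demo(package = "BTYDplus")               # list all demos shipped with the package
demo("mbg-cnbd-k", package = "BTYDplus") # run the MBG/CNBD-k walkthrough
```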

## Contributions
Binary file added data/groceryElog.rda
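A quick sketch (not part of the commit) for loading the new `data/groceryElog.rda` dataset and confirming the figures documented in `R/data.R` above:

```
library(BTYDplus)
data("groceryElog")
str(groceryElog)                  # 10483 obs. of 2 variables: cust, date
length(unique(groceryElog$cust))  # 1525 customers
range(groceryElog$date)           # transactions recorded in 2006-2007
```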
8 changes: 4 additions & 4 deletions demo/00Index
@@ -1,4 +1,4 @@
cdnow Demonstration of fitting various models to the CDNow data set
mbg-cnbd-k Demonstration of MBG/CNBD-k model
pareto-abe Demonstration of Abe's Pareto/NBD variant
pareto-ggg Demonstration of Pareto/NBD (HB) & Pareto/GGG model
cdnow Demonstration of fitting various models to the CDNow dataset
mbg-cnbd-k Demonstration of MBG/CNBD-k model with grocery dataset
pareto-abe Demonstration of Abe's Pareto/NBD variant with CDNow dataset
pareto-ggg Demonstration of Pareto/NBD (HB) & Pareto/GGG model with grocery dataset
4 changes: 2 additions & 2 deletions demo/cdnow.R
@@ -136,6 +136,6 @@ c("Estimated Sales" = sum(cbs$sales.mbgcnbd),
"Actual Sales" = sum(cbs$sales.star))


x <- readline("For a demo of Pareto/GGG and Pareto/NBD (HB) see `demo(\"pareto-ggg\")`.")

x <- readline("For a demo of Pareto/NBD (Abe) see `demo(\"pareto-abe\")`.")

x <- readline("For a demo of Pareto/GGG and Pareto/NBD (HB) see `demo(\"pareto-ggg\")`.")
145 changes: 83 additions & 62 deletions demo/mbg-cnbd-k.R
@@ -1,89 +1,110 @@
#' Load transaction records of 1525 grocery customers.
data("groceryElog", envir = environment())
head(groceryElog)

#' Simulate artificial MBG/CNBD-3 data.
set.seed(1234)
n <- 4000 # number of customers
T.cal <- round(runif(n, 24, 32)/4)*4 # 24-32 weeks of calibration period
T.star <- 32 # 32 weeks of hold-out period
params <- c(k = 3, # regularity in interpurchase-times (Erlang-k)
r = 0.85, alpha = 1.45, # purchase frequency lambda_i ~ Gamma(r, alpha)
a = 0.79, b = 2.42) # dropout probability p_i ~ Beta(a, b)
data <- mbgcnbd.GenerateData(n, T.cal, T.star, params, return.elog = TRUE)

cbs <- data$cbs # CBS summary - one record per customer
#' Convert from event log to customer-by-sufficient-statistic summary.
#' Split into a 52-week calibration and a 52-week holdout period.
cbs <- elog2cbs(groceryElog, T.cal = "2006-12-31", T.tot = "2007-12-30")
head(cbs)

elog <- data$elog # Event log - one row per event/purchase
head(elog)


x <- readline("Estimate regularity via Wheat/Morrison estimator (press Enter)")

(k.est <- estimateRegularity(elog))
#' -> Wheat-Morrison estimator correctly detects Erlang-3.
(k.est <- estimateRegularity(groceryElog))
#' -> Wheat-Morrison estimator detects Erlang-2.

#' Plot Timing Patterns of a few sampled customers
plotSampledTimingPatterns(groceryElog, T.cal = "2006-12-31")


x <- readline("Estimate MBG/CNBD-k model (press Enter)")

params.mbgcnbd <- mbgcnbd.EstimateParameters(cbs)
params.pnbd <- BTYD::pnbd.EstimateParameters(cbs)
rbind(`Actual` = params,
`MBG/CNBD-k` = round(params.mbgcnbd, 2),
`Pareto/NBD` = c(1, round(params.pnbd, 2)))
#' -> Underlying parameters are successfully recovered.
# Estimate MBG/CNBD-k model parameters.
(params.mbgcnbd <- mbgcnbd.EstimateParameters(cbs))
#' k=2 -> regularity also detected via MBG/CNBD-k model

# Predict transactions at customer level with MBG/CNBD-k model.
cbs$xstar.mbgcnbd <- mbgcnbd.ConditionalExpectedTransactions(
params = params.mbgcnbd,
T.star = cbs$T.star,
x = cbs$x,
t.x = cbs$t.x,
T.cal = cbs$T.cal)

x <- readline("Estimate transactions during holdout period (press Enter)")
# Estimate total transactions during holdout, based on MBG/CNBD-k model.
sum(cbs$xstar.mbgcnbd)

cbs$xstar.mbgcnbd <- mbgcnbd.ConditionalExpectedTransactions(
params = params.mbgcnbd,
T.star = cbs$T.star,
x = cbs$x,
t.x = cbs$t.x,
T.cal = cbs$T.cal)
cbs$xstar.pnbd <- BTYD::pnbd.ConditionalExpectedTransactions(
params = params.pnbd,
T.star = cbs$T.star,
x = cbs$x,
t.x = cbs$t.x,
T.cal = cbs$T.cal)
# Estimate probability of still being a customer at the end of the calibration period.
cbs$palive.mbgcnbd <- mbgcnbd.PAlive(
params = params.mbgcnbd,
x = cbs$x,
t.x = cbs$t.x,
T.cal = cbs$T.cal)

#' compare forecast accuracy to Pareto/NBD
(mae <- c(`MBG/CNBD-k` = mean(abs(cbs$x.star - cbs$xstar.mbgcnbd)),
`Pareto/NBD` = mean(abs(cbs$x.star - cbs$xstar.pnbd))))
(lift <- 1 - mae[1]/mae[2])
#' -> 11% lift in customer-level accuracy when taking regularity into account
# Estimate share of retained customers at end of calibration period.
mean(cbs$palive.mbgcnbd)


x <- readline("Estimate probability of still being alive at end of calibration (press Enter)")
x <- readline("Compare log-likelihoods of various models (press Enter)")

cbs$palive.mbgcnbd <- mbgcnbd.PAlive(
params = params.mbgcnbd,
x = cbs$x,
t.x = cbs$t.x,
T.cal = cbs$T.cal)
cbs$palive.pnbd <- BTYD::pnbd.PAlive(
params = params.pnbd,
x = cbs$x,
t.x = cbs$t.x,
T.cal = cbs$T.cal)
rbind(`Actual` = mean(cbs$alive),
`MBG/CNBD-k` = mean(cbs$palive.mbgcnbd),
`Pareto/NBD` = mean(cbs$palive.pnbd))


x <- readline("Compare aggregate fit in calibration to Pareto/NBD (press Enter)")
params.nbd <- nbd.EstimateParameters(cbs) # estimate NBD
params.pnbd <- BTYD::pnbd.EstimateParameters(cbs) # estimate Pareto/NBD
params.bgnbd <- BTYD::bgnbd.EstimateParameters(cbs) # estimate BG/NBD
params.mbgnbd <- mbgnbd.EstimateParameters(cbs) # estimate MBG/NBD

(ll <- c(`NBD` = nbd.cbs.LL(params.nbd, cbs),
`Pareto/NBD` = BTYD::pnbd.cbs.LL(params.pnbd, cbs),
`BG/NBD` = BTYD::bgnbd.cbs.LL(params.bgnbd, cbs),
`MBG/NBD` = mbgcnbd.cbs.LL(params.mbgnbd, cbs),
`MBG/CNBD-k` = mbgcnbd.cbs.LL(params.mbgcnbd, cbs)))
names(which.max(ll))
# -> MBG/CNBD-k provides best data fit according to log-likelihood


x <- readline("Compare forecast accuracies of various models (press Enter)")

cbs$xstar.nbd <- nbd.ConditionalExpectedTransactions(
params.nbd, cbs$T.star, cbs$x, cbs$T.cal)
cbs$xstar.pnbd <- BTYD::pnbd.ConditionalExpectedTransactions(
params.pnbd, cbs$T.star, cbs$x, cbs$t.x, cbs$T.cal)
cbs$xstar.bgnbd <- BTYD::bgnbd.ConditionalExpectedTransactions(
params.bgnbd, cbs$T.star, cbs$x, cbs$t.x, cbs$T.cal)
cbs$xstar.mbgnbd <- mbgcnbd.ConditionalExpectedTransactions(
params.mbgnbd, cbs$T.star, cbs$x, cbs$t.x, cbs$T.cal)

measures <- c(
"MAE" = function(a, f) mean(abs(a - f)),
"MSLE" = function(a, f) mean(((log(a + 1) - log(f + 1)))^2),
"BIAS" = function(a, f) sum(f)/sum(a) - 1)
models <- c(
"NBD" = "nbd",
"Pareto/NBD" = "pnbd",
"BG/NBD" = "bgnbd",
"MBG/NBD" = "mbgnbd",
"MBG/CNBD-k" = "mbgcnbd")

sapply(measures, function(measure) {
sapply(models, function(model) {
err <- do.call(measure, list(a = cbs$x.star, f = cbs[[paste0("xstar.", model)]]))
round(err, 3)
})
})
#' -> MBG/CNBD-k provides best customer-level forecast accuracy


x <- readline("Plot Frequency in Calibration (press Enter)")

op <- par(mfrow = c(1, 2))
nil <- mbgcnbd.PlotFrequencyInCalibration(params.mbgcnbd, cbs, censor = 7, title = "MBG/CNBD-k")
nil <- BTYD::pnbd.PlotFrequencyInCalibration(params.pnbd, cbs, censor = 7, title = "Pareto/NBD")
par(op)


x <- readline("Compare incremental transactions in holdout to Pareto/NBD (press Enter)")
x <- readline("Plot Incremental Transactions (press Enter)")

inc <- elog2inc(groceryElog)
T.tot <- max(cbs$T.cal+cbs$T.star)
op <- par(mfrow = c(1, 2))
inc <- elog2inc(elog, by = 1)
T.tot <- max(cbs$T.cal + cbs$T.star)
nil <- mbgcnbd.PlotTrackingInc(params.mbgcnbd, cbs$T.cal, T.tot, inc, title = "MBG/CNBD-k")
nil <- BTYD::pnbd.PlotTrackingInc(params.pnbd, cbs$T.cal, T.tot, inc, title = "Pareto/NBD")
nil <- mbgcnbd.PlotTrackingInc(params.mbgcnbd, cbs$T.cal, T.tot = T.tot, inc, title = "MBG/CNBD-k")
nil <- BTYD::pnbd.PlotTrackingInc(params.pnbd, cbs$T.cal, T.tot = T.tot, inc, title = "Pareto/NBD")
par(op)
109 changes: 38 additions & 71 deletions demo/pareto-ggg.R
@@ -1,95 +1,62 @@
#' Load transaction records of 1525 grocery customers.
data("groceryElog", envir = environment())
head(groceryElog)

x <- readline("Estimate Pareto/GGG for CDNow (press Enter)")

#' Load subset of CDNow data
#' Note: we only use a small set of customers and a short MCMC chain so that
#' this demo finishes in reasonable time. The main purpose of this code is to
#' demonstrate the basic workflow of running Pareto/GGG on top of your data.
cdnow <- cdnow.sample()
elog <- cdnow$elog
cbs <- cdnow$cbs
set.seed(1234)
cust.subset <- sample(cbs$cust, size = 500)
elog <- elog[elog$cust %in% cust.subset, ]
cbs <- cbs[cbs$cust %in% cust.subset, ]

#' draw Pareto/GGG parameters and report median estimates
param.draws <- pggg.mcmc.DrawParameters(
cal.cbs = cbs,
mcmc = 1500, burnin = 500, chains = 2, thin = 50,
mc.cores = 1)
round(summary(param.draws$level_2)$quantiles[, "50%"], 2)
#' Convert from event log to customer-by-sufficient-statistic summary.
#' Split into a 52-week calibration and a 52-week holdout period.
cbs <- elog2cbs(groceryElog, T.cal = "2006-12-31", T.tot = "2007-12-30")
head(cbs)

#' estimate distribution for future transactions
xstar.draws <- mcmc.DrawFutureTransactions(
cal.cbs = cbs,
draws = param.draws,
T.star = cbs$T.star)
cbs$xstar.pggg <- apply(xstar.draws, 2, mean)

#' calculate P(alive)
cbs$palive <- mcmc.PAlive(param.draws)
x <- readline("Estimate Pareto/NBD (HB) and Pareto/GGG (press Enter)")

#' calculate P(active)
cbs$pactive <- apply(xstar.draws, 2, function(x) mean(x > 0))

#mcmc.plotPActiveDiagnostic()
#mcmc.PlotTrackingInc()
#mcmc.PlotFrequencyInCalibration()


x <- readline("Estimate Pareto/GGG and Pareto/NBD (HB) for simulated data (press Enter)")

#' generate artificial Pareto/GGG data
params <- list(t = 4.5, gamma = 1.5,
r = 5, alpha = 30,
s = 0.8, beta = 12)
set.seed(1234)
data <- pggg.GenerateData(
n = 1000,
T.cal = 21,
T.star = 21,
params = params,
return.elog = TRUE)
cbs <- data$cbs
elog <- data$elog

#' estimate Pareto/NBD (HB) and Pareto/GGG
pnbd.draws <- pnbd.mcmc.DrawParameters(cbs, mcmc = 1500, burnin = 500, chains = 2, thin = 50, mc.cores = 1)
pggg.draws <- pggg.mcmc.DrawParameters(cbs, mcmc = 1500, burnin = 500, chains = 2, thin = 50, mc.cores = 1)
rbind(`Actual` = params,
`Pareto/GGG` = summary(pggg.draws$level_2)$quantiles[, "50%"],
`Pareto/NBD` = c(t=NA, gamma=NA, summary(pnbd.draws$level_2)$quantiles[, "50%"]))
#' Draw Pareto/NBD parameters and compare median estimates with Pareto/NBD
#' implementation from BTYD package.
pnbd.draws <- pnbd.mcmc.DrawParameters(cal.cbs = cbs, mc.cores = 1)

#' Draw Pareto/GGG parameters and report median estimates. Note that we only
#' use a relatively short MCMC chain so that this demo finishes reasonably fast.
pggg.draws <- pggg.mcmc.DrawParameters(cal.cbs = cbs,
mcmc = 500, burnin = 500, chains = 2, thin = 20,
mc.cores = 1)

round(rbind(`Pareto/GGG`= summary(pggg.draws$level_2)$quantiles[, "50%"],
`Pareto/NBD (HB)` = c(NA, NA, summary(pnbd.draws$level_2)$quantiles[, "50%"]),
`Pareto/NBD` = c(NA, NA, BTYD::pnbd.EstimateParameters(cbs))), 2)

#' plot estimated parameter distributions
plot(param.draws$level_2, density = TRUE, trace = FALSE)
plot(pggg.draws$level_2, density = TRUE, trace = FALSE)
pggg.plotRegularityRateHeterogeneity(pggg.draws)
# -> regularity detected in grocery dataset

#' check MCMC convergence
plot(param.draws$level_2, density = FALSE, trace = TRUE)
plot(pggg.draws$level_2, density = FALSE, trace = TRUE)
coda::effectiveSize(pggg.draws$level_2)


x <- readline("Estimate future transactions (press Enter)")

#' draw future transactions
pnbd.xstar.draws <- mcmc.DrawFutureTransactions(cbs, pnbd.draws, T.star = cbs$T.star, sample_size = 400)
pggg.xstar.draws <- mcmc.DrawFutureTransactions(cbs, pggg.draws, T.star = cbs$T.star, sample_size = 400)
xstar.pnbd.draws <- mcmc.DrawFutureTransactions(cbs, pnbd.draws, T.star = cbs$T.star, sample_size = 400)
xstar.pggg.draws <- mcmc.DrawFutureTransactions(cbs, pggg.draws, T.star = cbs$T.star, sample_size = 400)

#' calculate mean over future transaction draws for each customer
cbs$xstar.pnbd <- apply(pnbd.xstar.draws, 2, mean)
cbs$xstar.pggg <- apply(pggg.xstar.draws, 2, mean)

#' calculate P(alive)
cbs$palive.pnbd <- mcmc.PAlive(pnbd.draws)
cbs$palive.pggg <- mcmc.PAlive(pggg.draws)

#' calculate P(active)
cbs$pactive.pnbd <- mcmc.PActive(xstar.pnbd.draws)
cbs$pactive.pggg <- mcmc.PActive(xstar.pggg.draws)

#' compare forecast accuracy to Pareto/NBD
(mae <- c(`Pareto/GGG` = mean(abs(cbs$x.star - cbs$xstar.pggg)),
`Pareto/NBD` = mean(abs(cbs$x.star - cbs$xstar.pnbd))))
(lift <- 1 - mae[1]/mae[2])
#' -> 18% lift in customer-level accuracy when taking regularity into account
#' -> 9% lift in customer-level accuracy when taking regularity into account

# P(active) diagnostic plot
nil <- mcmc.plotPActiveDiagnostic(cbs, pggg.xstar.draws)

#' Compare incremental transactions in holdout to Pareto/NBD
inc <- elog2inc(elog, by = 1)
T.tot <- max(cbs$T.cal + cbs$T.star)
op <- par(mfrow = c(1, 2))
nil <- mcmc.PlotTrackingInc(pggg.draws, cbs$T.cal, T.tot, inc, title = "Pareto/GGG", xlab="Days")
nil <- mcmc.PlotTrackingInc(pnbd.draws, cbs$T.cal, T.tot, inc, title = "Pareto/NBD", xlab="Days")
par(op)
28 changes: 28 additions & 0 deletions man/groceryElog.Rd
