add grocery data
+ added `groceryElog` dataset
+ use `groceryElog` dataset for demos that can leverage regularity
fixes #38
mplatzer committed Sep 21, 2016
1 parent fbf7a6d commit 6bcf044
Showing 9 changed files with 179 additions and 143 deletions.
2 changes: 2 additions & 0 deletions NEWS
@@ -1,6 +1,8 @@
0.8.3.9000
- added AppVeyor for automated build tests on Windows
- bugfix for `elog2inc` and `elog2cum` for elogs where dates are missing
- added `groceryElog` dataset
- use `groceryElog` dataset for demos that can leverage regularity

0.8.3
- rewrote demos
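The NEWS entry above mentions a bugfix for `elog2inc` and `elog2cum`. A minimal sketch (not part of this commit) of calling both helpers on the new dataset; the `by = 7` argument is an assumption, mirroring the `by` argument passed to `elog2inc` in the demos below:

```
library(BTYDplus)
data("groceryElog")
cum <- elog2cum(groceryElog, by = 7)  # cumulative transaction counts per period
inc <- elog2inc(groceryElog, by = 7)  # incremental transaction counts per period
plot(cum, type = "l", xlab = "Period", ylab = "Cumulative Transactions")
```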
18 changes: 18 additions & 0 deletions R/data.R
@@ -0,0 +1,18 @@
#' Event log for customers of an online grocery store.
#'
#' These data came from an online retailer offering a broad range of grocery
#' categories. The original data set spanned four years but lacked the customers'
#' acquisition dates. Therefore, we constructed a quasi-cohort by limiting the
#' analysis to those customers who had not purchased at all during the first two
#' years and made their first purchase in the first quarter of 2006.
#' This resulted in 10483 transactions being recorded for 1525 customers during
#' a period of two years (2006-2007).
#'
#' @references Platzer, Michael, and Thomas Reutterer. 'Ticking Away the
#' Moments: Timing Regularity Helps to Better Predict Customer Activity.'
#' Marketing Science (2016).
#'
#' @format A data frame with 10483 rows and 2 variables: \describe{
#' \item{cust}{customer ID, factor vector} \item{date}{transaction date,
#' Date vector} }
"groceryElog"
8 changes: 4 additions & 4 deletions README.md
@@ -21,10 +21,10 @@ library(BTYDplus)
## Getting Started

```
demo("cdnow") # Demonstration of fitting various models to the CDNow data set
demo("mbg-cnbd-k") # Demonstration of MBG/CNBD-k model
demo("pareto-abe") # Demonstration of Abe's Pareto/NBD variant
demo("pareto-ggg") # Demonstration of Pareto/NBD (HB) & Pareto/GGG model
demo("cdnow") # Demonstration of fitting various models to the CDNow dataset
demo("mbg-cnbd-k") # Demonstration of MBG/CNBD-k model with grocery dataset
demo("pareto-abe") # Demonstration of Abe's Pareto/NBD variant with CDNow dataset
demo("pareto-ggg") # Demonstration of Pareto/NBD (HB) & Pareto/GGG model with grocery dataset
```
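The available demos can also be listed from an interactive session via base R's `demo()` interface (a sketch, not part of the committed README):

```
demo(package = "BTYDplus")               # list all demos shipped with the package
demo("mbg-cnbd-k", package = "BTYDplus") # run the MBG/CNBD-k walkthrough
```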

## Contributions
Binary file added data/groceryElog.rda
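A quick sketch (not part of the commit) for loading the new `data/groceryElog.rda` dataset and confirming the figures documented in `R/data.R` above:

```
library(BTYDplus)
data("groceryElog")
str(groceryElog)                  # 10483 obs. of 2 variables: cust, date
length(unique(groceryElog$cust))  # 1525 customers
range(groceryElog$date)           # transactions recorded in 2006-2007
```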
8 changes: 4 additions & 4 deletions demo/00Index
@@ -1,4 +1,4 @@
cdnow Demonstration of fitting various models to the CDNow data set
mbg-cnbd-k Demonstration of MBG/CNBD-k model
pareto-abe Demonstration of Abe's Pareto/NBD variant
pareto-ggg Demonstration of Pareto/NBD (HB) & Pareto/GGG model
cdnow Demonstration of fitting various models to the CDNow dataset
mbg-cnbd-k Demonstration of MBG/CNBD-k model with grocery dataset
pareto-abe Demonstration of Abe's Pareto/NBD variant with CDNow dataset
pareto-ggg Demonstration of Pareto/NBD (HB) & Pareto/GGG model with grocery dataset
4 changes: 2 additions & 2 deletions demo/cdnow.R
@@ -136,6 +136,6 @@ c("Estimated Sales" = sum(cbs$sales.mbgcnbd),
"Actual Sales" = sum(cbs$sales.star))


x <- readline("For a demo of Pareto/GGG and Pareto/NBD (HB) see `demo(\"pareto-ggg\")`.")

x <- readline("For a demo of Pareto/NBD (Abe) see `demo(\"pareto-abe\")`.")

x <- readline("For a demo of Pareto/GGG and Pareto/NBD (HB) see `demo(\"pareto-ggg\")`.")
145 changes: 83 additions & 62 deletions demo/mbg-cnbd-k.R
@@ -1,89 +1,110 @@
#' Load transaction records of 1525 grocery customers.
data("groceryElog", envir = environment())
head(groceryElog)

#' Simulate artificial MBG/CNBD-3 data.
set.seed(1234)
n <- 4000 # number of customers
T.cal <- round(runif(n, 24, 32)/4)*4 # 24-32 weeks of calibration period
T.star <- 32 # 32 weeks of hold-out period
params <- c(k = 3, # regularity in interpurchase-times (Erlang-k)
r = 0.85, alpha = 1.45, # purchase frequency lambda_i ~ Gamma(r, alpha)
a = 0.79, b = 2.42) # dropout probability p_i ~ Beta(a, b)
data <- mbgcnbd.GenerateData(n, T.cal, T.star, params, return.elog = TRUE)

cbs <- data$cbs # CBS summary - one record per customer
#' Convert from event log to customer-by-sufficient-statistic summary.
#' Split into a 52-week calibration and a 52-week holdout period.
cbs <- elog2cbs(groceryElog, T.cal = "2006-12-31", T.tot = "2007-12-30")
head(cbs)

elog <- data$elog # Event log - one row per event/purchase
head(elog)


x <- readline("Estimate regularity via Wheat/Morrison estimator (press Enter)")

(k.est <- estimateRegularity(elog))
#' -> Wheat-Morrison estimator correctly detects Erlang-3.
(k.est <- estimateRegularity(groceryElog))
#' -> Wheat-Morrison estimator detects Erlang-2.

#' Plot Timing Patterns of a few sampled customers
plotSampledTimingPatterns(groceryElog, T.cal = "2006-12-31")


x <- readline("Estimate MBG/CNBD-k model (press Enter)")

params.mbgcnbd <- mbgcnbd.EstimateParameters(cbs)
params.pnbd <- BTYD::pnbd.EstimateParameters(cbs)
rbind(`Actual` = params,
`MBG/CNBD-k` = round(params.mbgcnbd, 2),
`Pareto/NBD` = c(1, round(params.pnbd, 2)))
#' -> Underlying parameters are successfully recovered.
# Estimate MBG/CNBD-k model parameters.
(params.mbgcnbd <- mbgcnbd.EstimateParameters(cbs))
#' k=2 -> regularity also detected via MBG/CNBD-k model

# Predict transactions at customer level with MBG/CNBD-k model.
cbs$xstar.mbgcnbd <- mbgcnbd.ConditionalExpectedTransactions(
params = params.mbgcnbd,
T.star = cbs$T.star,
x = cbs$x,
t.x = cbs$t.x,
T.cal = cbs$T.cal)

x <- readline("Estimate transactions during holdout period (press Enter)")
# Estimate total transactions during holdout, based on MBG/CNBD-k model.
sum(cbs$xstar.mbgcnbd)

cbs$xstar.mbgcnbd <- mbgcnbd.ConditionalExpectedTransactions(
params = params.mbgcnbd,
T.star = cbs$T.star,
x = cbs$x,
t.x = cbs$t.x,
T.cal = cbs$T.cal)
cbs$xstar.pnbd <- BTYD::pnbd.ConditionalExpectedTransactions(
params = params.pnbd,
T.star = cbs$T.star,
x = cbs$x,
t.x = cbs$t.x,
T.cal = cbs$T.cal)
# Estimate probability of still being a customer at the end of the calibration period.
cbs$palive.mbgcnbd <- mbgcnbd.PAlive(
params = params.mbgcnbd,
x = cbs$x,
t.x = cbs$t.x,
T.cal = cbs$T.cal)

#' compare forecast accuracy to Pareto/NBD
(mae <- c(`MBG/CNBD-k` = mean(abs(cbs$x.star - cbs$xstar.mbgcnbd)),
`Pareto/NBD` = mean(abs(cbs$x.star - cbs$xstar.pnbd))))
(lift <- 1 - mae[1]/mae[2])
#' -> 11% lift in customer-level accuracy when taking regularity into account
# Estimate share of retained customers at end of calibration period.
mean(cbs$palive.mbgcnbd)


x <- readline("Estimate probability of still being alive at end of calibration (press Enter)")
x <- readline("Compare log-likelihoods of various models (press Enter)")

cbs$palive.mbgcnbd <- mbgcnbd.PAlive(
params = params.mbgcnbd,
x = cbs$x,
t.x = cbs$t.x,
T.cal = cbs$T.cal)
cbs$palive.pnbd <- BTYD::pnbd.PAlive(
params = params.pnbd,
x = cbs$x,
t.x = cbs$t.x,
T.cal = cbs$T.cal)
rbind(`Actual` = mean(cbs$alive),
`MBG/CNBD-k` = mean(cbs$palive.mbgcnbd),
`Pareto/NBD` = mean(cbs$palive.pnbd))


x <- readline("Compare aggregate fit in calibration to Pareto/NBD (press Enter)")
params.nbd <- nbd.EstimateParameters(cbs) # estimate NBD
params.pnbd <- BTYD::pnbd.EstimateParameters(cbs) # estimate Pareto/NBD
params.bgnbd <- BTYD::bgnbd.EstimateParameters(cbs) # estimate BG/NBD
params.mbgnbd <- mbgnbd.EstimateParameters(cbs) # estimate MBG/NBD

(ll <- c(`NBD` = nbd.cbs.LL(params.nbd, cbs),
`Pareto/NBD` = BTYD::pnbd.cbs.LL(params.pnbd, cbs),
`BG/NBD` = BTYD::bgnbd.cbs.LL(params.bgnbd, cbs),
`MBG/NBD` = mbgcnbd.cbs.LL(params.mbgnbd, cbs),
`MBG/CNBD-k` = mbgcnbd.cbs.LL(params.mbgcnbd, cbs)))
names(which.max(ll))
# -> MBG/CNBD-k provides best data fit according to log-likelihood


x <- readline("Compare forecast accuracies of various models (press Enter)")

cbs$xstar.nbd <- nbd.ConditionalExpectedTransactions(
params.nbd, cbs$T.star, cbs$x, cbs$T.cal)
cbs$xstar.pnbd <- BTYD::pnbd.ConditionalExpectedTransactions(
params.pnbd, cbs$T.star, cbs$x, cbs$t.x, cbs$T.cal)
cbs$xstar.bgnbd <- BTYD::bgnbd.ConditionalExpectedTransactions(
params.bgnbd, cbs$T.star, cbs$x, cbs$t.x, cbs$T.cal)
cbs$xstar.mbgnbd <- mbgcnbd.ConditionalExpectedTransactions(
params.mbgnbd, cbs$T.star, cbs$x, cbs$t.x, cbs$T.cal)

measures <- c(
"MAE" = function(a, f) mean(abs(a - f)),
"MSLE" = function(a, f) mean(((log(a + 1) - log(f + 1)))^2),
"BIAS" = function(a, f) sum(f)/sum(a) - 1)
models <- c(
"NBD" = "nbd",
"Pareto/NBD" = "pnbd",
"BG/NBD" = "bgnbd",
"MBG/NBD" = "mbgnbd",
"MBG/CNBD-k" = "mbgcnbd")

sapply(measures, function(measure) {
sapply(models, function(model) {
err <- do.call(measure, list(a = cbs$x.star, f = cbs[[paste0("xstar.", model)]]))
round(err, 3)
})
})
#' -> MBG/CNBD-k provides best customer-level forecast accuracy


x <- readline("Plot Frequency in Calibration (press Enter)")

op <- par(mfrow = c(1, 2))
nil <- mbgcnbd.PlotFrequencyInCalibration(params.mbgcnbd, cbs, censor = 7, title = "MBG/CNBD-k")
nil <- BTYD::pnbd.PlotFrequencyInCalibration(params.pnbd, cbs, censor = 7, title = "Pareto/NBD")
par(op)


x <- readline("Compare incremental transactions in holdout to Pareto/NBD (press Enter)")
x <- readline("Plot Incremental Transactions (press Enter)")

inc <- elog2inc(groceryElog)
T.tot <- max(cbs$T.cal+cbs$T.star)
op <- par(mfrow = c(1, 2))
inc <- elog2inc(elog, by = 1)
T.tot <- max(cbs$T.cal + cbs$T.star)
nil <- mbgcnbd.PlotTrackingInc(params.mbgcnbd, cbs$T.cal, T.tot, inc, title = "MBG/CNBD-k")
nil <- BTYD::pnbd.PlotTrackingInc(params.pnbd, cbs$T.cal, T.tot, inc, title = "Pareto/NBD")
nil <- mbgcnbd.PlotTrackingInc(params.mbgcnbd, cbs$T.cal, T.tot = T.tot, inc, title = "MBG/CNBD-k")
nil <- BTYD::pnbd.PlotTrackingInc(params.pnbd, cbs$T.cal, T.tot = T.tot, inc, title = "Pareto/NBD")
par(op)
109 changes: 38 additions & 71 deletions demo/pareto-ggg.R
@@ -1,95 +1,62 @@
#' Load transaction records of 1525 grocery customers.
data("groceryElog", envir = environment())
head(groceryElog)

x <- readline("Estimate Pareto/GGG for CDNow (press Enter)")

#' Load subset of CDNow data
#' Note: we only use a small set of customers and a short MCMC chain so that
#' this demo finishes in reasonable time. The main purpose of this code is to
#' demonstrate the basic workflow of running Pareto/GGG on top of your data.
cdnow <- cdnow.sample()
elog <- cdnow$elog
cbs <- cdnow$cbs
set.seed(1234)
cust.subset <- sample(cbs$cust, size = 500)
elog <- elog[elog$cust %in% cust.subset, ]
cbs <- cbs[cbs$cust %in% cust.subset, ]

#' draw Pareto/GGG parameters and report median estimates
param.draws <- pggg.mcmc.DrawParameters(
cal.cbs = cbs,
mcmc = 1500, burnin = 500, chains = 2, thin = 50,
mc.cores = 1)
round(summary(param.draws$level_2)$quantiles[, "50%"], 2)
#' Convert from event log to customer-by-sufficient-statistic summary.
#' Split into a 52-week calibration and a 52-week holdout period.
cbs <- elog2cbs(groceryElog, T.cal = "2006-12-31", T.tot = "2007-12-30")
head(cbs)

#' estimate distribution for future transactions
xstar.draws <- mcmc.DrawFutureTransactions(
cal.cbs = cbs,
draws = param.draws,
T.star = cbs$T.star)
cbs$xstar.pggg <- apply(xstar.draws, 2, mean)

#' calculate P(alive)
cbs$palive <- mcmc.PAlive(param.draws)
x <- readline("Estimate Pareto/NBD (HB) and Pareto/GGG (press Enter)")

#' calculate P(active)
cbs$pactive <- apply(xstar.draws, 2, function(x) mean(x > 0))

#mcmc.plotPActiveDiagnostic()
#mcmc.PlotTrackingInc()
#mcmc.PlotFrequencyInCalibration()


x <- readline("Estimate Pareto/GGG and Pareto/NBD (HB) for simulated data (press Enter)")

#' generate artificial Pareto/GGG data
params <- list(t = 4.5, gamma = 1.5,
r = 5, alpha = 30,
s = 0.8, beta = 12)
set.seed(1234)
data <- pggg.GenerateData(
n = 1000,
T.cal = 21,
T.star = 21,
params = params,
return.elog = TRUE)
cbs <- data$cbs
elog <- data$elog

#' estimate Pareto/NBD (HB) and Pareto/GGG
pnbd.draws <- pnbd.mcmc.DrawParameters(cbs, mcmc = 1500, burnin = 500, chains = 2, thin = 50, mc.cores = 1)
pggg.draws <- pggg.mcmc.DrawParameters(cbs, mcmc = 1500, burnin = 500, chains = 2, thin = 50, mc.cores = 1)
rbind(`Actual` = params,
`Pareto/GGG` = summary(pggg.draws$level_2)$quantiles[, "50%"],
`Pareto/NBD` = c(t=NA, gamma=NA, summary(pnbd.draws$level_2)$quantiles[, "50%"]))
#' Draw Pareto/NBD parameters and compare median estimates with Pareto/NBD
#' implementation from BTYD package.
pnbd.draws <- pnbd.mcmc.DrawParameters(cal.cbs = cbs, mc.cores = 1)

#' Draw Pareto/GGG parameters and report median estimates. Note that we only
#' use a relatively short MCMC chain so that this demo finishes reasonably fast.
pggg.draws <- pggg.mcmc.DrawParameters(cal.cbs = cbs,
mcmc = 500, burnin = 500, chains = 2, thin = 20,
mc.cores = 1)

round(rbind(`Pareto/GGG`= summary(pggg.draws$level_2)$quantiles[, "50%"],
`Pareto/NBD (HB)` = c(NA, NA, summary(pnbd.draws$level_2)$quantiles[, "50%"]),
`Pareto/NBD` = c(NA, NA, BTYD::pnbd.EstimateParameters(cbs))), 2)

#' plot estimated parameter distributions
plot(param.draws$level_2, density = TRUE, trace = FALSE)
plot(pggg.draws$level_2, density = TRUE, trace = FALSE)
pggg.plotRegularityRateHeterogeneity(pggg.draws)
# -> regularity detected in grocery dataset

#' check MCMC convergence
plot(param.draws$level_2, density = FALSE, trace = TRUE)
plot(pggg.draws$level_2, density = FALSE, trace = TRUE)
coda::effectiveSize(pggg.draws$level_2)


x <- readline("Estimate future transactions (press Enter)")

#' draw future transactions
pnbd.xstar.draws <- mcmc.DrawFutureTransactions(cbs, pnbd.draws, T.star = cbs$T.star, sample_size = 400)
pggg.xstar.draws <- mcmc.DrawFutureTransactions(cbs, pggg.draws, T.star = cbs$T.star, sample_size = 400)
xstar.pnbd.draws <- mcmc.DrawFutureTransactions(cbs, pnbd.draws, T.star = cbs$T.star, sample_size = 400)
xstar.pggg.draws <- mcmc.DrawFutureTransactions(cbs, pggg.draws, T.star = cbs$T.star, sample_size = 400)

#' calculate mean over future transaction draws for each customer
cbs$xstar.pnbd <- apply(pnbd.xstar.draws, 2, mean)
cbs$xstar.pggg <- apply(pggg.xstar.draws, 2, mean)

#' calculate P(alive)
cbs$palive.pnbd <- mcmc.PAlive(pnbd.draws)
cbs$palive.pggg <- mcmc.PAlive(pggg.draws)

#' calculate P(active)
cbs$pactive.pnbd <- mcmc.PActive(xstar.pnbd.draws)
cbs$pactive.pggg <- mcmc.PActive(xstar.pggg.draws)

#' compare forecast accuracy to Pareto/NBD
(mae <- c(`Pareto/GGG` = mean(abs(cbs$x.star - cbs$xstar.pggg)),
`Pareto/NBD` = mean(abs(cbs$x.star - cbs$xstar.pnbd))))
(lift <- 1 - mae[1]/mae[2])
#' -> 18% lift in customer-level accuracy when taking regularity into account
#' -> 9% lift in customer-level accuracy when taking regularity into account

# P(active) diagnostic plot
nil <- mcmc.plotPActiveDiagnostic(cbs, pggg.xstar.draws)

#' Compare incremental transactions in holdout to Pareto/NBD
inc <- elog2inc(elog, by = 1)
T.tot <- max(cbs$T.cal + cbs$T.star)
op <- par(mfrow = c(1, 2))
nil <- mcmc.PlotTrackingInc(pggg.draws, cbs$T.cal, T.tot, inc, title = "Pareto/GGG", xlab="Days")
nil <- mcmc.PlotTrackingInc(pnbd.draws, cbs$T.cal, T.tot, inc, title = "Pareto/NBD", xlab="Days")
par(op)
28 changes: 28 additions & 0 deletions man/groceryElog.Rd
