R/pareto-nbd-mcmc.R


#' Pareto/NBD (HB) Parameter Draws
#'
#' Returns draws from the posterior distributions of the Pareto/NBD (HB)
#' parameters, on cohort as well as on customer level.
#'
#' See \code{demo('pareto-ggg')} for how to apply this model.
#'
#' @param cal.cbs Calibration period customer-by-sufficient-statistic (CBS)
#'   data.frame. It must contain a row for each customer, and columns \code{x}
#'   for frequency, \code{t.x} for recency and \code{T.cal} for the total time
#'   observed. A correct format can be easily generated based on the complete
#'   event log of a customer cohort with \code{\link{elog2cbs}}.
#' @param mcmc Number of MCMC steps.
#' @param burnin Number of initial MCMC steps which are discarded.
#' @param thin Only every \code{thin}-th MCMC step will be returned.
#' @param chains Number of MCMC chains to be run.
#' @param mc.cores Number of cores to use in parallel (Unix only). Defaults to \code{min(chains, detectCores())}.
#' @param use_data_augmentation deprecated
#' @param param_init List of start values for cohort-level parameters.
#' @param trace Print logging statement every \code{trace}-th iteration. Not available for \code{mc.cores > 1}.
#' @return 2-element list:
#' \itemize{
#'  \item{\code{level_1 }}{list of \code{\link{mcmc.list}}s, one for each customer, with draws for customer-level parameters \code{lambda}, \code{tau}, \code{z}, \code{mu}}
#'  \item{\code{level_2 }}{\code{\link{mcmc.list}}, with draws for cohort-level parameters \code{r}, \code{alpha}, \code{s}, \code{beta}}
#' }
#' @export
#' @seealso \code{\link{pnbd.GenerateData} } \code{\link{mcmc.DrawFutureTransactions} } \code{\link{mcmc.PAlive} }
#' @references Ma, S. H., & Liu, J. L. (2007, August). The MCMC approach for
#'   solving the Pareto/NBD model and possible extensions. In Third
#'   international conference on natural computation (ICNC 2007) (Vol. 2, pp.
#'   505-512). IEEE. \doi{10.1109/ICNC.2007.728}
#' @references Abe, M. (2009). "Counting your customers" one by one: A
#'   hierarchical Bayes extension to the Pareto/NBD model. Marketing Science,
#'   28(3), 541-553. \doi{10.1287/mksc.1090.0502}
#' @examples
#' data("groceryElog")
#' cbs <- elog2cbs(groceryElog, T.cal = "2006-12-31")
#' param.draws <- pnbd.mcmc.DrawParameters(cbs,
#'   mcmc = 100, burnin = 50, thin = 10, chains = 1) # short MCMC to run demo fast
#'
#' # cohort-level parameter draws
#' as.matrix(param.draws$level_2)
#' # customer-level parameter draws for customer with ID '4'
#' as.matrix(param.draws$level_1[["4"]])
#'
#' # estimate future transactions
#' xstar.draws <- mcmc.DrawFutureTransactions(cbs, param.draws, cbs$T.star)
#' xstar.est <- apply(xstar.draws, 2, mean)
#' head(xstar.est)
pnbd.mcmc.DrawParameters <- function(cal.cbs, mcmc = 2500, burnin = 500, thin = 50, chains = 2, mc.cores = NULL,
  use_data_augmentation = TRUE, param_init = NULL, trace = 100) {

  # ** methods to sample heterogeneity parameters {r, alpha, s, beta} **

  draw_gamma_params <- function(type, level_1, level_2, hyper_prior) {
    if (type == "lambda") {
      x <- level_1["lambda", ]
      cur_params <- c(level_2["r"], level_2["alpha"])
      hyper <- unlist(hyper_prior[c("r_1", "r_2", "alpha_1", "alpha_2")])
    } else if (type == "mu") {
      x <- level_1["mu", ]
      cur_params <- c(level_2["s"], level_2["beta"])
      hyper <- unlist(hyper_prior[c("s_1", "s_2", "beta_1", "beta_2")])
    }
    slice_sample_gamma_parameters(x, cur_params, hyper, steps = 50, w = 0.1)
  }

  # ** methods to sample individual-level parameters (with data augmentation) **

  draw_lambda <- function(data, level_1, level_2) {
    N <- nrow(data)
    x <- data$x
    T.cal <- data$T.cal
    tau <- level_1["tau", ]
    r <- level_2["r"]
    alpha <- level_2["alpha"]

    lambda <- rgamma(n = N, shape = r + x, rate = alpha + pmin(tau, T.cal))
    lambda[lambda == 0 | log(lambda) < -30] <- exp(-30)  # avoid numeric overflow
    return(lambda)
  }

  draw_mu <- function(data, level_1, level_2) {
    N <- nrow(data)
    tau <- level_1["tau", ]
    s <- level_2["s"]
    beta <- level_2["beta"]

    mu <- rgamma(n = N, shape = s + 1, rate = beta + tau)
    mu[mu == 0 | log(mu) < -30] <- exp(-30)  # avoid numeric overflow
    return(mu)
  }

  draw_tau <- function(data, level_1) {
    N <- nrow(data)
    tx <- data$t.x
    Tcal <- data$T.cal
    lambda <- level_1["lambda", ]
    mu <- level_1["mu", ]

    mu_lam <- mu + lambda
    t_diff <- Tcal - tx

    # sample z
    p_alive <- 1 / (1 + (mu / mu_lam) * (exp(mu_lam * t_diff) - 1))
    alive <- p_alive > runif(n = N)

    # sample tau
    tau <- numeric(N)

    # Case: still alive - left truncated exponential distribution -> [Tcal, Inf]
    if (any(alive)) {
      tau[alive] <- Tcal[alive] + rexp(sum(alive), mu[alive])
    }

    # Case: churned - double truncated exponential distribution -> [tx, Tcal]
    if (any(!alive)) {
      mu_lam_tx <- pmin(700, mu_lam[!alive] * tx[!alive])
      mu_lam_Tcal <- pmin(700, mu_lam[!alive] * Tcal[!alive])
      # sample with https://en.wikipedia.org/wiki/Inverse_transform_sampling
      rand <- runif(n = sum(!alive))
      tau[!alive] <- -log((1 - rand) * exp(-mu_lam_tx) + rand * exp(-mu_lam_Tcal)) / mu_lam[!alive] # nolint
    }

    return(tau)
  }

  run_single_chain <- function(chain_id = 1, data, hyper_prior) {

    ## initialize arrays for storing draws ##

    nr_of_cust <- nrow(data)
    nr_of_draws <- (mcmc - 1) %/% thin + 1
    level_2_draws <- array(NA_real_, dim = c(nr_of_draws, 4))
    dimnames(level_2_draws)[[2]] <- c("r", "alpha", "s", "beta")
    level_1_draws <- array(NA_real_, dim = c(nr_of_draws, 4, nr_of_cust))
    dimnames(level_1_draws)[[2]] <- c("lambda", "mu", "tau", "z")

    ## initialize parameters ##

    level_2 <- level_2_draws[1, ]
    level_2["r"] <- param_init$r
    level_2["alpha"] <- param_init$alpha
    level_2["s"] <- param_init$s
    level_2["beta"] <- param_init$beta

    level_1 <- level_1_draws[1, , ] # nolint
    level_1["lambda", ] <- mean(data$x) / mean(ifelse(data$t.x == 0, data$T.cal, data$t.x))
    level_1["tau", ] <- data$t.x + 0.5 / level_1["lambda", ]
    level_1["z", ] <- as.numeric(level_1["tau", ] > data$T.cal)
    level_1["mu", ] <- 1 / level_1["tau", ]

    ## run MCMC chain ##

    for (step in 1:(burnin + mcmc)) {
      if (step %% trace == 0)
        cat("chain:", chain_id, "step:", step, "of", (burnin + mcmc), "\n")

      # store
      if ( (step - burnin) > 0 & (step - 1 - burnin) %% thin == 0) {
        idx <- (step - 1 - burnin) %/% thin + 1
        level_1_draws[idx, , ] <- level_1 # nolint
        level_2_draws[idx, ] <- level_2
      }

      # draw individual-level parameters
      level_1["lambda", ] <- draw_lambda(data, level_1, level_2)
      level_1["mu", ] <- draw_mu(data, level_1, level_2)
      level_1["tau", ] <- draw_tau(data, level_1)
      level_1["z", ] <- as.numeric(level_1["tau", ] > data$T.cal)

      # draw heterogeneity parameters
      level_2[c("r", "alpha")] <- draw_gamma_params("lambda", level_1, level_2, hyper_prior)
      level_2[c("s", "beta")] <- draw_gamma_params("mu", level_1, level_2, hyper_prior)
    }

    # convert MCMC draws into coda::mcmc objects
    return(list(
      "level_1" = lapply(1:nr_of_cust,
                         function(i) mcmc(level_1_draws[, , i], start = burnin, thin = thin)), # nolint
      "level_2" = mcmc(level_2_draws, start = burnin, thin = thin)))
  }

  # set hyper priors
  hyper_prior <- list(r_1 = 0.001, r_2 = 0.001,
                      alpha_1 = 0.001, alpha_2 = 0.001,
                      s_1 = 0.001, s_2 = 0.001,
                      beta_1 = 0.001, beta_2 = 0.001)

  # set param_init (if not passed as argument)
  if (is.null(param_init)) {
    try({
        df <- cal.cbs[sample(nrow(cal.cbs), min(nrow(cal.cbs), 1000)), ]
        param_init <- BTYD::pnbd.EstimateParameters(df)
        names(param_init) <- c("r", "alpha", "s", "beta")
        param_init <- as.list(param_init)
      },
      silent = TRUE)
    if (is.null(param_init))
      param_init <- list(r = 1, alpha = 1, s = 1, beta = 1)
    cat("set param_init:", paste(round(unlist(param_init), 4), collapse = ", "), "\n")
  }

  # check whether input data meets requirements
  stopifnot(is.data.frame(cal.cbs))
  stopifnot(all(c("x", "t.x", "T.cal") %in% names(cal.cbs)))
  stopifnot(all(is.finite(cal.cbs$litt)))

  # run multiple chains - executed in parallel on Unix
  ncores <- ifelse(!is.null(mc.cores), min(chains, mc.cores), ifelse(.Platform$OS.type == "windows", 1, min(chains,
    detectCores())))
  if (ncores > 1)
    cat("running in parallel on", ncores, "cores\n")
  draws <- mclapply(1:chains, function(i) run_single_chain(i, cal.cbs, hyper_prior), mc.cores = ncores)

  # merge chains into code::mcmc.list objects
  out <- list(level_1 = lapply(1:nrow(cal.cbs), function(i) mcmc.list(lapply(draws, function(draw) draw$level_1[[i]]))),
    level_2 = mcmc.list(lapply(draws, function(draw) draw$level_2)))
  if ("cust" %in% names(cal.cbs))
    names(out$level_1) <- cal.cbs$cust
  return(out)
}


#' Simulate data according to Pareto/NBD model assumptions
#'
#' @param n Number of customers.
#' @param T.cal Length of calibration period. If a vector is provided, then it
#'   is assumed that customers have different 'birth' dates, i.e.
#'   \eqn{max(T.cal)-T.cal}.
#' @param T.star Length of holdout period. This may be a vector.
#' @param params A list of model parameters \code{r},
#'   \code{alpha}, \code{s}, \code{beta}.
#' @param date.zero Initial date for cohort start. Can be of class character, Date or POSIXt.
#' @return List of length 2:
#' \item{\code{cbs}}{A data.frame with a row for each customer and the summary statistic as columns.}
#' \item{\code{elog}}{A data.frame with a row for each transaction, and columns \code{cust}, \code{date} and \code{t}.}
#' @export
#' @examples
#' params <- list(r = 5, alpha = 10, s = 0.8, beta = 12)
#' data <- pnbd.GenerateData(n = 200, T.cal = 32, T.star = 32, params)
#' cbs <- data$cbs  # customer by sufficient summary statistic - one row per customer
#' elog <- data$elog  # Event log - one row per event/purchase
pnbd.GenerateData <- function(n, T.cal, T.star, params, date.zero = "2000-01-01") {
  params$k <- 1
  pggg.GenerateData(n, T.cal, T.star, params, date.zero)
}