-
Notifications
You must be signed in to change notification settings - Fork 49
/
pareto-nbd-mcmc.R
248 lines (214 loc) · 10 KB
/
pareto-nbd-mcmc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
#' Pareto/NBD (HB) Parameter Draws
#'
#' Returns draws from the posterior distributions of the Pareto/NBD (HB)
#' parameters, on cohort as well as on customer level.
#'
#' See \code{demo('pareto-ggg')} for how to apply this model.
#'
#' @param cal.cbs Calibration period customer-by-sufficient-statistic (CBS)
#' data.frame. It must contain a row for each customer, and columns \code{x}
#' for frequency, \code{t.x} for recency and \code{T.cal} for the total time
#' observed. A correct format can be easily generated based on the complete
#' event log of a customer cohort with \code{\link{elog2cbs}}.
#' @param mcmc Number of MCMC steps.
#' @param burnin Number of initial MCMC steps which are discarded.
#' @param thin Only every \code{thin}-th MCMC step will be returned.
#' @param chains Number of MCMC chains to be run.
#' @param mc.cores Number of cores to use in parallel (Unix only). Defaults to \code{min(chains, detectCores())}.
#' @param use_data_augmentation deprecated
#' @param param_init List of start values for cohort-level parameters.
#' @param trace Print logging statement every \code{trace}-th iteration. Not available for \code{mc.cores > 1}.
#' @return 2-element list:
#' \itemize{
#' \item{\code{level_1 }}{list of \code{\link{mcmc.list}}s, one for each customer, with draws for customer-level parameters \code{lambda}, \code{tau}, \code{z}, \code{mu}}
#' \item{\code{level_2 }}{\code{\link{mcmc.list}}, with draws for cohort-level parameters \code{r}, \code{alpha}, \code{s}, \code{beta}}
#' }
#' @export
#' @seealso \code{\link{pnbd.GenerateData} } \code{\link{mcmc.DrawFutureTransactions} } \code{\link{mcmc.PAlive} }
#' @references Ma, S. H., & Liu, J. L. (2007, August). The MCMC approach for
#' solving the Pareto/NBD model and possible extensions. In Third
#' international conference on natural computation (ICNC 2007) (Vol. 2, pp.
#' 505-512). IEEE. \doi{10.1109/ICNC.2007.728}
#' @references Abe, M. (2009). "Counting your customers" one by one: A
#' hierarchical Bayes extension to the Pareto/NBD model. Marketing Science,
#' 28(3), 541-553. \doi{10.1287/mksc.1090.0502}
#' @examples
#' data("groceryElog")
#' cbs <- elog2cbs(groceryElog, T.cal = "2006-12-31")
#' param.draws <- pnbd.mcmc.DrawParameters(cbs,
#' mcmc = 100, burnin = 50, thin = 10, chains = 1) # short MCMC to run demo fast
#'
#' # cohort-level parameter draws
#' as.matrix(param.draws$level_2)
#' # customer-level parameter draws for customer with ID '4'
#' as.matrix(param.draws$level_1[["4"]])
#'
#' # estimate future transactions
#' xstar.draws <- mcmc.DrawFutureTransactions(cbs, param.draws, cbs$T.star)
#' xstar.est <- apply(xstar.draws, 2, mean)
#' head(xstar.est)
pnbd.mcmc.DrawParameters <- function(cal.cbs, mcmc = 2500, burnin = 500, thin = 50, chains = 2, mc.cores = NULL,
use_data_augmentation = TRUE, param_init = NULL, trace = 100) {
# ** methods to sample heterogeneity parameters {r, alpha, s, beta} **
draw_gamma_params <- function(type, level_1, level_2, hyper_prior) {
if (type == "lambda") {
x <- level_1["lambda", ]
cur_params <- c(level_2["r"], level_2["alpha"])
hyper <- unlist(hyper_prior[c("r_1", "r_2", "alpha_1", "alpha_2")])
} else if (type == "mu") {
x <- level_1["mu", ]
cur_params <- c(level_2["s"], level_2["beta"])
hyper <- unlist(hyper_prior[c("s_1", "s_2", "beta_1", "beta_2")])
}
slice_sample_gamma_parameters(x, cur_params, hyper, steps = 50, w = 0.1)
}
# ** methods to sample individual-level parameters (with data augmentation) **
draw_lambda <- function(data, level_1, level_2) {
N <- nrow(data)
x <- data$x
T.cal <- data$T.cal
tau <- level_1["tau", ]
r <- level_2["r"]
alpha <- level_2["alpha"]
lambda <- rgamma(n = N, shape = r + x, rate = alpha + pmin(tau, T.cal))
lambda[lambda == 0 | log(lambda) < -30] <- exp(-30) # avoid numeric overflow
return(lambda)
}
draw_mu <- function(data, level_1, level_2) {
N <- nrow(data)
tau <- level_1["tau", ]
s <- level_2["s"]
beta <- level_2["beta"]
mu <- rgamma(n = N, shape = s + 1, rate = beta + tau)
mu[mu == 0 | log(mu) < -30] <- exp(-30) # avoid numeric overflow
return(mu)
}
draw_tau <- function(data, level_1) {
N <- nrow(data)
tx <- data$t.x
Tcal <- data$T.cal
lambda <- level_1["lambda", ]
mu <- level_1["mu", ]
mu_lam <- mu + lambda
t_diff <- Tcal - tx
# sample z
p_alive <- 1 / (1 + (mu / mu_lam) * (exp(mu_lam * t_diff) - 1))
alive <- p_alive > runif(n = N)
# sample tau
tau <- numeric(N)
# Case: still alive - left truncated exponential distribution -> [Tcal, Inf]
if (any(alive)) {
tau[alive] <- Tcal[alive] + rexp(sum(alive), mu[alive])
}
# Case: churned - double truncated exponential distribution -> [tx, Tcal]
if (any(!alive)) {
mu_lam_tx <- pmin(700, mu_lam[!alive] * tx[!alive])
mu_lam_Tcal <- pmin(700, mu_lam[!alive] * Tcal[!alive])
# sample with https://en.wikipedia.org/wiki/Inverse_transform_sampling
rand <- runif(n = sum(!alive))
tau[!alive] <- -log((1 - rand) * exp(-mu_lam_tx) + rand * exp(-mu_lam_Tcal)) / mu_lam[!alive] # nolint
}
return(tau)
}
run_single_chain <- function(chain_id = 1, data, hyper_prior) {
## initialize arrays for storing draws ##
nr_of_cust <- nrow(data)
nr_of_draws <- (mcmc - 1) %/% thin + 1
level_2_draws <- array(NA_real_, dim = c(nr_of_draws, 4))
dimnames(level_2_draws)[[2]] <- c("r", "alpha", "s", "beta")
level_1_draws <- array(NA_real_, dim = c(nr_of_draws, 4, nr_of_cust))
dimnames(level_1_draws)[[2]] <- c("lambda", "mu", "tau", "z")
## initialize parameters ##
level_2 <- level_2_draws[1, ]
level_2["r"] <- param_init$r
level_2["alpha"] <- param_init$alpha
level_2["s"] <- param_init$s
level_2["beta"] <- param_init$beta
level_1 <- level_1_draws[1, , ] # nolint
level_1["lambda", ] <- mean(data$x) / mean(ifelse(data$t.x == 0, data$T.cal, data$t.x))
level_1["tau", ] <- data$t.x + 0.5 / level_1["lambda", ]
level_1["z", ] <- as.numeric(level_1["tau", ] > data$T.cal)
level_1["mu", ] <- 1 / level_1["tau", ]
## run MCMC chain ##
for (step in 1:(burnin + mcmc)) {
if (step %% trace == 0)
cat("chain:", chain_id, "step:", step, "of", (burnin + mcmc), "\n")
# store
if ( (step - burnin) > 0 & (step - 1 - burnin) %% thin == 0) {
idx <- (step - 1 - burnin) %/% thin + 1
level_1_draws[idx, , ] <- level_1 # nolint
level_2_draws[idx, ] <- level_2
}
# draw individual-level parameters
level_1["lambda", ] <- draw_lambda(data, level_1, level_2)
level_1["mu", ] <- draw_mu(data, level_1, level_2)
level_1["tau", ] <- draw_tau(data, level_1)
level_1["z", ] <- as.numeric(level_1["tau", ] > data$T.cal)
# draw heterogeneity parameters
level_2[c("r", "alpha")] <- draw_gamma_params("lambda", level_1, level_2, hyper_prior)
level_2[c("s", "beta")] <- draw_gamma_params("mu", level_1, level_2, hyper_prior)
}
# convert MCMC draws into coda::mcmc objects
return(list(
"level_1" = lapply(1:nr_of_cust,
function(i) mcmc(level_1_draws[, , i], start = burnin, thin = thin)), # nolint
"level_2" = mcmc(level_2_draws, start = burnin, thin = thin)))
}
# set hyper priors
hyper_prior <- list(r_1 = 0.001, r_2 = 0.001,
alpha_1 = 0.001, alpha_2 = 0.001,
s_1 = 0.001, s_2 = 0.001,
beta_1 = 0.001, beta_2 = 0.001)
# set param_init (if not passed as argument)
if (is.null(param_init)) {
try({
df <- cal.cbs[sample(nrow(cal.cbs), min(nrow(cal.cbs), 1000)), ]
param_init <- BTYD::pnbd.EstimateParameters(df)
names(param_init) <- c("r", "alpha", "s", "beta")
param_init <- as.list(param_init)
},
silent = TRUE)
if (is.null(param_init))
param_init <- list(r = 1, alpha = 1, s = 1, beta = 1)
cat("set param_init:", paste(round(unlist(param_init), 4), collapse = ", "), "\n")
}
# check whether input data meets requirements
stopifnot(is.data.frame(cal.cbs))
stopifnot(all(c("x", "t.x", "T.cal") %in% names(cal.cbs)))
stopifnot(all(is.finite(cal.cbs$litt)))
# run multiple chains - executed in parallel on Unix
ncores <- ifelse(!is.null(mc.cores), min(chains, mc.cores), ifelse(.Platform$OS.type == "windows", 1, min(chains,
detectCores())))
if (ncores > 1)
cat("running in parallel on", ncores, "cores\n")
draws <- mclapply(1:chains, function(i) run_single_chain(i, cal.cbs, hyper_prior), mc.cores = ncores)
# merge chains into code::mcmc.list objects
out <- list(level_1 = lapply(1:nrow(cal.cbs), function(i) mcmc.list(lapply(draws, function(draw) draw$level_1[[i]]))),
level_2 = mcmc.list(lapply(draws, function(draw) draw$level_2)))
if ("cust" %in% names(cal.cbs))
names(out$level_1) <- cal.cbs$cust
return(out)
}
#' Simulate data according to Pareto/NBD model assumptions
#'
#' @param n Number of customers.
#' @param T.cal Length of calibration period. If a vector is provided, then it
#' is assumed that customers have different 'birth' dates, i.e.
#' \eqn{max(T.cal)-T.cal}.
#' @param T.star Length of holdout period. This may be a vector.
#' @param params A list of model parameters \code{r},
#' \code{alpha}, \code{s}, \code{beta}.
#' @param date.zero Initial date for cohort start. Can be of class character, Date or POSIXt.
#' @return List of length 2:
#' \item{\code{cbs}}{A data.frame with a row for each customer and the summary statistic as columns.}
#' \item{\code{elog}}{A data.frame with a row for each transaction, and columns \code{cust}, \code{date} and \code{t}.}
#' @export
#' @examples
#' params <- list(r = 5, alpha = 10, s = 0.8, beta = 12)
#' data <- pnbd.GenerateData(n = 200, T.cal = 32, T.star = 32, params)
#' cbs <- data$cbs # customer by sufficient summary statistic - one row per customer
#' elog <- data$elog # Event log - one row per event/purchase
pnbd.GenerateData <- function(n, T.cal, T.star, params, date.zero = "2000-01-01") {
params$k <- 1
pggg.GenerateData(n, T.cal, T.star, params, date.zero)
}