/
gp_example.R
288 lines (281 loc) · 11.9 KB
/
gp_example.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
# =========================== gpd_sums_stats ===========================
#' Generalized Pareto summary statistics
#'
#' Calculates summary statistics involved in the Generalized Pareto
#' log-likelihood.
#'
#' @param gpd_data A numeric vector containing positive values.
#' @return A list with components
#' \item{gpd_data}{A numeric vector. The input vector with any missings
#' removed.}
#' \item{m}{A numeric scalar. The sample size, i.e., the number of
#' non-missing values.}
#' \item{xm}{A numeric scalar. The sample maximum}
#' \item{sum_gp}{A numeric scalar. The sum of the non-missing sample
#' values.}
#' @examples
#' \donttest{
#' # Sample data from a GP(sigma, xi) distribution
#' gpd_data <- rgpd(m = 100, xi = 0, sigma = 1)
#' # Calculate summary statistics for use in the log-likelihood
#' ss <- gpd_sum_stats(gpd_data)
#' }
#' @seealso \code{\link{rgpd}} for simulation from a generalized Pareto
#' distribution.
#' @export
gpd_sum_stats <- function(gpd_data) {
ss <- list()
nas <- is.na(gpd_data)
if (any(nas))
warning("Missing values have been removed from the data")
gpd_data <- gpd_data[!nas]
ss$gpd_data <- gpd_data
ss$m <- length(gpd_data) # sample size
ss$xm <- max(gpd_data) # maximum threshold excess
ss$sum_gp <- sum(gpd_data) # sum of threshold excesses
return(ss)
}
# =========================== gpd_logpost ===========================
#' Generalized Pareto posterior log-density
#'
#' Calculates the generalized Pareto posterior log-density based on a particular
#' prior for the generalized Pareto parameters, a Maximal Data Information
#' (MDI) prior truncated to \eqn{\xi \geq -1}{\xi >= -1} in order to produce a
#' posterior density that is proper.
#'
#' @param pars A numeric vector containing the values of the generalized Pareto
#' parameters \eqn{\sigma} and \eqn{\xi}.
#' @param ss A numeric list. Summary statistics to be passed to the generalized
#' Pareto log-likelihood. Calculated using \code{gpd_sum_stats}
#' @return A numeric scalar. The value of the log-likelihood.
#' @seealso \code{\link{gpd_sum_stats}} to calculate summary statistics for
#' use in \code{gpd_loglik}.
#' @seealso \code{\link{rgpd}} for simulation from a generalized Pareto
#' @references Northrop, P. J. and Attalides, N. (2016) Posterior propriety in
#' Bayesian extreme value analyses using reference priors. Statistica Sinica,
#' 26(2), 721-743, \doi{10.5705/ss.2014.034}.
#' @examples
#' \donttest{
#' # Sample data from a GP(sigma, xi) distribution
#' gpd_data <- rgpd(m = 100, xi = 0, sigma = 1)
#' # Calculate summary statistics for use in the log-likelihood
#' ss <- gpd_sum_stats(gpd_data)
#' # Calculate the generalized Pareto log-posterior
#' gpd_logpost(pars = c(1, 0), ss = ss)
#' }
#' @export
gpd_logpost <- function(pars, ss) {
loglik <- do.call(gpd_loglik, c(list(pars = pars), ss))
logprior <- log_gpd_mdi_prior(pars = pars)
return(loglik + logprior)
}
# =========================== rgpd ===========================
#' Generalized Pareto simulation
#'
#' Simulates a sample of size \code{m} from a generalized Pareto distribution.
#'
#' @param m A numeric scalar. The size of sample required.
#' @param sigma A numeric scalar. The generalized Pareto scale parameter
#' \eqn{\sigma}.
#' @param xi A numeric scalar. The generalized Pareto shape parameter
#' \eqn{\xi}.
#' @return A numeric vector. A generalized Pareto sample of size \code{m}.
#' @examples
#' \donttest{
#' # Sample data from a GP(sigma, xi) distribution
#' gpd_data <- rgpd(m = 100, xi = 0, sigma = 1)
#' }
#' @export
rgpd <- function (m = 1, sigma = 1, xi = 0) {
if (min(sigma) <= 0) {
stop("sigma must be positive")
}
if (length(xi) != 1) {
stop("xi must be scalar")
}
if (xi==0) {
return(sigma * stats::rexp(m))
} else {
return(sigma * (stats::runif(m) ^ (-xi) - 1) / xi)
}
}
# =========================== gpd_init ===========================
#' Initial estimates for Generalized Pareto parameters
#'
#' Calculates initial estimates and estimated standard errors (SEs) for the
#' generalized Pareto parameters \eqn{\sigma} and \eqn{\xi} based on an
#' assumed random sample from this distribution. Also, calculates
#' initial estimates and estimated standard errors for
#' \ifelse{html}{\eqn{\phi}\out{<sub>1</sub>} = \eqn{\sigma}}{
#' \eqn{\phi_1 = \sigma}}
#' and
#' \ifelse{html}{\eqn{\phi}\out{<sub>2</sub>} = \eqn{\xi + \sigma x}
#' \out{<sub>(m)</sub>}}{\eqn{\phi_1 = \xi + \sigma x_{(m)}}}, where
#' \ifelse{html}{\eqn{x}\out{<sub>(m)</sub>}}{\eqn{x_{(m)}}} is the sample
#' maximum threshold exceedance.
#'
#' @param gpd_data A numeric vector containing positive sample values.
#' @param m A numeric scalar. The sample size, i.e., the length of
#' \code{gpd_data}.
#' @param xm A numeric scalar. The sample maximum.
#' @param sum_gp A numeric scalar. The sum of the sample values.
#' @param xi_eq_zero A logical scalar. If TRUE assume that the shape
#' parameter \eqn{\xi = 0}.
#' @param init_ests A numeric vector. Initial estimate of
#' \eqn{\theta = (\sigma, \xi)}. If supplied \code{gpd_init()}
#' returns the corresponding initial estimate of
#' \ifelse{html}{\eqn{\phi} = (\eqn{\phi}\out{<sub>1</sub>},
#' \eqn{\phi}\out{<sub>2</sub>})}{\eqn{\phi = (\phi_1, \phi_2)}}.
#' @details The main aim is to calculate an admissible estimate of
#' \eqn{\theta}, i.e., one at which the log-likelihood is finite (necessary
#' for the posterior log-density to be finite) at the estimate, and
#' associated estimated SEs. These are converted into estimates and SEs for
#' \eqn{\phi}. The latter can be used to set values of \code{min_phi} and
#' \code{max_phi} for input to \code{find_lambda}.
#'
#' In the default setting (\code{xi_eq_zero = FALSE} and
#' \code{init_ests = NULL}) the methods tried are Maximum Likelihood
#' Estimation (MLE) (Grimshaw, 1993), Probability-Weighted Moments (PWM)
#' (Hosking and Wallis, 1987) and Linear Combinations of Ratios of Spacings
#' (LRS) (Reiss and Thomas, 2007, page 134) in that order.
#'
#' For \eqn{\xi < -1} the likelihood is unbounded, MLE may fail when
#' \eqn{\xi} is not greater than \eqn{-0.5} and the observed Fisher
#' information for \eqn{(sigma, xi)} has finite variance only if
#' \eqn{\xi > -0.25}. We use the ML estimate provided that
#' the estimate of \eqn{\xi} returned from \code{gpd_mle} is greater than
#' \eqn{-1}. We only use the SE if the MLE of \eqn{\xi} is greater than
#' \eqn{-0.25}.
#'
#' If either the MLE or the SE are not OK then we try PWM. We use the PWM
#' estimate only if is admissible, and the MLE was not OK. We use the PWM SE,
#' but this will be \code{c(NA, NA)} is the PWM estimate of \eqn{\xi} is
#' \eqn{> 1/2}. If the estimate is still not OK then we try LRS. As a last
#' resort, which will tend to occur only when \eqn{\xi} is strongly negative,
#' we set \eqn{\xi = -1} and estimate sigma conditional on this.
#' @return If \code{init_ests} is not supplied by the user, a list is returned
#' with components
#' \item{init}{A numeric vector. Initial estimates of \eqn{\sigma}
#' and \eqn{\xi}.}
#' \item{se}{A numeric vector. Estimated standard errors of
#' \eqn{\sigma} and \eqn{\xi}.}
#' \item{init_phi}{A numeric vector. Initial estimates of
#' \ifelse{html}{\eqn{\phi}\out{<sub>1</sub>} = \eqn{\sigma}}{
#' \eqn{\phi_1 = \sigma}}
#' and
#' \ifelse{html}{\eqn{\phi}\out{<sub>2</sub>} = \eqn{\xi + \sigma x}
#' \out{<sub>(m)</sub>}}{\eqn{\phi_1 = \xi + \sigma x_{(m)}}}
#' where \ifelse{html}{\eqn{x}\out{<sub>(m)</sub>}}{\eqn{x_{(m)}}}
#' is the maximum of \code{gpd_data}.}
#' \item{se_phi}{A numeric vector. Estimated standard errors of
#' \ifelse{html}{\eqn{\phi}\out{<sub>1</sub>}}{\eqn{\phi_1}}
#' and
#' \ifelse{html}{\eqn{\phi}\out{<sub>1</sub>}}{\eqn{\phi_2}}.}
#' If \code{init_ests} is supplied then only the numeric vector
#' \code{init_phi} is returned.
#' @references Grimshaw, S. D. (1993) Computing Maximum Likelihood Estimates
#' for the Generalized Pareto Distribution. Technometrics, 35(2), 185-191.
#' and Computing (1991) 1, 129-133. \doi{10.1007/BF01889987}.
#' @references Hosking, J. R. M. and Wallis, J. R. (1987) Parameter and Quantile
#' Estimation for the Generalized Pareto Distribution. Technometrics, 29(3),
#' 339-349. \doi{10.2307/1269343}.
#' @references Reiss, R.-D., Thomas, M. (2007) Statistical Analysis of Extreme Values
#' with Applications to Insurance, Finance, Hydrology and Other Fields.Birkhauser.
#' \doi{10.1007/978-3-7643-7399-3}.
#' @seealso \code{\link{gpd_sum_stats}} to calculate summary statistics for
#' use in \code{gpd_loglik}.
#' @seealso \code{\link{rgpd}} for simulation from a generalized Pareto
#' @seealso \code{\link{find_lambda}} to produce (somewhat) automatically
#' a list for the argument \code{lambda} of \code{ru}.
#' @examples
#' \donttest{
#' # Sample data from a GP(sigma, xi) distribution
#' gpd_data <- rgpd(m = 100, xi = 0, sigma = 1)
#' # Calculate summary statistics for use in the log-likelihood
#' ss <- gpd_sum_stats(gpd_data)
#' # Calculate initial estimates
#' do.call(gpd_init, ss)
#' }
#' @export
gpd_init <- function(gpd_data, m, xm, sum_gp = NULL, xi_eq_zero = FALSE,
init_ests = NULL) {
#
theta_to_phi <- function(theta) c(theta[1], theta[2] + theta[1] / xm)
#
if (!is.null(init_ests)) {
return(theta_to_phi(init_ests))
}
if (xi_eq_zero) {
s_hat <- mean(gpd_data)
init <- c(s_hat, 0)
init_phi <- theta_to_phi(init)
cov_mtx <- matrix(c(2*s_hat^2, -s_hat, -s_hat, 1), 2, 2) / m
se <- sqrt(diag(cov_mtx))
mat <- matrix(c(1, 0, 1 / xm, 1), 2, 2, byrow = TRUE)
var_phi <- mat %*% cov_mtx %*% t(mat)
se_phi <- sqrt(diag(var_phi))
return(list(init = init, se = se, init_phi = init_phi, se_phi = se_phi))
}
#
ests_ok <- ses_ok <- FALSE
# First try MLE
temp <- gpd_mle(gpd_data)
init <- temp$mle
# Check that the MLE is OK
if (!is.na(init[1]) & init[2] > -1 & init[2] > -init[1]/xm &
!is.infinite(temp$nllh)){
ests_ok <- TRUE
# Check whether or not we should use the SE
if (init[2] > -0.25){
cov_mtx <- solve(gpd_obs_info(init, gpd_data))
se <- sqrt(diag(cov_mtx))
mat <- matrix(c(1, 0, 1 / xm, 1), 2, 2, byrow = TRUE)
var_phi <- mat %*% cov_mtx %*% t(mat)
if (all(diag(var_phi) > 0)){
se_phi <- sqrt(diag(var_phi))
ses_ok <- TRUE
}
}
}
if (!ests_ok | !ses_ok){
# Try PWM
if (!requireNamespace("revdbayes", quietly = TRUE)) {
stop("revdbayes needed for this function to work. Please install it.",
call. = FALSE)
}
pwm <- revdbayes::gp_pwm(gpd_data)
se <- pwm$se
mat <- matrix(c(1, 0, 1 / xm, 1), 2, 2, byrow = TRUE)
var_phi <- mat %*% pwm$cov %*% t(mat)
se_phi <- sqrt(diag(var_phi))
# Note: se and se_phi will be NA if pwm$est[2] > 1/2
check <- gpd_loglik(pars = pwm$est, gpd_data = gpd_data, m = m, xm = xm,
sum_gp = sum_gp)
# If MLE wasn't OK and PWM estimate is OK then use PWM estimate
if (!ests_ok & init[2] > -1 & !is.infinite(check)) {
init <- pwm$est
ests_ok <- TRUE
}
}
# If estimate is not OK then try LRS
if (!ests_ok){
if (!requireNamespace("revdbayes", quietly = TRUE)) {
stop("revdbayes needed for this function to work. Please install it.",
call. = FALSE)
}
init <- revdbayes::gp_lrs(gpd_data)
check <- gpd_loglik(pars = init, gpd_data = gpd_data, m = m, xm = xm,
sum_gp = sum_gp)
if (init[2] > -1 & !is.infinite(check)) {
ests_ok <- TRUE
}
}
# If we get here without ests_ok = TRUE then the posterior mode is
# probably close to xi = -1. We want to avoid xi < -1 so we set
# xi = -1 and use the (bias-adjusted) mle of sigma conditional on xi = -1.
if (!ests_ok) {
init <- c(xm * (m + 1) / m, -1)
}
init_phi <- theta_to_phi(init)
return(list(init = init, se = se, init_phi = init_phi, se_phi = se_phi))
}