Added convergence rate estimator from Politis et al. 1999
niklhart committed Feb 26, 2024
1 parent a30f1d4 commit 306c65b
Showing 7 changed files with 309 additions and 11 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
@@ -2,6 +2,7 @@

export(combinations)
export(constDiagMatrix)
+export(convergence_rate)
export(kld_ci_bootstrap)
export(kld_ci_subsampling)
export(kld_discrete)
99 changes: 99 additions & 0 deletions R/convergence-rate.R
@@ -0,0 +1,99 @@
# convergence rate of estimators


#' Empirical convergence rate of a KL divergence estimator
#'
#' Subsampling-based confidence intervals computed by `kld_ci_subsampling()`
#' require the convergence rate of the KL divergence estimator as an input. The
#' default rate of `0.5` assumes that the variance term dominates the bias term.
#' For high-dimensional problems, depending on the data, the convergence rate
#' might be lower. This function derives the convergence rate empirically
#' from subsamples of the data.
#'
#' Note: Currently, only the one-sample version is implemented.
#'
#' @inheritParams kld_est
#' @param estimator A KL divergence estimator.
#' @param n.sizes Number of different subsample sizes to use (default: `4`).
#' @param spacing.factor Multiplicative factor controlling the spacing of
#' subsample sizes (default: `1.5`).
#' @param typical.subsample A function that produces a typical subsample size,
#' used as the geometric mean of subsample sizes (default: `sqrt(n)`).
#' @param b Subsample sizes to be used. The default `NULL` means `b` is computed
#' from `n.sizes`, `spacing.factor` and `typical.subsample`. If a non-`NULL`
#' value for `b` is given, inputs `n.sizes`, `spacing.factor` and
#' `typical.subsample` are ignored.
#' @param B Number of subsamples to draw per subsample size (default: `500`).
#' @param plot A boolean (default: `FALSE`) controlling whether to produce a
#' diagnostic plot visualizing the fit.
#' @returns A scalar, the parameter \eqn{\beta} in the empirical convergence
#' rate \eqn{n^{-\beta}} of the `estimator` to the true KL divergence.
#' It can be used in the `convergence.rate` argument of `kld_ci_subsampling()`
#' as `convergence.rate = function(n) n^beta`.
#'
#' References:
#'
#' Politis, Romano and Wolf (1999). "Subsampling", Chapter 8 (for the underlying theory).
#'
#' The implementation has been adapted from lecture notes by C. J. Geyer,
#' https://www.stat.umn.edu/geyer/5601/notes/sub.pdf
#'
#' @export
convergence_rate <- function(estimator, X, Y = NULL, q = NULL,
                             n.sizes = 4, spacing.factor = 1.5,
                             typical.subsample = function(n) sqrt(n),
                             b = NULL, B = 500L, plot = FALSE) {

  two.sample <- is_two_sample(Y, q)

  # coerce X to matrix form and extract the sample size
  if (is.vector(X)) X <- as.matrix(X)
  n <- nrow(X)

  # subsample sizes: a geometric sequence with ratio `spacing.factor`,
  # centred so that its geometric mean equals `typical.subsample(n)`
  if (is.null(b)) {
    bmin <- typical.subsample(n) / spacing.factor^((n.sizes-1)/2)
    b <- floor(bmin * spacing.factor^(0:(n.sizes-1)))
  }

  if (two.sample) {
    stop("Two-sample version not implemented yet.")

  } else {

    # KL divergence estimate on the full sample
    theta.hat <- estimator(X, q = q)

    # backwards nested subsampling: for each replicate, the subsample of
    # size b[j] is drawn (by rows) from the larger subsample of size b[j+1]
    theta.star <- matrix(NA, B, length(b))
    for (i in 1:B) {
      X.star <- X
      for (j in length(b):1) {
        X.star <- X.star[sample.int(nrow(X.star), b[j]), , drop = FALSE]
        theta.star[i, j] <- estimator(X.star, q = q)
      }
    }

    # centred subsampling distribution at each subsample size
    zmat <- theta.star - theta.hat

    # log quantile ranges of the centred distributions; these shrink at
    # rate b^(-beta) as the subsample size b grows
    l_probs <- seq(0.05, 0.45, by = 0.05)
    u_probs <- seq(0.55, 0.95, by = 0.05)
    lqmat <- log(apply(zmat, MARGIN = 2, FUN = quantile, probs = u_probs) -
                 apply(zmat, MARGIN = 2, FUN = quantile, probs = l_probs))
    dimnames(lqmat) <- list(NULL, b)

    # beta is minus the least-squares slope of the mean log quantile
    # range regressed on log subsample size
    y    <- colMeans(lqmat)
    beta <- -cov(y, log(b)) / var(log(b))
  }

  if (plot) {
    inter <- mean(y) + beta * mean(log(b))
    plot(rep(b, each = nrow(lqmat)), as.vector(lqmat),
         xlab = "Subsample size",
         ylab = "log(high quantile - low quantile)",
         main = paste0("Empirical convergence rate (beta = ", signif(beta, 3), ")"),
         log = "x")
    lines(b, inter - beta*log(b), col = "red")
  }
  beta
}
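
A minimal usage sketch (illustrative, not part of this commit; data and parameter choices are assumptions): estimate the rate for the one-sample 1-NN estimator, then pass it on to `kld_ci_subsampling()` as described in the `@returns` field.

```r
# one-sample problem: X ~ p, q given as a density (illustrative data)
set.seed(1)
X <- rnorm(100)                      # sample from p = N(0, 1)
q <- function(x) dnorm(x, mean = 1)  # density of q = N(1, 1)

# empirical convergence rate of the 1-NN estimator
beta <- convergence_rate(kld_est_nn, X = X, q = q)

# use the estimated rate for subsampling confidence intervals
kld_ci_subsampling(X, q = q, estimator = kld_est_nn,
                   convergence.rate = function(n) n^beta)
```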
10 changes: 5 additions & 5 deletions R/kld-uncertainty.R
@@ -106,11 +106,11 @@ kld_ci_bootstrap <- function(X, Y, estimator = kld_est_kde1, B = 500L,
#' \eqn{1 - \alpha} as long as \eqn{b_n/n\rightarrow 0},
#' \eqn{b_n\rightarrow\infty} and \eqn{\frac{\tau_{b_n}}{\tau_n}\rightarrow 0}.
#'
-#' The convergence rate of the nearest-neighbour based KL divergence estimator
-#' being \eqn{\tau_n = \sqrt{n}}, the condition on the subsample size reduces to
-#' \eqn{b_n/n\rightarrow 0} and \eqn{b_n\rightarrow\infty}. By default,
-#' \eqn{b_n = n^{2/3}}. In a two-sample problem, \eqn{n} and \eqn{b_n} are
-#' replaced by effective sample sizes \eqn{n_\text{eff} = \min(n,m)} and
+#' In many cases, the convergence rate of the nearest-neighbour based KL
+#' divergence estimator is \eqn{\tau_n = \sqrt{n}} and the condition on the
+#' subsample size reduces to \eqn{b_n/n\rightarrow 0} and \eqn{b_n\rightarrow\infty}.
+#' By default, \eqn{b_n = n^{2/3}}. In a two-sample problem, \eqn{n} and \eqn{b_n}
+#' are replaced by effective sample sizes \eqn{n_\text{eff} = \min(n,m)} and
#' \eqn{b_{n,\text{eff}} = \min(b_n,b_m)}.
#'
#' Reference:
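A small numeric illustration (not part of the commit; the sample sizes are hypothetical) of the defaults described in the documentation above:

```r
# default subsample size b_n = n^(2/3), and effective sizes in a
# two-sample problem with sample sizes n and m (illustrative values)
n <- 1000; m <- 400
bn <- function(n) floor(n^(2/3))
n_eff <- min(n, m)          # effective sample size: min(n, m)
b_eff <- min(bn(n), bn(m))  # effective subsample size: min(b_n, b_m)
c(n_eff = n_eff, b_eff = b_eff)
```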
3 changes: 2 additions & 1 deletion R/kldest-package.R
@@ -2,7 +2,8 @@
"_PACKAGE"

## usethis namespace: start
-#' @importFrom stats dnorm qnorm ecdf sd approxfun density quantile
+#' @importFrom stats dnorm qnorm ecdf sd approxfun density quantile cov var
+#' @importFrom graphics lines
#' @importFrom utils head tail
## usethis namespace: end
NULL
82 changes: 82 additions & 0 deletions man/convergence_rate.Rd

10 changes: 5 additions & 5 deletions man/kld_ci_subsampling.Rd

115 changes: 115 additions & 0 deletions scripts/empirical-convergence-rates.Rmd
@@ -0,0 +1,115 @@
---
title: "Empirical convergence rate of KL divergence estimators"
author: "Niklas Hartung"
date: "`r Sys.Date()`"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## Topic

The subsampling method, used to generate confidence intervals for a KL divergence
estimate, requires the convergence rate of the estimator as an input.

According to the literature (Noh 2014, Singh 2016, Zhao 2020), the 1-NN estimator
has a variance of $O(1/n)$ and a dimension-dependent bias. The exact shape of the
bias depends on the smoothness assumptions for the distribution. As a default in the
subsampling method, a $\sqrt{n}$ convergence rate is assumed, which amounts to
assuming that the bias term is of no larger order than the variance term.

As an alternative to relying on such a theoretically derived convergence rate, the
rate can be determined empirically, using the method described in Politis et al.
(1999) and sketched below.
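
The idea behind the method, as implemented in `convergence_rate()`: quantile ranges
of the centred subsampling distribution $\hat D_b - \hat D_n$ shrink at rate
$b^{-\beta}$ in the subsample size $b$, so that for fixed quantile levels $u > l$,

$$\log\bigl(q_u(b) - q_l(b)\bigr) \approx \text{const} - \beta \log b,$$

and $\beta$ is estimated as minus the least-squares slope of the mean log quantile
ranges regressed on $\log b$ across several subsample sizes.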

For now, only the one-sample variant is implemented.

### Preliminaries

Load libraries

```{r}
library(kldest)
library(ggplot2)
```

For reproducibility

```{r}
set.seed(0)
```

### Simulation parameters

Global parameters

```{r}
n <- 1000L # sample size
n.sizes <- 5L # number of different subsample sizes
B <- 1000L # number of subsamples to simulate per subsample size
```
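
For orientation, a sketch of the subsample sizes implied by these settings, assuming
the geometric-mean construction used in `convergence_rate()` (with its default
`spacing.factor` of `1.5`):

```{r}
spacing.factor <- 1.5  # default spacing of convergence_rate()
bmin <- sqrt(n) / spacing.factor^((n.sizes-1)/2)
floor(bmin * spacing.factor^(0:(n.sizes-1)))
```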

Estimators to be evaluated. Currently, the bias-reduced estimator cannot be included,
since it requires two samples, and the two-sample case of `convergence_rate()` is
not implemented yet.

```{r}
estimators <- list(
  "nn1" = kld_est_nn#,
  # "brnn1" = function(...) kld_est_brnn(..., warn.max.k = FALSE)
)
nestim <- length(estimators)
```

Different distributions $p$ and $q$:

```{r}
scenarios <- list(
  "2-D, independent"  = list(mu1    = c(2,0),
                             sigma1 = diag(c(1,100)),
                             mu2    = c(0,10),
                             sigma2 = diag(c(1,100))),
  "2-D, corr. diff."  = list(mu1    = c(0,0),
                             sigma1 = constDiagMatrix(dim = 2, diag = 3, offDiag = 1),
                             mu2    = c(0,0),
                             sigma2 = constDiagMatrix(dim = 2, diag = 3, offDiag = -1)),
  "10-D, corr. diff." = list(mu1    = rep(0,10),
                             sigma1 = constDiagMatrix(dim = 10, diag = 1, offDiag = 0.999),
                             mu2    = rep(0,10),
                             sigma2 = diag(10))
)
```

Compute the true KL divergence in each scenario (for reference; it is not needed for the rate estimation)

```{r}
(kld_true <- vapply(scenarios, function(x) do.call(kld_gaussian,x), 1))
```
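
For reference, the closed-form expression evaluated by `kld_gaussian()` for
$p = \mathcal{N}(\mu_1, \Sigma_1)$ and $q = \mathcal{N}(\mu_2, \Sigma_2)$ in
dimension $d$ is the standard Gaussian KL divergence

$$D_{KL}(p \,\|\, q) = \frac{1}{2}\left(\operatorname{tr}(\Sigma_2^{-1}\Sigma_1) + (\mu_2-\mu_1)^\top \Sigma_2^{-1}(\mu_2-\mu_1) - d + \log\frac{\det \Sigma_2}{\det \Sigma_1}\right).$$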

## Simulation study: empirical convergence rate

```{r}
rates <- matrix(nrow = length(scenarios),
                ncol = nestim,
                dimnames = list(names(scenarios), names(estimators)))
for (i in seq_along(scenarios)) {
  p <- scenarios[[i]]
  X <- MASS::mvrnorm(n, mu = p$mu1, Sigma = p$sigma1)
  q <- function(x) mvdnorm(x, mu = p$mu2, Sigma = p$sigma2)
  # Y <- MASS::mvrnorm(n, mu = p$mu2, Sigma = p$sigma2) # not supported yet...
  for (j in 1:nestim) {
    rates[i,j] <- convergence_rate(estimators[[j]], X = X, q = q,
                                   n.sizes = n.sizes, B = B, plot = TRUE)
  }
}
```

$\Rightarrow$ Even in the high-dimensional example, the estimated convergence rate
is approximately as expected from the variance term.

