Added convergence rate estimator from Politis et al. 1999
niklhart committed Feb 26, 2024
1 parent a30f1d4 commit 306c65b
Showing 7 changed files with 309 additions and 11 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
@@ -2,6 +2,7 @@

export(combinations)
export(constDiagMatrix)
+export(convergence_rate)
export(kld_ci_bootstrap)
export(kld_ci_subsampling)
export(kld_discrete)
99 changes: 99 additions & 0 deletions R/convergence-rate.R
@@ -0,0 +1,99 @@
# convergence rate of estimators


#' Empirical convergence rate of a KL divergence estimator
#'
#' Subsampling-based confidence intervals computed by `kld_ci_subsampling()`
#' require the convergence rate of the KL divergence estimator as an input. The
#' default rate of `0.5` assumes that the variance term dominates the bias term.
#' For high-dimensional problems, depending on the data, the convergence rate
#' might be lower. This function derives the convergence rate empirically
#' from subsamples of the data.
#'
#' Note: Currently, only the one-sample version is implemented.
#'
#' @inheritParams kld_est
#' @param estimator A KL divergence estimator.
#' @param n.sizes Number of different subsample sizes to use (default: `4`).
#' @param spacing.factor Multiplicative factor controlling the spacing of
#' subsample sizes (default: `1.5`).
#' @param typical.subsample A function that produces a typical subsample size,
#' used as the geometric mean of subsample sizes (default: `sqrt(n)`).
#' @param b Subsample sizes to be used. The default `NULL` means `b` is computed
#' from `n.sizes`, `spacing.factor` and `typical.subsample`. If a non-`NULL`
#' value for `b` is given, inputs `n.sizes`, `spacing.factor` and
#' `typical.subsample` are ignored.
#' @param B Number of subsamples to draw per subsample size (default: `500`).
#' @param plot A boolean (default: `FALSE`) controlling whether to produce a
#' diagnostic plot visualizing the fit.
#' @returns A scalar, the parameter \eqn{\beta} in the empirical convergence
#' rate \eqn{n^{-\beta}} of the `estimator` to the true KL divergence.
#' It can be used in the `convergence.rate` argument of `kld_ci_subsampling()`
#' as `convergence.rate = function(n) n^beta`.
#'
#' References:
#'
#' Politis, Romano and Wolf (1999). "Subsampling", Chapter 8 (for the underlying theory).
#'
#' The implementation has been adapted from lecture notes by C. J. Geyer,
#' https://www.stat.umn.edu/geyer/5601/notes/sub.pdf
#'
#' @export
convergence_rate <- function(estimator, X, Y = NULL, q = NULL,
                             n.sizes = 4, spacing.factor = 1.5,
                             typical.subsample = function(n) sqrt(n),
                             b = NULL, B = 500L, plot = FALSE) {

  two.sample <- is_two_sample(Y, q)

  # coerce X to matrix form and extract the sample size
  if (is.vector(X)) X <- as.matrix(X)
  n <- nrow(X)

  # subsample sizes: a geometric sequence with ratio `spacing.factor`,
  # centred so that its geometric mean equals `typical.subsample(n)`
  if (is.null(b)) {
    bmin <- typical.subsample(n) / spacing.factor^((n.sizes-1)/2)
    b <- floor(bmin * spacing.factor^(0:(n.sizes-1)))
  }

  if (two.sample) {
    stop("Two-sample version not implemented yet.")

  } else {

    # KL divergence estimate on the full sample
    theta.hat <- estimator(X, q = q)

    # backwards nested subsampling: for each replicate, the subsample of
    # size b[j] is drawn (by rows) from the larger subsample of size b[j+1]
    theta.star <- matrix(NA, B, length(b))
    for (i in 1:B) {
      X.star <- X
      for (j in length(b):1) {
        X.star <- X.star[sample.int(nrow(X.star), b[j]), , drop = FALSE]
        theta.star[i, j] <- estimator(X.star, q = q)
      }
    }

    # centred subsampling distribution at each subsample size
    zmat <- theta.star - theta.hat

    # log quantile ranges of the centred distributions; these shrink at
    # rate b^(-beta) as the subsample size b grows
    l_probs <- seq(0.05, 0.45, by = 0.05)
    u_probs <- seq(0.55, 0.95, by = 0.05)
    lqmat <- log(apply(zmat, MARGIN = 2, FUN = quantile, probs = u_probs) -
                 apply(zmat, MARGIN = 2, FUN = quantile, probs = l_probs))
    dimnames(lqmat) <- list(NULL, b)

    # beta is minus the least-squares slope of the mean log quantile
    # range regressed on log subsample size
    y    <- colMeans(lqmat)
    beta <- -cov(y, log(b)) / var(log(b))
  }

  if (plot) {
    inter <- mean(y) + beta * mean(log(b))
    plot(rep(b, each = nrow(lqmat)), as.vector(lqmat),
         xlab = "Subsample size",
         ylab = "log(high quantile - low quantile)",
         main = paste0("Empirical convergence rate (beta = ", signif(beta, 3), ")"),
         log = "x")
    lines(b, inter - beta*log(b), col = "red")
  }
  beta
}
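
A minimal usage sketch (illustrative, not part of this commit; data and parameter choices are assumptions): estimate the rate for the one-sample 1-NN estimator, then pass it on to `kld_ci_subsampling()` as described in the `@returns` field.

```r
# one-sample problem: X ~ p, q given as a density (illustrative data)
set.seed(1)
X <- rnorm(100)                      # sample from p = N(0, 1)
q <- function(x) dnorm(x, mean = 1)  # density of q = N(1, 1)

# empirical convergence rate of the 1-NN estimator
beta <- convergence_rate(kld_est_nn, X = X, q = q)

# use the estimated rate for subsampling confidence intervals
kld_ci_subsampling(X, q = q, estimator = kld_est_nn,
                   convergence.rate = function(n) n^beta)
```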
10 changes: 5 additions & 5 deletions R/kld-uncertainty.R
@@ -106,11 +106,11 @@ kld_ci_bootstrap <- function(X, Y, estimator = kld_est_kde1, B = 500L,
#' \eqn{1 - \alpha} as long as \eqn{b_n/n\rightarrow 0},
#' \eqn{b_n\rightarrow\infty} and \eqn{\frac{\tau_{b_n}}{\tau_n}\rightarrow 0}.
#'
-#' The convergence rate of the nearest-neighbour based KL divergence estimator
-#' being \eqn{\tau_n = \sqrt{n}}, the condition on the subsample size reduces to
-#' \eqn{b_n/n\rightarrow 0} and \eqn{b_n\rightarrow\infty}. By default,
-#' \eqn{b_n = n^{2/3}}. In a two-sample problem, \eqn{n} and \eqn{b_n} are
-#' replaced by effective sample sizes \eqn{n_\text{eff} = \min(n,m)} and
+#' In many cases, the convergence rate of the nearest-neighbour based KL
+#' divergence estimator is \eqn{\tau_n = \sqrt{n}} and the condition on the
+#' subsample size reduces to \eqn{b_n/n\rightarrow 0} and \eqn{b_n\rightarrow\infty}.
+#' By default, \eqn{b_n = n^{2/3}}. In a two-sample problem, \eqn{n} and \eqn{b_n}
+#' are replaced by effective sample sizes \eqn{n_\text{eff} = \min(n,m)} and
#' \eqn{b_{n,\text{eff}} = \min(b_n,b_m)}.
#'
#' Reference:
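A small numeric illustration (not part of the commit; the sample sizes are hypothetical) of the defaults described in the documentation above:

```r
# default subsample size b_n = n^(2/3), and effective sizes in a
# two-sample problem with sample sizes n and m (illustrative values)
n <- 1000; m <- 400
bn <- function(n) floor(n^(2/3))
n_eff <- min(n, m)          # effective sample size: min(n, m)
b_eff <- min(bn(n), bn(m))  # effective subsample size: min(b_n, b_m)
c(n_eff = n_eff, b_eff = b_eff)
```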
3 changes: 2 additions & 1 deletion R/kldest-package.R
@@ -2,7 +2,8 @@
"_PACKAGE"

## usethis namespace: start
-#' @importFrom stats dnorm qnorm ecdf sd approxfun density quantile
+#' @importFrom stats dnorm qnorm ecdf sd approxfun density quantile cov var
+#' @importFrom graphics lines
#' @importFrom utils head tail
## usethis namespace: end
NULL
82 changes: 82 additions & 0 deletions man/convergence_rate.Rd

10 changes: 5 additions & 5 deletions man/kld_ci_subsampling.Rd

115 changes: 115 additions & 0 deletions scripts/empirical-convergence-rates.Rmd
@@ -0,0 +1,115 @@
---
title: "Empirical convergence rate of KL divergence estimators"
author: "Niklas Hartung"
date: "`r Sys.Date()`"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

## Topic

The subsampling method, used to generate confidence intervals for a KL divergence
estimate, requires the convergence rate of the estimator as an input.

According to the literature (Noh 2014, Singh 2016, Zhao 2020), the 1-NN estimator
has a variance of $O(1/n)$ and a dimension-dependent bias. The exact shape of the
bias depends on the smoothness assumptions for the distribution. As a default in the
subsampling method, a $\sqrt{n}$ convergence rate is assumed, which amounts to
assuming that the bias term is of no larger order than the variance term.

As an alternative to relying on such a theoretically derived convergence rate, the
rate can be determined empirically, using the method described in Politis et al.
(1999) and sketched below.
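
The idea behind the method, as implemented in `convergence_rate()`: quantile ranges
of the centred subsampling distribution $\hat D_b - \hat D_n$ shrink at rate
$b^{-\beta}$ in the subsample size $b$, so that for fixed quantile levels $u > l$,

$$\log\bigl(q_u(b) - q_l(b)\bigr) \approx \text{const} - \beta \log b,$$

and $\beta$ is estimated as minus the least-squares slope of the mean log quantile
ranges regressed on $\log b$ across several subsample sizes.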

For now, only the one-sample variant is implemented.

### Preliminaries

Load libraries

```{r}
library(kldest)
library(ggplot2)
```

For reproducibility

```{r}
set.seed(0)
```

### Simulation parameters

Global parameters

```{r}
n <- 1000L # sample size
n.sizes <- 5L # number of different subsample sizes
B <- 1000L # number of subsamples to simulate per subsample size
```
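
For orientation, a sketch of the subsample sizes implied by these settings, assuming
the geometric-mean construction used in `convergence_rate()` (with its default
`spacing.factor` of `1.5`):

```{r}
spacing.factor <- 1.5  # default spacing of convergence_rate()
bmin <- sqrt(n) / spacing.factor^((n.sizes-1)/2)
floor(bmin * spacing.factor^(0:(n.sizes-1)))
```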

Estimators to be evaluated. Currently, the bias-reduced estimator cannot be included,
since it requires two samples, and the two-sample case of `convergence_rate()` is
not implemented yet.

```{r}
estimators <- list(
  "nn1" = kld_est_nn#,
  # "brnn1" = function(...) kld_est_brnn(..., warn.max.k = FALSE)
)
nestim <- length(estimators)
```

Different distributions $p$ and $q$:

```{r}
scenarios <- list(
  "2-D, independent"  = list(mu1    = c(2,0),
                             sigma1 = diag(c(1,100)),
                             mu2    = c(0,10),
                             sigma2 = diag(c(1,100))),
  "2-D, corr. diff."  = list(mu1    = c(0,0),
                             sigma1 = constDiagMatrix(dim = 2, diag = 3, offDiag = 1),
                             mu2    = c(0,0),
                             sigma2 = constDiagMatrix(dim = 2, diag = 3, offDiag = -1)),
  "10-D, corr. diff." = list(mu1    = rep(0,10),
                             sigma1 = constDiagMatrix(dim = 10, diag = 1, offDiag = 0.999),
                             mu2    = rep(0,10),
                             sigma2 = diag(10))
)
```

Compute the true KL divergence in each scenario (for reference; it is not needed for the rate estimation)

```{r}
(kld_true <- vapply(scenarios, function(x) do.call(kld_gaussian,x), 1))
```
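
For reference, the closed-form expression evaluated by `kld_gaussian()` for
$p = \mathcal{N}(\mu_1, \Sigma_1)$ and $q = \mathcal{N}(\mu_2, \Sigma_2)$ in
dimension $d$ is the standard Gaussian KL divergence

$$D_{KL}(p \,\|\, q) = \frac{1}{2}\left(\operatorname{tr}(\Sigma_2^{-1}\Sigma_1) + (\mu_2-\mu_1)^\top \Sigma_2^{-1}(\mu_2-\mu_1) - d + \log\frac{\det \Sigma_2}{\det \Sigma_1}\right).$$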

## Simulation study: empirical convergence rate

```{r}
rates <- matrix(nrow = length(scenarios),
                ncol = nestim,
                dimnames = list(names(scenarios), names(estimators)))
for (i in seq_along(scenarios)) {
  p <- scenarios[[i]]
  X <- MASS::mvrnorm(n, mu = p$mu1, Sigma = p$sigma1)
  q <- function(x) mvdnorm(x, mu = p$mu2, Sigma = p$sigma2)
  # Y <- MASS::mvrnorm(n, mu = p$mu2, Sigma = p$sigma2) # not supported yet...
  for (j in 1:nestim) {
    rates[i,j] <- convergence_rate(estimators[[j]], X = X, q = q,
                                   n.sizes = n.sizes, B = B, plot = TRUE)
  }
}
```

$\Rightarrow$ Even in the high-dimensional example, the estimated convergence rate
is approximately as expected from the variance term.

