# Package Imports and Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import numpy as np
import scipy as sp
import statsmodels.api as sm


Bad key "text.kerning_factor" on line 4 in
/Users/michaelnowotny/anaconda3/envs/continuous_time_mcmc/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [3]:
from divergence import *

# Distributions and Samples

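This example considers two different normal distributions $p$ and $q$ with
$p = N(2, 9)$ and $q = N(1, 4)$, where the second argument denotes the variance (i.e. the standard deviations are $3$ and $2$, respectively).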

In [4]:
# seed NumPy's global random number generator so that every run reproduces the same samples
np.random.seed(42)

# mean and standard deviation of the normal distributions p and q
mu_p, sigma_p = 2, 3
mu_q, sigma_q = 1, 2

# number of random draws requested per distribution
n = 10_000

def draw_normal(mu, sigma, n: int, antithetic: bool = False):
    """
    Draw samples from a normal distribution with mean `mu` and standard deviation `sigma`.

    The standard normal shocks come from NumPy's global random state
    (`np.random.randn`), so results depend on any prior `np.random.seed` call.

    Parameters
    ----------
    mu : location added to the scaled shocks
    sigma : scale multiplying the shocks
    n : number of standard normal shocks to draw
    antithetic : if True, the negated shocks are appended to the drawn ones,
        so the result holds 2 * n values instead of n

    Returns
    -------
    Array of `mu + sigma * shock` values.
    """
    shocks = np.random.randn(n)

    if not antithetic:
        return mu + sigma * shocks

    # antithetic variates: pair every shock with its mirror image around zero
    mirrored_shocks = np.concatenate([shocks, -shocks])
    return mu + sigma * mirrored_shocks

# simulate from p first and then from q (the order matters for the random stream);
# with antithetic=True each call returns the n draws plus their mirror images
samples_p = draw_normal(mu_p, sigma_p, n=n, antithetic=True)
samples_q = draw_normal(mu_q, sigma_q, n=n, antithetic=True)

# non-parametric (kernel) density estimate of p, fitted with the statsmodels defaults
kde_p = sm.nonparametric.KDEUnivariate(samples_p)
kde_p.fit()

# non-parametric (kernel) density estimate of q, fitted with the statsmodels defaults
kde_q = sm.nonparametric.KDEUnivariate(samples_q)
kde_q.fit()


def pdf_p(x):
    """Exact normal density of p evaluated at x."""
    return sp.stats.norm.pdf(x, mu_p, sigma_p)


def pdf_q(x):
    """Exact normal density of q evaluated at x."""
    return sp.stats.norm.pdf(x, mu_q, sigma_q)


# end points of the grids on which the kernel density estimates were evaluated
p_min, p_max = min(kde_p.support), max(kde_p.support)
q_min, q_max = min(kde_q.support), max(kde_q.support)

# smallest interval that contains the support of both estimates
combined_min, combined_max = min(p_min, q_min), max(p_max, q_max)

# Entropy

The entropy of a probability distribution $p$ is defined as 

$H(p) = - \mathbb{E}_p \left[ \log_{\text{base}} p \right]$, 

where $\mathbb{E}_p$ denotes expectation with respect to the probability distribution $p$. In information theory, the base of the logarithm is typically 2, and entropy is then interpreted as the average number of bits needed to optimally encode the signal represented by the distribution $p$. 

The `divergence` package defaults to $\text{base}=e$, which results in the natural logarithm, i.e. $\log_e = \ln$. This default choice can be overridden by passing a different logarithmic function in the argument 'log_fun' during the entropy calculation. In particular, specifying $\text{base}=2$ by setting 'log_fun=np.log2' results in the classical Shannon entropy expressed in bits, whereas specifying $\text{base}=10$ by setting 'log_fun=np.log10' produces the entropy in decimal digits (dits or hartleys).

## Entropy from Statsmodels KDE Objects (via Statsmodels)

In [5]:
print(f'Entropy of p = {kde_p.entropy}')
print(f'Entropy of q = {kde_q.entropy}')

Entropy of p = 2.531114322639585
Entropy of q = 2.1233454054445


## Entropy from Statsmodels KDE Objects (via Divergence)

In [6]:
print(f'Entropy of p = {compute_entropy_from_kde(kde_p)}')
print(f'Entropy of q = {compute_entropy_from_kde(kde_q)}')

Entropy of p = 2.5311099866509332
Entropy of q = 2.1233433783535673


## Entropy from Normal Probability Density Functions

In [7]:
print(f'Entropy of p = {compute_entropy_from_density_with_support(pdf_p, p_min, p_max)}')
print(f'Entropy of q = {compute_entropy_from_density_with_support(pdf_q, q_min, q_max)}')

Entropy of p = 2.5173904231265363
Entropy of q = 2.1120728496363306


## Theoretical Entropy of a Normal Distribution

In [8]:
def theoretical_entropy_of_normal_distribution(mu: float, sigma: float, log_fun: tp.Callable = np.log) -> float:
    """
    Closed-form differential entropy of a normal distribution.

    The entropy equals 0.5 * log(2 * pi * e * sigma**2), with the logarithm
    taken in the base implemented by `log_fun`.

    Parameters
    ----------
    mu : mean of the distribution; accepted for a uniform signature but unused,
        because the entropy of a normal distribution does not depend on its mean
    sigma : standard deviation of the distribution
    log_fun : logarithm defining the unit of the result
        (np.log -> nats, np.log2 -> bits, np.log10 -> hartleys)

    Returns
    -------
    The entropy expressed in the unit implied by `log_fun`.
    """
    # log(2*pi*e*sigma^2) = log(e) + log(2*pi*sigma^2); log_fun(e) is 1 only for the
    # natural logarithm, so it must not be hard-coded as 1.0 for other bases
    return 0.5 * (log_fun(np.e) + log_fun(2 * np.pi * sigma**2))

print(f'Entropy of p = {theoretical_entropy_of_normal_distribution(mu_p, sigma_p)}')
print(f'Entropy of q = {theoretical_entropy_of_normal_distribution(mu_q, sigma_q)}')

Entropy of p = 2.5175508218727822
Entropy of q = 2.112085713764618


# Cross Entropy

The cross entropy of a distribution $q$ relative to a distribution $p$ is defined as  

$H(p, q) = - \mathbb{E}_p \left[ \log_{\text{base}} q \right]$.

With a base of 2, the cross-entropy of $q$ relative to $p$ is the average number of bits required to encode the signal in $p$ using a code optimized for the signal in $q$.

## Cross Entropy from Statsmodels KDE Objects

In [9]:
print(f'Cross Entropy of p relative to q = {compute_cross_entropy_from_kde(kde_p, kde_q)}')
print(f'Cross Entropy of q relative to p = {compute_cross_entropy_from_kde(kde_q, kde_p)}')

Cross Entropy of p relative to q = 2.9007913519550996
Cross Entropy of q relative to p = 2.3060943540378385


## Cross Entropy from Normal Probability Density Functions

In [10]:
print(f'Cross Entropy of p relative to q = {compute_cross_entropy_from_densities_with_support(pdf_p, pdf_q, combined_min, combined_max)}')
print(f'Cross Entropy of q relative to p = {compute_cross_entropy_from_densities_with_support(pdf_q, pdf_p, combined_min, combined_max)}')

Cross Entropy of p relative to q = 2.861760799072692
Cross Entropy of q relative to p = 2.295328590629144


# Relative Entropy (Kullback-Leibler Divergence)

Relative entropy or Kullback-Leibler divergence measures the dissimilarity between two probability distributions $p$ and $q$. It is defined as the difference between the cross entropy of $q$ relative to $p$ and the entropy of $p$

$D_{KL} (p||q) = \mathbb{E}_p \left[ \log_{\text{base}} \left( \frac{p}{q} \right) \right] = H(p, q) - H(p)$.

With a base of 2, it can be interpreted as the average number of additional bits required to encode the signal in $p$ using a code optimized for the signal in $q$ over and above the number of bits required by the optimal code for $p$.

## Relative Entropy from Statsmodels KDE Objects

In [11]:
print(f'Relative Entropy of p relative to q = {compute_relative_entropy_from_kde(kde_p, kde_q)}')
print(f'Relative Entropy of q relative to p = {compute_relative_entropy_from_kde(kde_q, kde_p)}')

Relative Entropy of p relative to q = 0.36968136530318224
Relative Entropy of q relative to p = 0.18274894856540888


## Relative Entropy from Normal Probability Density Functions

In [12]:
print(f'Relative Entropy from p to q = {compute_relative_entropy_from_densities_with_support(pdf_p, pdf_q, combined_min, combined_max)}')
print(f'Relative Entropy from q to p = {compute_relative_entropy_from_densities_with_support(pdf_q, pdf_p, combined_min, combined_max)}')

Relative Entropy from p to q = 0.3443703759461555
Relative Entropy from q to p = 0.18324289254428677


## Theoretical Relative Entropy for Normal Distributions

In [13]:
def relative_entropy_between_normal_distributions(mu_1, sigma_1, mu_2, sigma_2, log_fun: tp.Callable = np.log):
    """
    Closed-form relative entropy (Kullback-Leibler divergence) between two normal distributions.

    Computes the divergence of the normal distribution with mean `mu_1` and
    standard deviation `sigma_1` relative to the normal distribution with mean
    `mu_2` and standard deviation `sigma_2`.

    Parameters
    ----------
    mu_1, sigma_1 : mean and standard deviation of the first distribution
    mu_2, sigma_2 : mean and standard deviation of the second (reference) distribution
    log_fun : logarithm defining the unit of the result
        (np.log -> nats, np.log2 -> bits, np.log10 -> hartleys)

    Returns
    -------
    The relative entropy expressed in the unit implied by `log_fun`.
    """
    # this term is the natural-log (nats) part of the closed form
    quadratic_term = ((mu_1 - mu_2)**2 + sigma_1**2 - sigma_2**2) / (2 * sigma_2**2)

    # convert the nats term to the requested base via log_fun(e) (equal to 1 for np.log)
    # before adding the log-ratio, which log_fun already expresses in that base
    return log_fun(np.e) * quadratic_term + log_fun(sigma_2 / sigma_1)

print(f'Relative Entropy from p to q = {relative_entropy_between_normal_distributions(mu_p, sigma_p, mu_q, sigma_q)}')
print(f'Relative Entropy from q to p = {relative_entropy_between_normal_distributions(mu_q, sigma_q, mu_p, sigma_p)}')

Relative Entropy from p to q = 0.34453489189183556
Relative Entropy from q to p = 0.18324288588594217


# Jensen-Shannon Divergence

The Jensen-Shannon divergence, a symmetric measure of the divergence of probability distributions, is defined as

$JSD(p||q) = \frac{1}{2} D_{KL} (p||m) + \frac{1}{2} D_{KL} (q||m)$, 

where $m = \frac{1}{2} \left( p + q \right)$.

For base 2, the JSD is bounded between 0 and 1. For base $e$, it is bounded between $0$ and $\ln(2)$.

## Jensen-Shannon Divergence from Statsmodels KDE Objects

In [14]:
print(f'Jensen-Shannon Divergence between p and q = {compute_jensen_shannon_divergence_from_kde(kde_p, kde_q)}')
print(f'Jensen-Shannon Divergence between q and p = {compute_jensen_shannon_divergence_from_kde(kde_q, kde_p)}')

Jensen-Shannon Divergence between p and q = 0.0525506348426028
Jensen-Shannon Divergence between q and p = 0.0525506348426028


## Jensen-Shannon Divergence from Normal Probability Density Functions

In [15]:
print(f'Jensen-Shannon Divergence between p and q = {compute_jensen_shannon_divergence_from_densities_with_support(pdf_p, pdf_q, combined_min, combined_max)}')
print(f'Jensen-Shannon Divergence between q and p = {compute_jensen_shannon_divergence_from_densities_with_support(pdf_q, pdf_p, combined_min, combined_max)}')

Jensen-Shannon Divergence between p and q = 0.05290044224944204
Jensen-Shannon Divergence between q and p = 0.05290044224944204


## Jensen-Shannon Divergence from Statsmodels KDE Objects in Bits

In [16]:
print(f'Jensen-Shannon Divergence between p and q = {compute_jensen_shannon_divergence_from_kde(kde_p, kde_q, log_fun=np.log2)}')
print(f'Jensen-Shannon Divergence between q and p = {compute_jensen_shannon_divergence_from_kde(kde_q, kde_p, log_fun=np.log2)}')

Jensen-Shannon Divergence between p and q = 0.07581454028298981
Jensen-Shannon Divergence between q and p = 0.07581454028298981
