# Package Imports and Setup

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import collections
import math
import typing as tp

import numpy as np
import scipy as sp
import statsmodels.api as sm


Bad key "text.kerning_factor" on line 4 in
/Users/michaelnowotny/anaconda3/envs/continuous_time_mcmc/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [3]:
from divergence import *

# Distributions and Samples

## Construct Artificial Sample from two Normal Distributions

This example considers two different normal distributions $p$ and $q$ with
$p = N(2, 9)$ and $q = N(1, 4)$.

In [4]:
# fix random seed for reproducibility
# (seeds NumPy's global generator, which np.random.randn draws from below)
np.random.seed(42)

# set parameters of the normal distributions p and q
# (sigma_* are standard deviations, i.e. p = N(2, 3**2) and q = N(1, 2**2))
mu_p = 2
sigma_p = 3
mu_q = 1
sigma_q = 2

# number of standard normal draws per distribution; with antithetic sampling
# each resulting sample contains 2 * n points
n = 10000

def draw_normal(mu, sigma, n: int, antithetic: bool = False):
    """Draw a sample from a normal distribution with mean mu and std. dev. sigma.

    The standard normal shocks come from NumPy's global random generator
    (np.random.randn), so results depend on the current global seed/state.

    Args:
        mu: mean (location) of the distribution.
        sigma: standard deviation (scale) of the distribution.
        n: number of standard normal shocks to draw.
        antithetic: if True, the negated shocks are appended to the drawn
            shocks, so the returned sample has 2 * n elements.

    Returns:
        1-D array of length n (or 2 * n when antithetic is True).
    """
    shocks = np.random.randn(n)
    if antithetic:
        # mirror every shock around zero: first half +z, second half -z
        shocks = np.concatenate((shocks, -shocks))
    return mu + sigma * shocks

# draw antithetic samples (2 * n points each) from p and q;
# p is drawn first, so the order of these two calls determines which random
# numbers from the global generator each sample receives
sample_p = draw_normal(mu_p, sigma_p, n=n, antithetic=True)
sample_q = draw_normal(mu_q, sigma_q, n=n, antithetic=True)

# fit a non-parametric (kernel) density estimate for both distributions
kde_p = sm.nonparametric.KDEUnivariate(sample_p)
kde_q = sm.nonparametric.KDEUnivariate(sample_q)
kde_p.fit()
kde_q.fit()

# construct exact normal densities for p and q
# (the lambdas read mu_* / sigma_* from the notebook namespace at call time)
pdf_p = lambda x: sp.stats.norm.pdf(x, mu_p, sigma_p)
pdf_q = lambda x: sp.stats.norm.pdf(x, mu_q, sigma_q)

# compute support for kernel density estimates:
# per-distribution bounds, plus a combined interval covering both supports
p_min = min(kde_p.support)
p_max = max(kde_p.support)
q_min = min(kde_q.support)
q_max = max(kde_q.support)
combined_min = min(p_min, q_min)
combined_max = max(p_max, q_max)

## Construct Sample from Multinomial Distribution

In [5]:
multinomial_sample_q = np.array([1, 2, 3, 2, 3, 3, 3, 2, 1, 1])
multinomial_sample_p = np.array([1, 2, 3, 3, 3, 3, 3, 3, 3, 3])

# Entropy

The entropy of a probability distribution $p$ is defined as 

$H(X) = - \mathbb{E}_p \left[ \log_{\text{base}} p \right]$, 

where $\mathbb{E}_p$ denotes expectation with respect to the probability distribution $p$. In information theory, the base of the logarithm is conventionally 2, and the entropy is then interpreted as the average number of bits needed to optimally encode the signal represented by the distribution $p$. 

Divergence defaults to $\text{base}=e$, which results in the natural logarithm, i.e. $\log_e = \ln$. This default choice can be overridden via the argument 'base' during the entropy calculation. In particular, specifying $\text{base}=2$ results in the classical Shannon entropy expressed in bits, whereas specifying $\text{base}=10$ produces the entropy in decimal digits (dits, also called hartleys).

## Continuous Case

### Entropy from Samples (via Statsmodels KDE Objects)

In [6]:
print(f'Entropy of p = {entropy_from_samples(sample_p, discrete=False)}')
print(f'Entropy of q = {entropy_from_samples(sample_q, discrete=False)}')

Entropy of p = 2.5311099866415234
Entropy of q = 2.123343378346464


### Entropy from Statsmodels KDE Objects (via Statsmodels)

In [7]:
print(f'Entropy of p = {kde_p.entropy}')
print(f'Entropy of q = {kde_q.entropy}')

Entropy of p = 2.531114322639585
Entropy of q = 2.1233454054445


### Entropy from Statsmodels KDE Objects (via Divergence)

In [8]:
print(f'Entropy of p = {entropy_from_kde(kde_p)}')
print(f'Entropy of q = {entropy_from_kde(kde_q)}')

Entropy of p = 2.5311099866415234
Entropy of q = 2.123343378346464


### Entropy from Normal Probability Density Functions

In [9]:
print(f'Entropy of p = {entropy_from_density_with_support(pdf_p, p_min, p_max)}')
print(f'Entropy of q = {entropy_from_density_with_support(pdf_q, q_min, q_max)}')

Entropy of p = 2.517390416962563
Entropy of q = 2.1120728461033007


### Theoretical Entropy of a Normal Distribution

In [10]:
def theoretical_entropy_of_normal_distribution(mu: float, sigma: float, log_fun: tp.Callable = np.log) -> float:
    """Closed-form differential entropy of a normal distribution N(mu, sigma**2).

    Computes 0.5 * log(2 * pi * e * sigma**2), expressed in the units implied
    by log_fun (nats for np.log, bits for np.log2, ...).

    Args:
        mu: mean of the distribution; accepted for a uniform signature but it
            does not influence the entropy.
        sigma: standard deviation of the distribution.
        log_fun: logarithm function that fixes the base of the result.

    Returns:
        The differential entropy in the base of log_fun.
    """
    # log_fun(e) equals 1 only for the natural logarithm; using it instead of a
    # hard-coded 1.0 keeps the formula correct for any logarithm base.
    return 0.5 * (log_fun(np.e) + log_fun(2 * np.pi * sigma**2))

print(f'Entropy of p = {theoretical_entropy_of_normal_distribution(mu_p, sigma_p)}')
print(f'Entropy of q = {theoretical_entropy_of_normal_distribution(mu_q, sigma_q)}')

Entropy of p = 2.5175508218727822
Entropy of q = 2.112085713764618


## Discrete Case

In [11]:
print(f'Entropy of p = {discrete_entropy(multinomial_sample_p)}')
print(f'Entropy of q = {discrete_entropy(multinomial_sample_q)}')

Entropy of p = 0.639031859650177
Entropy of q = 1.0888999753452238


# Cross Entropy

The cross entropy of a distribution $q$ relative to a distribution $p$ is defined as  

$H_q(p) = - \mathbb{E}_p \left[ \log_{\text{base}} q \right]$.

With a base of 2, the cross-entropy of $q$ relative to $p$ is the average number of bits required to encode the signal in $p$ using a code optimized for the signal in $q$.

## Continuous Case

### Cross Entropy from Samples (via Statsmodels KDE Objects)

In [12]:
print(f'Cross Entropy of p relative to q = {cross_entropy_from_samples(sample_p, sample_q, discrete=False)}')
print(f'Cross Entropy of q relative to p = {cross_entropy_from_samples(sample_q, sample_p, discrete=False)}')

Cross Entropy of p relative to q = 2.9007913519550996
Cross Entropy of q relative to p = 2.3060943540378385


### Cross Entropy from Statsmodels KDE Objects

In [13]:
%timeit cross_entropy_from_kde(kde_p, kde_q), cross_entropy_from_kde(kde_q, kde_p)

580 ms ± 8.02 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
print(f'Cross Entropy of p relative to q = {cross_entropy_from_kde(kde_p, kde_q)}')
print(f'Cross Entropy of q relative to p = {cross_entropy_from_kde(kde_q, kde_p)}')

Cross Entropy of p relative to q = 2.9007913519550996
Cross Entropy of q relative to p = 2.3060943540378385


### Cross Entropy from Normal Probability Density Functions

In [15]:
print(f'Cross Entropy of p relative to q = {cross_entropy_from_densities_with_support(pdf_p, pdf_q, combined_min, combined_max)}')
print(f'Cross Entropy of q relative to p = {cross_entropy_from_densities_with_support(pdf_q, pdf_p, combined_min, combined_max)}')

Cross Entropy of p relative to q = 2.861760799072692
Cross Entropy of q relative to p = 2.295328590629144


## Discrete Case

In [16]:
print(f'Cross Entropy of p relative to q = {discrete_cross_entropy(multinomial_sample_p, multinomial_sample_q)}')
print(f'Cross Entropy of q relative to p = {discrete_cross_entropy(multinomial_sample_q, multinomial_sample_p)}')

Cross Entropy of p relative to q = 0.9738271463645112
Cross Entropy of q relative to p = 1.4708084763221114


# Relative Entropy (Kullback-Leibler Divergence)

Relative entropy or Kullback-Leibler divergence measures how much a probability distribution $p$ differs from a reference distribution $q$. It is defined as the difference between the cross entropy of $q$ relative to $p$ and the entropy of $p$:

$D_{KL} (p||q) = \mathbb{E}_p \left[ \log_{\text{base}} \left( \frac{p}{q} \right) \right] = H_q(p) - H(p)$.

With a base of 2, it can be interpreted as the average number of additional bits required to encode the signal in $p$ using a code optimized for the signal in $q$ over and above the number of bits required by the optimal code for $p$.

## Continuous Case

### Relative Entropy from Samples (via Statsmodels KDE Objects)

In [17]:
print(f'Relative Entropy of p relative to q = {relative_entropy_from_samples(sample_p, sample_q, discrete=False)}')
print(f'Relative Entropy of q relative to p = {relative_entropy_from_samples(sample_q, sample_p, discrete=False)}')

Relative Entropy of p relative to q = 0.36968136530318224
Relative Entropy of q relative to p = 0.18274894856540888


### Relative Entropy from Statsmodels KDE Objects

In [18]:
print(f'Relative Entropy of p relative to q = {relative_entropy_from_kde(kde_p, kde_q)}')
print(f'Relative Entropy of q relative to p = {relative_entropy_from_kde(kde_q, kde_p)}')

Relative Entropy of p relative to q = 0.36968136530318224
Relative Entropy of q relative to p = 0.18274894856540888


### Relative Entropy from Normal Probability Density Functions

In [19]:
print(f'Relative Entropy from p to q = {relative_entropy_from_densities_with_support(pdf_p, pdf_q, combined_min, combined_max)}')
print(f'Relative Entropy from q to p = {relative_entropy_from_densities_with_support(pdf_q, pdf_p, combined_min, combined_max)}')

Relative Entropy from p to q = 0.3443703759461555
Relative Entropy from q to p = 0.18324289254428677


### Theoretical Relative Entropy for Normal Distributions

In [20]:
def relative_entropy_between_normal_distributions(mu_1, sigma_1, mu_2, sigma_2, log_fun: tp.Callable = np.log):
    """Closed-form KL divergence D_KL(N(mu_1, sigma_1**2) || N(mu_2, sigma_2**2)).

    Args:
        mu_1, sigma_1: mean and standard deviation of the first distribution.
        mu_2, sigma_2: mean and standard deviation of the second (reference)
            distribution.
        log_fun: logarithm function that fixes the base of the result
            (np.log for nats, np.log2 for bits, ...).

    Returns:
        The relative entropy in the base of log_fun.
    """
    # This term is derived with the natural logarithm (units of nats) ...
    quadratic_term = ((mu_1 - mu_2)**2 + sigma_1**2 - sigma_2**2) / (2 * sigma_2**2)
    # ... so it is rescaled by log_fun(e) to match the base of the log-ratio
    # term; for np.log the factor is 1 and the default result is unchanged.
    return quadratic_term * log_fun(np.e) + log_fun(sigma_2 / sigma_1)

print(f'Relative Entropy from p to q = {relative_entropy_between_normal_distributions(mu_p, sigma_p, mu_q, sigma_q)}')
print(f'Relative Entropy from q to p = {relative_entropy_between_normal_distributions(mu_q, sigma_q, mu_p, sigma_p)}')

Relative Entropy from p to q = 0.34453489189183556
Relative Entropy from q to p = 0.18324288588594217


## Discrete Case

In [21]:
print(f'Relative Entropy of p relative to q = {discrete_relative_entropy(multinomial_sample_p, multinomial_sample_q)}')
print(f'Relative Entropy of q relative to p = {discrete_relative_entropy(multinomial_sample_q, multinomial_sample_p)}')

Relative Entropy of p relative to q = 0.3347952867143343
Relative Entropy of q relative to p = 0.3819085009768876


# Jensen-Shannon Divergence

The Jensen-Shannon divergence, a symmetric measure of the divergence of probability distributions, is defined as

$JSD(p||q) = \frac{1}{2} D_{KL} (p||m) + \frac{1}{2} D_{KL} (q||m)$, 

where $m = \frac{1}{2} \left( p + q \right)$.

For base 2, the JSD is bounded between 0 and 1. For base $e$, it is bounded between $0$ and $\ln(2)$.

## Continuous Case

### Jensen-Shannon Divergence from Samples (via Statsmodels KDE Objects)

In [22]:
print(f'Jensen-Shannon Divergence between p and q = {jensen_shannon_divergence_from_samples(sample_p, sample_q, discrete=False)}')
print(f'Jensen-Shannon Divergence between q and p = {jensen_shannon_divergence_from_samples(sample_q, sample_p, discrete=False)}')

Jensen-Shannon Divergence between p and q = 0.0525506348426028
Jensen-Shannon Divergence between q and p = 0.0525506348426028


### Jensen-Shannon Divergence from Statsmodels KDE Objects

In [23]:
print(f'Jensen-Shannon Divergence between p and q = {jensen_shannon_divergence_from_kde(kde_p, kde_q)}')
print(f'Jensen-Shannon Divergence between q and p = {jensen_shannon_divergence_from_kde(kde_q, kde_p)}')

Jensen-Shannon Divergence between p and q = 0.0525506348426028
Jensen-Shannon Divergence between q and p = 0.0525506348426028


### Jensen-Shannon Divergence from Normal Probability Density Functions

In [24]:
print(f'Jensen-Shannon Divergence between p and q = {jensen_shannon_divergence_from_densities_with_support(pdf_p, pdf_q, combined_min, combined_max)}')
print(f'Jensen-Shannon Divergence between q and p = {jensen_shannon_divergence_from_densities_with_support(pdf_q, pdf_p, combined_min, combined_max)}')

Jensen-Shannon Divergence between p and q = 0.05290044224944204
Jensen-Shannon Divergence between q and p = 0.05290044224944204


### Jensen-Shannon Divergence from Statsmodels KDE Objects in Bits

In [25]:
print(f'Jensen-Shannon Divergence between p and q = {jensen_shannon_divergence_from_kde(kde_p, kde_q, base=2.0)}')
print(f'Jensen-Shannon Divergence between q and p = {jensen_shannon_divergence_from_kde(kde_q, kde_p, base=2.0)}')

Jensen-Shannon Divergence between p and q = 0.07581454028298981
Jensen-Shannon Divergence between q and p = 0.07581454028298981


## Discrete Case

### Calculation Function Specific to Discrete Distributions

In [26]:
print(f'Jensen-Shannon Divergence between p and q = {discrete_jensen_shannon_divergence(multinomial_sample_p, multinomial_sample_q)}')
print(f'Jensen-Shannon Divergence between q and p = {discrete_jensen_shannon_divergence(multinomial_sample_q, multinomial_sample_p)}')

Jensen-Shannon Divergence between p and q = 0.0863046217355343
Jensen-Shannon Divergence between q and p = 0.0863046217355343


### Generic calculation functionality covering samples from continuous as well as discrete distributions

In [27]:
print(f'Jensen-Shannon Divergence between p and q = {jensen_shannon_divergence_from_samples(multinomial_sample_p, multinomial_sample_q, discrete=True)}')
print(f'Jensen-Shannon Divergence between q and p = {jensen_shannon_divergence_from_samples(multinomial_sample_q, multinomial_sample_p, discrete=True)}')

Jensen-Shannon Divergence between p and q = 0.0863046217355343
Jensen-Shannon Divergence between q and p = 0.0863046217355343


# Mutual Information

Mutual information is a measure of the mutual dependence of random variables that goes beyond the linear dependence measured by correlation. It is defined as the KL-divergence between the joint density of two random variables $x$ and $y$ and the product of their marginal densities, i.e.  

$I(X; Y) = D_{KL}(p_{x, y}|| p_x \otimes p_y) = E_{p_{x, y}} \left[ \log_{\text{base}} \left( \frac{p_{x, y} (x, y)}{p_x(x) p_y(y)} \right) \right]$.  



## Continuous Case

### Construct Artificial Data from a Bi-Variate Normal Distribution

In [28]:
# set parameters of the normal distributions x and y
# (sigma_* are standard deviations; rho is the correlation between x and y)
mu_x = 2
sigma_x = 3
mu_y = 1
sigma_y = 2
rho = 0.5

# draw n = 10000 correlated samples: x and y share the standard normal shock z,
# and y mixes z with an independent shock so that corr(x, y) = rho.
# No re-seeding happens here; the draws continue from NumPy's global generator.
n = 10000
z = np.random.randn(n)
sample_x = mu_x + sigma_x * z
sample_y = mu_y + sigma_y * (rho * z + np.sqrt(1.0 - rho**2) * np.random.randn(n))

# fit non-parametric density estimates: univariate KDEs for the marginals
# (statsmodels) and a bivariate Gaussian KDE for the joint density (scipy)
kde_x = sm.nonparametric.KDEUnivariate(sample_x)
kde_y = sm.nonparametric.KDEUnivariate(sample_y)
kde_x.fit() # estimate the density of x
kde_y.fit() # estimate the density of y
kde_xy = sp.stats.gaussian_kde([sample_x, sample_y])

# construct exact normal densities for x and y and their exact joint density
pdf_x = lambda x: sp.stats.norm.pdf(x, mu_x, sigma_x)
pdf_y = lambda y: sp.stats.norm.pdf(y, mu_y, sigma_y)
pdf_xy = sp.stats.multivariate_normal(mean=[mu_x, mu_y], cov=[[sigma_x**2, rho * sigma_x * sigma_y], [rho * sigma_x * sigma_y, sigma_y**2]]).pdf

# compute support for kernel density estimates
x_min = min(kde_x.support)
x_max = max(kde_x.support)
y_min = min(kde_y.support)
y_max = max(kde_y.support)

### Mutual Information from Samples (via Statsmodels KDE Objects)

In [29]:
print(f'Mutual Information of x and y = {mutual_information_from_samples(sample_x, sample_y)}')

Mutual Information of x and y = 0.13960557811247337


### Mutual Information from Statsmodels KDE Objects

In [30]:
print(f'Mutual Information of x and y = {mutual_information_from_kde(kde_x, kde_y, kde_xy)}')

Mutual Information of x and y = 0.13960557811247337


### Mutual Information from Normal Probability Density Functions

In [31]:
print(f'Mutual Information of x and y = {mutual_information_from_densities_with_support(pdf_x, pdf_y, pdf_xy, x_min=-20, x_max=20, y_min=-20, y_max=20)}')

Mutual Information of x and y = 0.1438410315263198


### Theoretical Mutual Information of Bi-Variate Normal Distributions

In [32]:
def mutual_information_for_bivariate_normal_distribution(rho: float, 
                                                         log_fun: tp.Callable = np.log) -> float:
    """Closed-form mutual information of a bivariate normal distribution.

    Computes -0.5 * log(1 - rho**2), which depends only on the correlation.

    Args:
        rho: correlation coefficient between the two components.
        log_fun: logarithm function that fixes the base of the result
            (np.log for nats, np.log2 for bits, ...).

    Returns:
        The mutual information in the base of log_fun.
    """
    # use the caller-supplied logarithm so the log_fun argument takes effect
    return - 0.5 * log_fun(1.0 - rho**2)

print(f'Mutual Information of x and y = {mutual_information_for_bivariate_normal_distribution(rho)}')

Mutual Information of x and y = 0.14384103622589045


## Discrete Case

Construct two discrete samples

In [33]:
discrete_sample_x = np.array([1, 1, 3, 1, 2, 3])
discrete_sample_y = np.array([1, 1, 1, 3, 2, 1])

In [34]:
print(f'The mutual information of x and y is {discrete_mutual_information(discrete_sample_x, discrete_sample_y, base=np.e)}')

The mutual information of x and y is 0.5493061443340548


Mutual Information is symmetric

In [35]:
print(f'The mutual information of y and x is {discrete_mutual_information(discrete_sample_y, discrete_sample_x, base=np.e)}')

The mutual information of y and x is 0.5493061443340548


# Joint Entropy

The joint entropy of the random variables x and y with joint density $p_{x, y}$ is defined as  

$H(X, Y) = - E_{p_{x, y}} \left[ \log_{\text{base}} p_{x, y} (x, y) \right]$.

Joint entropy is symmetric, i.e.  

$H(X, Y) = H(Y, X)$.

## Continuous Case

In [36]:
joint_entropy_of_x_and_y = joint_entropy_from_samples(sample_x, sample_y)
print(f'Joint entropy of x and y = {joint_entropy_of_x_and_y}')

Joint entropy of x and y = 4.475745990655821


## Discrete Case

In [37]:
print(f'The joint entropy of x and y is {discrete_joint_entropy(discrete_sample_x, discrete_sample_y, base=np.e)}')

The joint entropy of x and y is 1.3296613488547582


# Conditional Entropy

The conditional entropy of the random variable y given x with joint density $p_{x, y}$ and marginal density $p_x$ of $x$ is defined as  

$H(Y|X) = - E_{p_{x, y}} \left[ \log_{\text{base}} \frac{p_{x, y} (x, y)}{p_x(x)} \right]$.  

From this definition follows the chain rule for conditional entropy


$H(X, Y) = H(X) + H(Y|X)$.

Switching the roles of $x$ and $y$ and using the symmetry of joint entropy, we obtain  

$H(X, Y) = H(Y) + H(X|Y)$.

Subtracting the second equation for joint entropy from the first and rearranging yields  

$H(Y) - H(Y|X) = H(X) - H(X|Y)$.

## Continuous Case

In [38]:
conditional_entropy_of_y_given_x = conditional_entropy_from_samples(sample_x, sample_y)
print(f'Conditional entropy of y given x = {conditional_entropy_of_y_given_x}')

Conditional entropy of y given x = 1.9814143973881457


In [39]:
conditional_entropy_of_x_given_y = conditional_entropy_from_samples(sample_y, sample_x)
print(f'Conditional entropy of x given y = {conditional_entropy_of_x_given_y}')

Conditional entropy of x given y = 2.3593845496162107


Check whether the chain rule of conditional entropy is satisfied

In [40]:
np.isclose(entropy_from_samples(sample_x) + conditional_entropy_of_y_given_x, joint_entropy_of_x_and_y, rtol=1e-2, atol=1e-3)

True

In [41]:
np.isclose(entropy_from_samples(sample_y) + conditional_entropy_of_x_given_y, joint_entropy_of_x_and_y, rtol=1e-2, atol=1e-3)

True

## Discrete Case

In [42]:
print(f'The conditional entropy of y given x is {discrete_conditional_entropy_of_y_given_x(discrete_sample_x, discrete_sample_y, base=np.e)}')

The conditional entropy of y given x is 0.31825708414740644


We can verify the chain rule for conditional entropy:

In [43]:
np.isclose(discrete_entropy(discrete_sample_y) + discrete_conditional_entropy_of_y_given_x(discrete_sample_y, discrete_sample_x), discrete_joint_entropy(discrete_sample_x, discrete_sample_y))

True

In [44]:
np.isclose(discrete_entropy(discrete_sample_x) + discrete_conditional_entropy_of_y_given_x(discrete_sample_x, discrete_sample_y), discrete_joint_entropy(discrete_sample_y, discrete_sample_x))

True