In [None]:
%matplotlib inline
import math
import numpy as np
import matplotlib.pyplot as plt

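The Bhattacharyya distance is a measure of similarity between two probability distributions. It is based on the Bhattacharyya coefficient, which measures the amount of overlap between two samples or populations. For densities $p$ and $q$, the coefficient is $BC(p, q) = \int \sqrt{p(x)\,q(x)}\,dx$ and the distance is $D_B(p, q) = -\ln BC(p, q)$.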

In [None]:
# Seed NumPy's global random generator so the samples drawn with np.random are reproducible.
np.random.seed(seed=0)

In [None]:
# Distribution 1: mean vector, covariance matrix and 100 random draws.
mean_1 = np.array([2, 4])
cov_1 = np.array([
    [2, 2],
    [2, 5],
])
sample_1 = np.random.multivariate_normal(mean=mean_1, cov=cov_1, size=100)

# Distribution 2: negatively correlated components.
mean_2 = np.array([4, 5])
cov_2 = np.array([
    [1, -1],
    [-1, 4],
])
sample_2 = np.random.multivariate_normal(mean=mean_2, cov=cov_2, size=100)

# Distribution 3: strongly positively correlated components.
mean_3 = np.array([1, 9])
cov_3 = np.array([
    [1, 1.3],
    [1.3, 2],
])
sample_3 = np.random.multivariate_normal(mean=mean_3, cov=cov_3, size=100)

In [None]:
# Scatter the three samples on shared axes so their overlap can be compared visually.
fig, ax = plt.subplots()
ax.scatter(sample_1[:, 0], sample_1[:, 1], color='b', alpha=0.2, label='sample 1')
ax.scatter(sample_2[:, 0], sample_2[:, 1], color='r', alpha=0.2, label='sample 2')
ax.scatter(sample_3[:, 0], sample_3[:, 1], color='k', alpha=0.2, label='sample 3')
# Label the figure so it can be read on its own: without a legend the colours are anonymous.
ax.set_xlabel('first component')
ax.set_ylabel('second component')
ax.set_title('Samples drawn from the three bivariate normal distributions')
ax.legend()
ax.grid(True)

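In the case of normally distributed multivariate data, as in our example, the distance has a closed form that depends only on the mean vectors $\mu_1, \mu_2$ and the covariance matrices $\Sigma_1, \Sigma_2$:

$$D_B = \frac{1}{8}(\mu_1 - \mu_2)^T \Sigma^{-1} (\mu_1 - \mu_2) + \frac{1}{2}\ln\left(\frac{\det \Sigma}{\sqrt{\det \Sigma_1 \, \det \Sigma_2}}\right), \qquad \Sigma = \frac{\Sigma_1 + \Sigma_2}{2}$$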

In [None]:
def bhattacharyya_normal(mean_1, mean_2, cov_1, cov_2):
    cov = (cov_1 + cov_2) / 2
    dif = mean_1 - mean_2
    return 1/8 * np.dot(np.dot(dif.T, np.linalg.inv(cov)), dif) + \
           0.5 * math.log(np.linalg.det(cov) / math.sqrt(np.linalg.det(cov_1) * np.linalg.det(cov_2)))

# Each entry pairs the message prefix with the (mean, mean, cov, cov) arguments
# handed to bhattacharyya_normal for that pair of distributions.
pairwise = (
    ('Bhattacharyya distance between distributions 1 and 2: ', (mean_1, mean_2, cov_1, cov_2)),
    ('Bhattacharyya distance between distributions 1 and 3: ', (mean_1, mean_3, cov_1, cov_3)),
    ('Bhattacharyya distance between distributions 2 and 3: ', (mean_3, mean_2, cov_3, cov_2)),
)
for label, args in pairwise:
    print(label + str(bhattacharyya_normal(*args)))

In [None]:
from scipy.stats import multivariate_normal
# Integer coordinates -20, -19, ..., 19 used to discretise each axis.
discrete = np.arange(-20, 20)

# Frozen scipy distributions for distributions 1 and 2, so their pdf can be evaluated.
dist_1 = multivariate_normal(mean_1, cov_1)
dist_2 = multivariate_normal(mean_2, cov_2)

In [None]:
# 2-D coordinate grids: with 'xy' indexing x varies along columns and y along rows.
x, y = np.meshgrid(discrete, discrete, indexing='xy')

In [None]:
def batt_vec(x, y):
    """Integrand of the Bhattacharyya coefficient, sqrt(p1 * p2), at (x, y).

    `x` and `y` may be scalars or equally shaped arrays of coordinates; the
    result has the same shape. The densities come from the module-level frozen
    distributions `dist_1` and `dist_2`.
    """
    # pdf() treats the last axis as the point components, so stacking the
    # coordinates lets a whole grid be evaluated in a single call.
    points = np.stack([x, y], axis=-1)
    return np.sqrt(dist_1.pdf(points) * dist_2.pdf(points))

# Kept so that any code still calling `fv` keeps working; batt_vec itself now
# accepts arrays, so the element-by-element wrapper is no longer needed here.
fv = np.vectorize(batt_vec)

# Riemann sum of the integrand: each grid point stands for one cell of area
# dx * dy. With the unit-spaced grid this factor is 1, but including it keeps
# the estimate correct if the grid spacing is changed.
cell_area = (x[0, 1] - x[0, 0]) * (y[1, 0] - y[0, 0])
-math.log(np.sum(batt_vec(x, y)) * cell_area)

In [None]:
# Approximate the Bhattacharyya coefficient with an explicit loop over the grid
# points, summing sqrt(p1 * p2) in row-major order.
integrand_sum = 0.0
for px, py in zip(x.ravel(), y.ravel()):
    point = [px, py]
    integrand_sum += math.sqrt(dist_1.pdf(point) * dist_2.pdf(point))

# Scale the sum by the area each grid point represents (dx * dy). It equals 1
# for the unit-spaced grid, but omitting it would give a wrong coefficient for
# any other spacing.
cell_area = (x[0, 1] - x[0, 0]) * (y[1, 0] - y[0, 0])
bc = integrand_sum * cell_area
-math.log(bc)