# Distance metrics between set elements

$$
\begin{align*}
& T = \{(t_i) | t_i = (x_{i1}, x_{i2}, ... , x_{im}), i=1,...,N\}, \quad x_{ik} \in \mathbb{R} \\
& \\
& \text{Euclidean distance:} \quad D_2(t_p, t_q) = \sqrt{\sum_{k=1}^m (x_{pk} - x_{qk})^2} \\
& \text{Hamming distance:} \quad D_H(t_p, t_q) = \sum_{k=1}^m |x_{pk} - x_{qk}| \\
& \text{Chebyshev distance:} \quad D_\infty(t_p, t_q) = \max_{k=1,...,m} |x_{pk} - x_{qk}| \\
& \text{Peak distance:} \quad D_P(t_p, t_q) = \frac1m \sum_{k=1}^m \frac{|x_{pk} - x_{qk}|}{x_{pk} + x_{qk}} \\
& \text{Mahalanobis distance:} \quad D_M(t_p, t_q) = \sqrt{(t_p - t_q)^T S^{-1} (t_p - t_q)}, \quad S - \text{covariance matrix}
\end{align*}
$$

In [1]:
import numpy as np
import math

Functions for calculating distances between examples p and q:

Euclidean distance

In [2]:
def euclidean_distance(p, q):
    distance = 0
    sum = 0
    for k in range(p.shape[0]):
        sum += (p[k] - q[k]) ** 2
    distance = math.sqrt(sum)
    return distance

Hamming distance

In [3]:
def hamming_distance(p, q):
    distance = 0
    sum = 0
    for k in range(p.shape[0]):
        sum += abs(p[k] - q[k])
    distance = sum
    return distance

Chebyshov distance

In [4]:
def chebyshev_distance(p, q):
    distance = 0
    max = 0
    for k in range(p.shape[0]):
        if abs(p[k] - q[k]) > max:
            max = abs(p[k] - q[k])
    distance = max
    return distance

Peak distance

In [5]:
def peak_distance(p, q):
    distance = 0
    sum = 0
    for k in range(p.shape[0]):
        sum += abs(p[k] - q[k]) / (p[k] + q[k])
    distance = sum / p.shape[0]
    return distance

Mahalanobis distance

In [6]:
def mahalanobis_distance(p, q):
    difference = p - q
    S = np.cov(np.column_stack([p, q]))
    distance = math.sqrt(difference.dot(S).dot(difference))
    return distance

Creation of an array T of size 4x3 with signs - integers generated randomly in the interval [0, 99]

In [7]:
size = [4, 3]
T = np.random.randint(0, 100, size)
T

array([[61, 64, 42],
       [92, 30,  8],
       [87, 37, 86],
       [20, 37, 68]])

Creating 5 4x4 arrays of zeros. For each pair of points, 5 distances are calculated and entered into the corresponding matrices.

In [8]:
D2 = np.zeros((size[0], size[0]))
DH = np.zeros((size[0], size[0]))
DC = np.zeros((size[0], size[0]))
DP = np.zeros((size[0], size[0]))
DM = np.zeros((size[0], size[0]))

for i in range(size[0]):
    for j in range(size[0]):
        D2[i, j] = euclidean_distance(T[i], T[j])
        DH[i, j] = hamming_distance(T[i], T[j])
        DC[i, j] = chebyshev_distance(T[i], T[j])
        DP[i, j] = peak_distance(T[i], T[j])
        DM[i, j] = mahalanobis_distance(T[i], T[j])

The obtained distance matrices:

In [9]:
D2

array([[ 0.        , 57.21013896, 57.80138407, 55.55177765],
       [57.21013896,  0.        , 78.47292527, 93.9840412 ],
       [57.80138407, 78.47292527,  0.        , 69.37578828],
       [55.55177765, 93.9840412 , 69.37578828,  0.        ]])

In [10]:
DH

array([[  0.,  99.,  97.,  94.],
       [ 99.,   0.,  90., 139.],
       [ 97.,  90.,   0.,  85.],
       [ 94., 139.,  85.,   0.]])

In [11]:
DC

array([[ 0., 34., 44., 41.],
       [34.,  0., 78., 72.],
       [44., 78.,  0., 67.],
       [41., 72., 67.,  0.]])

In [12]:
DP

array([[0.        , 0.41477217, 0.2622508 , 0.33662107],
       [0.41477217, 0.        , 0.3207326 , 0.51226948],
       [0.2622508 , 0.3207326 , 0.        , 0.24768378],
       [0.33662107, 0.51226948, 0.24768378, 0.        ]])

In [13]:
DM

array([[   0.        , 2314.36049482, 2362.44375594, 2182.13152674],
       [2314.36049482,    0.        , 4354.36355855, 6245.87419822],
       [2362.44375594, 4354.36355855,    0.        , 3403.30493785],
       [2182.13152674, 6245.87419822, 3403.30493785,    0.        ]])