In [196]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import linalg


# --Dot product and Euclidean norm--
print('--Dot product and Euclidean norm--')

# The two fixed 2-d input vectors used throughout this section
a, b = np.array([1, 2]), np.array([1, 1])

# Function euclidean(x) that computes the Euclidean norm of vector x
def euclidean(x):
    """Return the Euclidean (L2) norm of the 1-d vector ``x``.

    The norm is computed as the square root of the dot product of ``x``
    with itself.

    Parameters
    ----------
    x : array-like, shape (d,)
        A single vector. For a 2-d input ``np.dot(x, x)`` would be a matrix
        product, so pass vectors one at a time.

    Returns
    -------
    numpy floating scalar
        ``sqrt(x . x)``.
    """
    return np.sqrt(np.dot(x, x))

# Norm of a
eu_a = euclidean(a)
print('Euclidean norm of a:', eu_a)

# Distance between a and b, i.e. the norm of their difference
eu_ab = euclidean(a - b)
print('Euclidean distance of ∥a−b∥:', eu_ab)

# Scalar projection of b onto a: dot product of b with the unit vector along a
projection_ba = b.dot(a / eu_a)
print('Projection of vector b in the direction of vector a:', projection_ba)

# Simulated dataset X: N=100 samples of 2-dimensional integer vectors in [0, 20)
N = 100
X = np.random.randint(20, size=N*2).reshape(N, 2)

# Scalar projection of every sample (row of X) onto the direction of a
pj = X.dot(a / eu_a)


# --Covariance matrix and Mahalanobis form--
print('\n\n--Covariance matrix and Mahalanobis form--')

# Draw N samples of 2-dimensional vectors from a bivariate normal distribution
# whose mean and covariance are known, so the estimates can be compared to them.
mean = np.array([1, 1])
cov = np.array([[1, 0.8], [0.8, 1]])
X = np.random.multivariate_normal(mean, cov, N)

# Sample mean vector x_bar (column-wise mean over the samples) and the centered
# data. Compare the estimated mean to the true mean, μ.
x_bar = np.mean(X, axis=0)
X_centered = (X - x_bar)
print(f'True mean mu: {mean}\nEstimated mean x: {x_bar}')

# Unbiased empirical covariance matrix S = X_c^T X_c / (N - 1).
# Compare the estimated covariance matrix S to the true covariance matrix, Σ.
S = 1/(N-1) * np.dot(X_centered.T, X_centered)
# The true covariance lives in `cov`; no variable named `Cov` is defined here,
# so interpolating `Cov` raises NameError on a fresh run.
print(f'\nTrue covariance matrix Cov:\n{cov}\nEstimated covariance matrix S:\n{S}')

# Inverse of the empirical covariance matrix, used by the Mahalanobis distance
S_inv = linalg.inv(S)
print('\nInverse of S:\n', S_inv)

# Function mahalanobis(x, x_bar, S_inv) that computes the Mahalanobis distance of a vector x to the mean, x_bar
def mahalanobis(X, x_bar, S_inv):
    """Return the Mahalanobis distance of ``X`` to the mean ``x_bar``.

    Evaluates ``sqrt((x - x_bar)^T S_inv (x - x_bar))``. ``X`` may be a single
    vector or a batch of row vectors; in the batch case the distance is
    computed independently for every row.

    Parameters
    ----------
    X : array-like, shape (d,) or (n, d)
        One vector, or n row vectors.
    x_bar : array-like, shape (d,)
        Mean vector the distance is measured from.
    S_inv : array-like, shape (d, d)
        Inverse covariance matrix.

    Returns
    -------
    numpy floating scalar or numpy.ndarray of shape (n,)
        A scalar for a single vector, one distance per row for a batch.
    """
    diff = np.asarray(X) - np.asarray(x_bar)
    # Row-wise quadratic form: (diff @ S_inv) has the same shape as diff, so an
    # elementwise product summed over the last axis gives diff^T S_inv diff for
    # a single vector and for each row of a batch alike.
    quad_form = np.sum(np.dot(diff, S_inv) * diff, axis=-1)
    return np.sqrt(quad_form)

# Mahalanobis and Euclidean distance of every sample x_i to the mean x_bar,
# gathered column-wise into a 100×2 dataframe (one row per sample).
mah = [mahalanobis(sample, x_bar, S_inv) for sample in X]
euc = [euclidean(centered_row) for centered_row in X_centered]
df = pd.DataFrame({'Mahalanobis': mah, 'Euclidean': euc})
df.head()

--Dot product and Euclidean norm--
Euclidean norm of a: 2.23606797749979
Euclidean distance of ∥a−b∥: 1.0
Projection of vector b in the direction of vector a: 1.3416407864998738


--Covariance matrix and Mahalanobis form--
True mean mu: [1 1]
Estimated mean x: [0.9764894  0.91729081]

True covariance matrix Cov:
[[1.  0.8]
 [0.8 1. ]]
Estimated covariance matrix S:
[[0.88356694 0.77289468]
 [0.77289468 1.05189731]]

Inverse of S:
 [[ 3.16783583 -2.32760692]
 [-2.32760692  2.66090138]]


Unnamed: 0,Mahalanobis,Euclidean
0,2.960481,3.44899
1,1.290932,0.573125
2,1.369869,0.737342
3,2.317623,3.03316
4,0.646596,0.70979
