# Principal Component Analysis

$X = (\mathbf{x}_1, \mathbf{x}_2, ..., \mathbf{x}_n)$ is a collection of vectors encoding some information.

We want to find a direction in space along which data vary the most.

Firstly, center the data:

$$\overline{\mathbf{x}}_i = \mathbf{x}_i - \frac{1}{n}\sum_{k=1}^n{\mathbf{x}_k}$$

Given a direction $\mathbf{v}$ (i.e. unit vector) the projection of $\overline{\mathbf{x}}_i$ on it will be $\overline{\mathbf{x}}_i^T \mathbf{v}$.

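We can project all centered points with one matrix multiplication: $\mathbf{p} = \overline{X}^T \mathbf{v}$, where $\overline{X} = (\overline{\mathbf{x}}_1, \overline{\mathbf{x}}_2, ..., \overline{\mathbf{x}}_n)$ is the matrix whose columns are the centered points.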

$\mathbf{p}$ is the resulting vector of projections.

Our goal is to find the unit vector $\mathbf{v}$ which maximizes $Var[\mathbf{p}] = \frac{1}{n}\sum_{i=1}^n{(p_i - E[\mathbf{p}])^2}$.

Since data is centered, $E[\mathbf{p}] = 0$. Hence 

$$Var[\mathbf{p}] = \frac{1}{n}\sum_{i=1}^n{p_i^2} = \frac{1}{n}\mathbf{p}^T \mathbf{p} = \frac{1}{n} \mathbf{v}^T \overline{X} \overline{X}^T \mathbf{v}$$

We already know how to maximize this expression! The answer is the eigenvector of $\frac{1}{n} \overline{X} \overline{X}^T$ which corresponds to the greatest eigenvalue. Moreover, if $\mathbf{v}$ is an eigenvector:

$$Var[\mathbf{p}] = \mathbf{v}^T \frac{1}{n} \overline{X} \overline{X}^T \mathbf{v} = \lambda \mathbf{v}^T \mathbf{v} = \lambda$$


# Toy example: Artificial Data

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
%matplotlib notebook
# Default resolution (dots per inch) for every figure created from here on
plt.rcParams['figure.dpi'] = 150

def pca(data):
    """Eigendecompose the covariance matrix of already-centered data.

    Parameters
    ----------
    data : ndarray of shape (n_samples, n_features)
        Real-valued data with one sample per row. The data must already be
        centered; this function does not subtract the mean.

    Returns
    -------
    eigvals : ndarray of shape (n_features,)
        Real eigenvalues of the covariance matrix, largest first.
    eigvecs : ndarray of shape (n_features, n_features)
        Orthonormal eigenvectors stored as columns; ``eigvecs[:, k]``
        belongs to ``eigvals[k]``.
    """
    covariance_matrix = (data.transpose() @ data) / data.shape[0]

    # The covariance matrix is symmetric, so use the symmetric solver: it
    # guarantees real eigenvalues and orthonormal eigenvectors, whereas the
    # general-purpose `eig` may return complex values with rounding noise.
    eigvals, eigvecs = np.linalg.eigh(covariance_matrix)

    # `eigh` returns eigenvalues in ascending order; flip to descending.
    return eigvals[::-1], eigvecs[:, ::-1]

In [None]:
# Sample count and dimensionality of the toy data set
N, D = 100, 3

# Draw standard-normal points, then shift them so that every coordinate
# has zero mean (pca expects centered input)
data = np.random.normal(size=(N, D))
center = data.mean(axis=0)
data = data - center

# The eigenvalues are expected to be real and positive
eigvals, eigvecs = pca(data)
print(eigvals)

# Pairwise dot products of the eigenvectors: orthogonal eigenvectors give
# (numerically) zero off-diagonal entries
products = eigvecs.T @ eigvecs
print(products)

In [None]:
fig = plt.figure()
ax = plt.axes(projection='3d')

# Use the same [-3, 3] range on all three axes
for set_limits in (ax.set_xlim, ax.set_ylim, ax.set_zlim):
    set_limits(-3, 3)

# Point cloud of the data
ax.scatter(*(data[:, k] for k in range(3)))

# One segment per eigenvector (the columns of eigvecs), drawn from the
# origin and stretched by a factor of 3 so it is visible
for v in eigvecs.T:
    ax.plot3D([0, 3*v[0]], [0, 3*v[1]], [0, 3*v[2]])

In [None]:
# Number of data points
N = 100

# Random 2x3 matrix that maps 2-D latent points into 3-D space
embedding = np.random.normal(size=(2, 3))

# Latent points are drawn first and the small noise second, so the random
# stream is consumed in the same order as in a single combined expression
latent = np.random.normal(size=(N, 2))
noise = 0.1 * np.random.normal(size=(N, 3))
data = latent @ embedding + noise

# Center the data
center = data.mean(axis=0)
data = data - center

eigvals, eigvecs = pca(data)
print(f'Eigenvalues are {eigvals}')

In [None]:
fig = plt.figure()
ax = plt.axes(projection='3d')

# Use the same [-3, 3] range on all three axes
for set_limits in (ax.set_xlim, ax.set_ylim, ax.set_zlim):
    set_limits(-3, 3)

# Point cloud of the data
ax.scatter(*(data[:, k] for k in range(3)))

# One segment per eigenvector (the columns of eigvecs), drawn from the
# origin and stretched by a factor of 3 so it is visible
for v in eigvecs.T:
    ax.plot3D([0, 3*v[0]], [0, 3*v[1]], [0, 3*v[2]])

In [None]:
# Transform data into eigenvectors coordinate system:
# column k of the result holds the projections onto eigenvector k
transformed_data = data @ eigvecs

# The variance of each coordinate should match the corresponding eigenvalue
for ordinal, projections in zip(('first', 'second', 'third'), transformed_data.T):
    print(f'Variance along {ordinal} eigenvector: {np.var(projections)}')

print(f'Eigenvalues: {eigvals}')

In [None]:
# Zero out (in place) the coordinate along the eigenvector with the
# lowest eigenvalue
transformed_data[:, 2] = 0.0

# Rotate the truncated coordinates back into the original coordinate system
transformed_back = transformed_data @ eigvecs.T

# Mean over the N points of the squared distance to the original data
squared_error = (data - transformed_back) ** 2
average_squared_distance = squared_error.sum() / N
print(f'Average Squared Distance from original data = {average_squared_distance}')

# EigenFaces

In [None]:
import pandas as pd

# Load the face data set from a CSV file in the working directory
df = pd.read_csv('face_data.csv')

# 400 Samples, 64x64 images in shades of gray + 1 target label = 4097
# (the bare expression below is shown as the cell output)
df.shape

In [None]:
def plot_faces(pixels, n_rows=5, n_cols=5, image_shape=(64, 64)):
    """Show the leading images of *pixels* in a grid of grayscale subplots.

    Parameters
    ----------
    pixels : array-like of shape (n_images, n_pixels)
        One flattened image per row (e.g. a DataFrame or a 2-D array).
    n_rows, n_cols : int, optional
        Grid size; at most ``n_rows * n_cols`` images are drawn.
    image_shape : tuple of int, optional
        Shape each flattened row is reshaped to before drawing.
    """
    # Convert once up front instead of once per subplot.
    images = np.asarray(pixels)

    # squeeze=False keeps `axes` 2-D even for a 1x1 grid, so `.flat` works.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(6, 6), squeeze=False)

    # zip stops at the shorter sequence: with fewer images than grid cells
    # the remaining axes simply stay empty instead of raising IndexError.
    for ax, image in zip(axes.flat, images):
        ax.imshow(image.reshape(image_shape), cmap='gray')
    plt.show()
    
# Separate the label column from the pixel columns
y = df['target']
X = df.drop(columns='target')

# Show the first faces of the data set
plot_faces(X)

In [None]:
# Convert the pixel table into a plain array, the format pca works on
data = np.array(X)

# Per-pixel mean over all samples (the "mean face"), removed from every row
center = data.mean(axis=0)
centered_data = data - center

In [None]:
# Run PCA on the centered faces and keep only the real part of both
# results (the eigensolver may hand back complex arrays)
eigvals, eigvecs = (np.real(part) for part in pca(centered_data))

# Plot the 50 largest eigenvalues
fig = plt.figure()
plt.plot(eigvals[:50])
plt.show()

In [None]:
# Plot eigenfaces
# Each column of eigvecs is one eigenface, so transpose to get one face per
# row, which is the layout plot_faces expects.
# The mean face is deliberately not added: `eigvecs + center` broadcasts
# `center` across columns, shifting every eigenvector by one unrelated scalar.
# With default settings imshow rescales each image to its own min/max, so
# that constant shift never changed the picture.
eigenfaces = eigvecs.transpose()

plot_faces(eigenfaces)

In [None]:
# Keep the 25 eigenvectors with the largest eigenvalues as the basis
top_eigvecs = eigvecs[:, :25]

# Coordinates of every centered face in that 25-dimensional basis
transformed_data = centered_data @ top_eigvecs

print(f'Shape of the transformed data is {transformed_data.shape}')

# Map the coordinates back to pixel space and restore the mean face
transformed_back = transformed_data @ top_eigvecs.T + center

plot_faces(transformed_back)

# Image Compression

In [None]:
import matplotlib.image as mpimg

# Read the image file from the working directory into an array
img = mpimg.imread('cat.png')

# Print the dimensions of the loaded array
print(img.shape) 

# Show the original image
fig = plt.figure()
plt.imshow(img)

In [None]:
# Treat every image row as one sample: flatten its (column, channel) values
# into a single feature vector. Deriving the shape from the image avoids
# hard-coding a 900x900 four-channel picture.
data = np.reshape(img, (img.shape[0], -1))
print(data.shape)

# Center the data
center = np.mean(data, axis=0)

centered_data = data - center

# Compute PCA.
eigvals, eigvecs = pca(centered_data)
# Keep only the real parts in case the eigensolver returned complex arrays
eigvals, eigvecs = np.real(eigvals), np.real(eigvecs)

# Plot the 50 largest eigenvalues
fig = plt.figure()
plt.plot(eigvals[:50])
plt.show()

In [None]:
# Project to first 30 eigenvectors
transformed_data = centered_data @ eigvecs[:, :30]

# One 30-number code per image row: shape is (n_rows, 30). For a 900x900
# four-channel image that is 30 instead of 3600 values per row, i.e. 120x
# fewer (not counting the stored eigenvectors and mean).
print(f'Shape of the transformed data is {transformed_data.shape}')

transformed_back = transformed_data @ eigvecs[:, :30].transpose() + center

# Restore the original image layout (taken from the image itself rather
# than hard-coded)
rec_img = np.reshape(transformed_back, img.shape)

# A truncated reconstruction can over/undershoot the valid intensity range.
# NOTE(review): assumes a float image uses the [0, 1] range, as matplotlib
# returns for PNG files — confirm for other formats.
if np.issubdtype(img.dtype, np.floating):
    rec_img = np.clip(rec_img, 0.0, 1.0)

fig = plt.figure()
plt.imshow(rec_img)

# Gram Matrix 

$X = (\mathbf{x}_1, \mathbf{x}_2, ..., \mathbf{x}_n)$ is a collection of vectors encoding some information.

Let $\mathbf{v}$ be an eigenvector of the covariance matrix. That is,

$$\frac{1}{n} \overline{X} \overline{X}^T \mathbf{v} = \lambda \mathbf{v}$$

Multiply by $\overline{X}^T$ from the left:

$$\frac{1}{n} \overline{X}^T \overline{X} \overline{X}^T \mathbf{v} = \lambda \overline{X}^T \mathbf{v}$$

The matrix of pair-wise scalar products $\overline{X}^T \overline{X}$ is called the *Gram matrix*. Denote it by $K$:

$$ K \overline{X}^T \mathbf{v} = n \lambda \overline{X}^T \mathbf{v}$$

$\overline{X}^T \mathbf{v}$ is an eigenvector of that matrix! Remember that it is the vector of projections of the data points onto the eigenvector $\mathbf{v}$. However, a numerical eigensolver returns this vector normalized to unit length. To recover the projection vector we need to multiply it by the norm of $\overline{X}^T \mathbf{v}$:

$$ \lVert \overline{X}^T \mathbf{v} \rVert = \sqrt{\mathbf{v}^T \overline{X} \overline{X}^T \mathbf{v}} = \sqrt{n \lambda} $$

In [None]:
def pca2(data):
    """Eigendecompose the scaled Gram matrix of already-centered data.

    Parameters
    ----------
    data : ndarray of shape (n_samples, n_features)
        Real-valued data with one sample per row. The data must already be
        centered; this function does not subtract the mean.

    Returns
    -------
    eigvals : ndarray of shape (n_samples,)
        Real eigenvalues of ``data @ data.T / n_samples``, largest first.
    eigvecs : ndarray of shape (n_samples, n_samples)
        Orthonormal eigenvectors stored as columns; ``eigvecs[:, k]``
        belongs to ``eigvals[k]``.
    """
    gram_matrix = (data @ data.transpose()) / data.shape[0]

    # The Gram matrix is symmetric, so use the symmetric solver: it
    # guarantees real eigenvalues and orthonormal eigenvectors, whereas the
    # general-purpose `eig` may return complex values with rounding noise.
    eigvals, eigvecs = np.linalg.eigh(gram_matrix)

    # `eigh` returns eigenvalues in ascending order; flip to descending.
    return eigvals[::-1], eigvecs[:, ::-1]

In [None]:
# Number of data points
N = 10

# Random Embedding Matrix
embedding = np.random.normal(size=(2, 3))

# Generate data
data = np.random.normal(size=(N, 2)) @ embedding + np.random.normal(size=(N, 3))*0.1

# Center the data: both pca and pca2 expect a centered data matrix
data = data - np.mean(data, axis=0)

eigvals, eigvecs = pca(data)
print(f'Eigenvalues of covariance matrix are {eigvals}')

# The Gram matrix is N x N, so it has N eigenvalues; only the leading ones
# are compared with the 3 eigenvalues of the covariance matrix
eigvals2, eigvecs2 = pca2(data)
eigvals2, eigvecs2 = np.real(eigvals2), np.real(eigvecs2)
print(f'First 3 eigenvalues of gram matrix are {eigvals2[:3]}')


In [None]:
# Projections onto the second principal direction, obtained two ways.
n_samples = data.shape[0]

# From the Gram matrix: its unit-length eigenvector rescaled by sqrt(n * lambda)
projection1 = np.sqrt(n_samples * eigvals2[1]) * eigvecs2[:, 1]

# From the covariance matrix: project the data onto its eigenvector directly
projection2 = data @ eigvecs[:, 1]

In [None]:
# projection1 was built from the Gram-matrix eigenvectors (eigvecs2) and
# projection2 from the covariance-matrix eigenvectors (eigvecs), so label
# them accordingly. The two may differ by an overall sign, because an
# eigenvector is only defined up to sign.
print(f"projection from eigendecomposition of gram matrix: \n{projection1}")
print(f"projection from eigendecomposition of covariance matrix: \n{projection2}")

# Example: Iris dataset

In [None]:
from sklearn import datasets

# load_iris() returns a dictionary-like Bunch, not an array, so it has no
# `.shape` of its own; the feature matrix lives under `.data`
data = datasets.load_iris()

print(data.data.shape)