# Geometric interpretation of PCA


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global random generator so the points drawn below with
# np.random.randn are the same on every Restart & Run All.
np.random.seed(0)

Define the unit vector
$\mathbf{z}_1 = \begin{pmatrix} \cos(\theta_1) \\ \sin(\theta_1)\end{pmatrix}$
where $\theta_1 = \pi/6$ and a unit vector $\mathbf{z}_2 \perp \mathbf{z}_1$.


In [None]:
# Angle of the first direction, and the same angle rotated by 90 degrees
# so that the second direction is orthogonal to the first.
theta1 = np.pi / 6
theta2 = theta1 + np.pi / 2

# Unit vectors (cos(theta), sin(theta)) for each of the two angles.
z1, z2 = (
    np.array([np.cos(angle), np.sin(angle)]) for angle in (theta1, theta2)
)

Define the point $\mathbf{b} = \begin{pmatrix} 20 \\ 30\end{pmatrix}$


In [None]:
# Offset vector b = (20, 30) added to every generated point.
b = np.array([20, 30])

Generate $n = 1000$ points $\mathbf{x}_i$ according to the formula:

$\mathbf{x}_i = A \mathbf{r}_i + \mathbf{b}$

where $A$ is a transformation matrix with columns $[\rho_1 \mathbf{z}_1, \rho_2 \mathbf{z}_2]$ and the $\mathbf{r}_i$ are random vectors whose components are generated independently from a standard normal distribution.


In [None]:
# Scale factors applied along z1 and z2, and the number of points to draw.
rho1 = 12.0
rho2 = 3.0
n_points = 1000

# One standard-normal 2-vector r_i per column.
seeds = np.random.randn(2, n_points)

# A has columns rho1 * z1 and rho2 * z2; b is reshaped to a column so it is
# broadcast onto every point.
A = np.column_stack((rho1 * z1, rho2 * z2))
X = A @ seeds + b.reshape(-1, 1)

X.shape

Plot the generated points in the plane, showing the directions $\mathbf{z}_1$ and $\mathbf{z}_2$, rescaled by $\rho_1$ and $\rho_2$, respectively.

In [None]:
fig, ax = plt.subplots()
ax.scatter(X[0], X[1])
# Draw each generating direction as a segment centred on b that extends
# rho * z on either side of it.
for direction, scale in ((z1, rho1), (z2, rho2)):
    tail = b - scale * direction
    span = 2 * scale * direction
    ax.arrow(tail[0], tail[1], span[0], span[1], color="black")
ax.set_aspect("equal")

Perform PCA on the matrix $X$:

$\boldsymbol{\mu} = \frac{1}{n} \sum_{i=1}^{n} \mathbf{x}_i$

$\overline{X} = X - \boldsymbol{\mu} [1,\dots,1]$

$\overline{X} = U \Sigma V^T$


In [None]:
# SOLUTION-BEGIN
# Mean of each coordinate (row of X) taken over all points (columns of X).
X_mean = X.mean(axis=1)
# Centre the data by subtracting the mean from every column, then take the
# reduced SVD of the centred matrix.
X_centered = X - X_mean.reshape(-1, 1)
U, s, VT = np.linalg.svd(X_centered, full_matrices=False)
# SOLUTION-END

Plot the first two left singular vectors, each rescaled by the square root of the corresponding sample variance:

${\sqrt{\frac{\sigma_1^2}{(n - 1)}}}{\mathbf{u}_1}, {\sqrt{\frac{\sigma_2^2}{(n - 1)}}}{\mathbf{u}_2}$


In [None]:
# SOLUTION-BEGIN
# Estimated directions: the first two columns of U.
u1 = U[:, 0]
u2 = U[:, 1]

# Singular values divided by sqrt(n - 1): the scale of the data along each
# estimated direction.
r = s / np.sqrt(n_points - 1)

fig, ax = plt.subplots(nrows=1, ncols=1)
ax.scatter(X[0, :], X[1, :])
# Draw the arrows on `ax` explicitly rather than through pyplot's implicit
# "current axes", so the scatter and the arrows are guaranteed to share one
# Axes. Each segment is centred on the data mean and extends r_i * u_i on
# either side of it.
ax.arrow(
    X_mean[0] - u1[0] * r[0],
    X_mean[1] - u1[1] * r[0],
    2 * u1[0] * r[0],
    2 * u1[1] * r[0],
    color="red",
)
ax.arrow(
    X_mean[0] - u2[0] * r[1],
    X_mean[1] - u2[1] * r[1],
    2 * u2[0] * r[1],
    2 * u2[1] * r[1],
    color="red",
)
ax.set_aspect("equal")
# SOLUTION-END

Print and compare the real transformation directions ($\mathbf{z}_1, \mathbf{z}_2$) and the estimated ones ($\mathbf{u}_1, \mathbf{u}_2$)

In [None]:
# SOLUTION-BEGIN
# real transformation directions (the unit vectors used to generate the data)
print(z1, z2)
# estimated transformation directions (first two columns of U from the SVD).
# NOTE: singular vectors are only determined up to sign, so an estimated u_i
# may come out close to -z_i rather than z_i.
print(u1, u2)
# SOLUTION-END

Compute the principal components of the data:

$\Phi = U^T \overline{X}$


In [None]:
# SOLUTION-BEGIN
# Project the centred data onto the columns of U: row k of Phi holds the
# k-th principal component of every point.
Phi = U.T @ (X - X_mean[:, None])
# SOLUTION-END

Make a scatterplot of the first two principal components ($\phi_1$, $\phi_2$), each rescaled by the square root of its sample variance


In [None]:
# SOLUTION-BEGIN
fig, ax = plt.subplots()
# Divide each principal component by its scale r (singular value over
# sqrt(n - 1)) before plotting.
phi1_scaled = Phi[0] / r[0]
phi2_scaled = Phi[1] / r[1]
ax.scatter(phi1_scaled, phi2_scaled)
ax.set_aspect("equal")
# SOLUTION-END