In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Seed NumPy's global random generator so that any random draws made through
# np.random are repeatable between runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Análise de componentes principais

In [None]:
from sklearn.datasets import fetch_openml

# Fetch MNIST ('mnist_784', version 1) from OpenML.
# as_frame=True is requested explicitly: the train/test split calls `.values`
# on slices of X, which only works when the data is a pandas DataFrame, so the
# notebook should not depend on the library's implicit 'auto' default.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)

# Store the labels as int64.
# NOTE(review): presumably they arrive as strings/categoricals - confirm.
mnist.target = mnist.target.astype(np.int64)

X = mnist['data']
y = mnist['target']

In [None]:
# Positional split: the first 60000 rows are the training set, the remaining
# rows the test set. Features are converted to float64 NumPy arrays; labels
# are sliced without conversion.
X_train, X_test = (
    part.astype('float64').values for part in (X[:60000], X[60000:])
)
y_train, y_test = y[:60000], y[60000:]

In [None]:
# Preview nine training digits (every 6000th row) on a 3x3 grid, each row
# reshaped to a 28x28 image.
fig, axes = plt.subplots(3, 3, figsize=(12, 12))
for k, ax in enumerate(axes.flat):
    digit = X_train[6000 * k].reshape(28, 28)
    ax.imshow(digit, cmap=matplotlib.cm.binary, interpolation='nearest')
plt.show()

Imagem média

In [None]:
# Show the dimensions of the training matrix (one flattened image per row).
X_train.shape

In [None]:
# Mean image: the per-pixel average over all training rows.
x0 = np.mean(X_train, axis=0)

# Render the flattened mean as a 28x28 picture.
plt.imshow(
    x0.reshape(28, 28),
    interpolation='nearest',
    cmap=matplotlib.cm.binary,
)
plt.show()

Removendo a média

In [None]:
# Centre the data. Broadcasting subtracts the mean image x0 from every row in
# one vectorised operation; the result is a new array, so X_train itself is
# left untouched.
X_train_residual = X_train - x0

# Show the same nine sample digits (every 6000th row) after mean removal.
plt.figure(figsize=(12, 12))
for i in range(9):
    plt.subplot(3, 3, i + 1)
    plt.imshow(X_train_residual[6000 * i, :].reshape(28, 28),
               cmap=matplotlib.cm.binary,
               interpolation='nearest')
plt.show()

Usando SVD para obter as componentes principais:

In [None]:
# Show the dimensions of the residual matrix.
X_train_residual.shape

In [None]:
# SVD of the mean-centred data, using every 10th training row to keep the
# factorisation cheap. The centred subsample is rebuilt from X_train and x0
# rather than read from X_train_residual: later cells reassign that name, so
# re-running this cell afterwards would silently decompose different data.
#
# full_matrices=False returns the reduced factorisation: U has one column per
# singular value instead of being a square (rows x rows) matrix. s and Vt are
# unchanged here because the subsample has more rows than columns.
U, s, Vt = np.linalg.svd(X_train[::10, :] - x0, full_matrices=False)

In [None]:
# Singular-value spectrum. Axis labels make the figure readable on its own,
# and plt.show() keeps the Line2D repr out of the cell output.
plt.plot(s)
plt.xlabel('Índice da componente')
plt.ylabel('Valor singular')
plt.show()

In [None]:
# Show the shape of the singular-value array returned by the SVD.
s.shape

In [None]:
# Print the first ten singular values (NumPy returns them in descending order).
print(s[:10])

In [None]:
# Display the first 16 rows of Vt (the leading principal directions) as
# 28x28 images on a 4x4 grid, each divided by its largest entry.
fig, axes = plt.subplots(4, 4, figsize=(12, 12))
for ax, component in zip(axes.flat, Vt[:16]):
    ax.imshow((component / component.max()).reshape(28, 28),
              cmap=matplotlib.cm.binary,
              interpolation='nearest')
plt.show()

In [None]:
# Sanity check: the first row of Vt dotted with itself (its squared norm),
# which should be 1 for a unit vector.
v = Vt[0]
np.dot(v, v)

Removendo a média e a primeira componente:

In [None]:
v0 = Vt[0, :]

# Centre the data, then remove every sample's projection onto v0.
# X_train_residual.dot(v0) holds one coefficient per row; the outer product
# with v0 expands those coefficients back to image space, so the whole data
# set is processed in two vectorised steps instead of a per-row Python loop.
X_train_residual = X_train - x0
X_train_residual -= np.outer(X_train_residual.dot(v0), v0)

# Show the same nine sample digits (every 6000th row) after the removal.
plt.figure(figsize=(12, 12))
for i in range(9):
    plt.subplot(3, 3, i + 1)
    plt.imshow(X_train_residual[6000 * i, :].reshape(28, 28),
               cmap=matplotlib.cm.binary,
               interpolation='nearest')
plt.show()

Removendo a média, a primeira componente, e a segunda componente:

In [None]:
v1 = Vt[1, :]

# The rows of Vt are orthonormal, so removing the first and second components
# one after the other equals subtracting the projection onto their span in a
# single step: residual -= (residual . B^T) . B, with B the first two rows of
# Vt. Slicing Vt directly also means this cell does not rely on v0 having
# been defined by an earlier cell.
first_two = Vt[:2, :]
X_train_residual = X_train - x0
X_train_residual -= X_train_residual.dot(first_two.T).dot(first_two)

# Show the same nine sample digits (every 6000th row) after the removal.
plt.figure(figsize=(12, 12))
for i in range(9):
    plt.subplot(3, 3, i + 1)
    plt.imshow(X_train_residual[6000 * i, :].reshape(28, 28),
               cmap=matplotlib.cm.binary,
               interpolation='nearest')
plt.show()

Removendo a média e as 150 primeiras componentes (pode demorar alguns minutos):

In [None]:
%%time
X_train_residual = X_train.copy()
num_components = 150

for i in range(X_train_residual.shape[0]):
    X_train_residual[i, :] -= x0
    for j in range(num_components):
        v = Vt[j, :]
        X_train_residual[i, :] -= X_train_residual[i, :].dot(v) * v

plt.figure(figsize=(12, 12))
for i in range(9):
    plt.subplot(3, 3, i + 1)
    plt.imshow(X_train_residual[6000 * i, :].reshape(28, 28),
               cmap=matplotlib.cm.binary,
               interpolation='nearest')
plt.show()

In [None]:
# Reconstructed digits: original image minus its residual, i.e. the part of
# each image explained by the mean and the removed components.
# Only the displayed row is subtracted on each pass, rather than rebuilding
# the full X_train - X_train_residual matrix inside the loop.
plt.figure(figsize=(12, 12))
for i in range(9):
    row = 6000 * i
    plt.subplot(3, 3, i + 1)
    plt.imshow((X_train[row, :] - X_train_residual[row, :]).reshape(28, 28),
               cmap=matplotlib.cm.binary,
               interpolation='nearest')
plt.show()

**Atividade**

Exercício 9 do livro-texto