# PCA - TSNE (Norman Vicente)

In [4]:
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from scipy.linalg import svd
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

In [5]:
# Fashion-MNIST is exposed through keras.datasets; load_data() returns the
# train and test splits, each as an (images, labels) pair.
fashion_mnist = keras.datasets.fashion_mnist
train_split, test_split = fashion_mnist.load_data()
train_images, train_labels = train_split
test_images, test_labels = test_split

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [21]:
# Show the array shapes of the train and test image splits.
print(train_images.shape, test_images.shape)

(60000, 28, 28) (10000, 28, 28)


In [17]:
# Stack the train and test splits along the sample axis (axis 0 is the
# default for np.concatenate) so the whole dataset is reduced at once.
imgs = np.concatenate([train_images, test_images])
labels = np.concatenate([train_labels, test_labels])

In [18]:
def normalize(images):
    """Flatten 28x28 images into row vectors and standardize them with ``scale``.

    Args:
        images: array whose elements can be regrouped into rows of
            28 * 28 = 784 values (e.g. shape ``(n, 28, 28)``).

    Returns:
        The result of ``scale`` applied to the ``(-1, 784)`` reshaped array.
    """
    pixels_per_image = 28 * 28
    flattened = images.reshape((-1, pixels_per_image))
    standardized = scale(flattened)
    return standardized

In [19]:
# Flatten and standardize every image, then display the resulting shape so the
# cell shows one row per image and one column per pixel.
scaled_images = normalize(imgs)
scaled_images.shape

(70000, 784)


## Principal component analysis 

In [37]:
class PCA_model:
    """Principal component analysis via the SVD of the feature covariance matrix.

    Attributes:
        data: the 2-D data matrix (one sample per row, one feature per column).
        k: number of leading principal components to keep.
        variance: fraction of the total variance captured by the first ``k``
            components; ``0`` until ``fit`` has run.
        principal_components: ``(n_samples, k)`` projection of the data onto
            the first ``k`` components; ``0`` until ``fit`` has run.
    """

    def __init__(self, dataset, k_dimensions):
        """Store the data matrix and the number of components to retain.

        Args:
            dataset: 2-D array, rows are samples and columns are features.
            k_dimensions: number of leading principal components to keep.
        """
        self.data = dataset
        self.k = k_dimensions
        self.variance = 0
        self.principal_components = 0

    def SVD(self):
        """Decompose the covariance matrix of the features.

        Returns:
            tuple: ``(U, variance)`` where the columns of ``U`` are the
            principal directions and ``variance`` holds the singular values
            of the covariance matrix in descending order.
        """
        # rowvar=False: each column is a variable, each row an observation.
        # atleast_2d keeps the single-feature case (0-d covariance) valid for svd.
        cov_matrix = np.atleast_2d(np.cov(self.data, rowvar=False))
        U, variance, _ = svd(cov_matrix)
        return U, variance

    def explained_variance_ratio(self, singular_values=None):
        """Return the fraction of total variance held by the first ``k`` components.

        This was previously a method named ``variance``; that name collided
        with the ``variance`` attribute set in ``__init__``, which made the
        method unreachable on instances and caused ``fit`` to raise TypeError.

        Args:
            singular_values: singular values of the covariance matrix. When
                omitted they are computed with ``SVD()``.

        Returns:
            float: ``sum(singular_values[:k]) / sum(singular_values)``.
        """
        if singular_values is None:
            _, singular_values = self.SVD()
        return np.sum(singular_values[:self.k]) / np.sum(singular_values)

    def fit(self):
        """Compute the retained-variance ratio and the ``k``-dimensional projection.

        Stores the results in ``self.variance`` and
        ``self.principal_components``. May be called repeatedly.

        Raises:
            ValueError: if ``k`` is not between 1 and the number of features.
        """
        # Decompose once and reuse the result for both outputs.
        components, singular_values = self.SVD()
        n_features = components.shape[0]
        if not 1 <= self.k <= n_features:
            raise ValueError(
                f"k_dimensions must be between 1 and {n_features}, got {self.k}"
            )

        self.variance = self.explained_variance_ratio(singular_values)

        U_reduced = components[:, :self.k]
        data = np.asarray(self.data)
        # The directions come from the mean-centred covariance, so project the
        # mean-centred data. Subtracting the projected mean is equivalent to
        # centring first and avoids copying the full data matrix.
        projected_mean = np.matmul(data.mean(axis=0), U_reduced)
        self.principal_components = np.matmul(data, U_reduced) - projected_mean

In [39]:
# Custom PCA that keeps the first two principal components.
# NOTE: this rebinds the name `PCA`, shadowing the scikit-learn class that the
# import cell brought in under the same name.
PCA = PCA_model(dataset=scaled_images, k_dimensions=2)

In [40]:
# Fit the custom model, then display the projection it stores on the instance.
PCA.fit()
PCA.principal_components

array([[  0.82469513, -20.86260448],
       [-17.07338982,   4.96621704],
       [  9.60003596,  12.32192784],
       ...,
       [  6.99353047,   8.48864653],
       [  2.8922854 ,  15.3764669 ],
       [ 17.53224758,  -0.12483318]])

## Cantidad de varianza preservada

In [15]:
# `variance` is assigned by fit(); show it rounded to two decimals.
round(PCA.variance, 2)

0.36

## TSNE

In [None]:
# t-SNE is stochastic; fix the seed so the embedding is reproducible across runs.
tsne = TSNE(n_components=2, verbose=1, random_state=42)

In [None]:
# An earlier cell rebinds the name `PCA` to a PCA_model instance, so calling
# PCA(n_components=5) here raises TypeError on a fresh Restart & Run All.
# Import scikit-learn's estimator under an unambiguous alias instead.
from sklearn.decomposition import PCA as SklearnPCA

t_model = SklearnPCA(n_components=5)

In [None]:
# Fit the 5-component model on the scaled images and display the transformed data.
t_model.fit_transform(scaled_images)

array([[ -0.82469513,  20.86260447, -12.90604763,   3.46000512,
         -6.68513224],
       [ 17.07338982,  -4.96621704,  -2.00699145,   5.97686895,
         10.89045948],
       [ -9.60003596, -12.32192784,   1.11963294,   2.56029229,
          0.22145529],
       ...,
       [ -6.99353047,  -8.48864652,   0.39553082,   6.61110075,
         -1.26212279],
       [ -2.8922854 , -15.3764669 ,  -7.59981311,  -0.97796232,
         -1.68061699],
       [-17.53224758,   0.12483318,   3.2543249 ,  -0.76725408,
         -1.57489636]])

In [None]:
# Run t-SNE on the full scaled image matrix.
# NOTE(review): the conclusions section reports this call never finished after
# more than 2 hours; consider fitting on a subsample or on PCA-reduced input
# — TODO confirm with the author.
tsne.fit_transform(scaled_images)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 70000 samples in 22.345s...


## Conclusiones

El modelo de PCA fue mucho más rápido y no supuso mayor tiempo a la hora de hacer el entrenamiento, dando buenos resultados a pesar de preservar una varianza baja (0.36 con 2 componentes). Por el contrario, TSNE fue muy lento: tras más de 2 horas nunca terminó de ejecutarse (Google Colab y MacBook Pro 2020).