In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import fetch_openml

In [2]:
# Download MNIST from OpenML.
# - version=1 pins the dataset so every run fetches the same data instead of
#   whatever version happens to be marked "active".
# - as_frame=True guarantees `mnist.data` is a pandas DataFrame, which the
#   `.head()` preview in the next cell relies on.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)
mnist.data.shape

(70000, 784)

In [3]:
# Preview the first five rows of the pixel columns.
mnist.data.head(n=5)

Unnamed: 0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,pixel10,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Import the keras loader under an alias: a plain `import mnist` would rebind
# the name `mnist`, which already holds the OpenML dataset fetched earlier,
# and re-running the earlier cells afterwards would then fail.
from keras.datasets import mnist as keras_mnist
# Load the predefined train / test split
(X_train, y_train), (X_test, y_test) = keras_mnist.load_data()
# Flatten every image into a single feature row (one column per pixel)
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

In [5]:
# Learn the per-feature scaling statistics from the training images only
scaler = StandardScaler().fit(X_train)
# Standardize those same training images with the fitted scaler
scaled_X_train = scaler.transform(X_train)

In [6]:
# Fit PCA without restricting n_components so that explained_variance_ratio_
# covers every component (scikit-learn then keeps min(n_samples, n_features)).
pca = PCA()
# Projected training data; its column count is used below as the total
# number of components.
X_train_pca = pca.fit_transform(scaled_X_train)

In [7]:
# Running total of the explained-variance ratios: entry k-1 is the share of
# variance captured by the first k principal components.
total_explained_variance = np.cumsum(pca.explained_variance_ratio_)

# The running total never decreases, so the entries at or above 0.95 form the
# tail of the array. Subtracting their count from the total number of
# components (and adding one) gives the 1-based position of the first
# component at which the 95% mark is reached.
reaches_95 = total_explained_variance >= 0.95
n_over_95 = int(np.count_nonzero(reaches_95))
n_components_total = X_train_pca.shape[1]
n_to_reach_95 = n_components_total - n_over_95 + 1

summary_template = "Number features: {}\tTotal Variance Explained: {}"
print(summary_template.format(n_to_reach_95, total_explained_variance[n_to_reach_95 - 1]))

Number features: 331	Total Variance Explained: 0.9502951572319144


In [8]:
# Fixed seed so the random forest (and PCA, whenever it selects its randomized
# solver) produce the same model on every Restart & Run All.
RANDOM_STATE = 42

# Pipeline: standardize the pixels, keep the number of principal components
# needed to reach 95% explained variance, then classify with a random forest.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=n_to_reach_95, random_state=RANDOM_STATE)),
    ('clf', RandomForestClassifier(random_state=RANDOM_STATE)),
])
pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('pca', PCA(n_components=331)),
                ('clf', RandomForestClassifier())])

In [9]:
# Score the fitted pipeline on the same data it was trained on.
train_accuracy = pipe.score(X_train, y_train)
print('Training accuracy:', train_accuracy)

Training accuracy: 1.0


In [10]:
# Score the fitted pipeline on the held-out test split.
test_accuracy = pipe.score(X_test, y_test)
print('Testing accuracy:', test_accuracy)

Testing accuracy: 0.9387
