# Objective:

### Use MNIST dataset and apply PCA to find out the impact on the model training time and also model performance
### The work is taken from https://github.com/mGalarnyk/Python_Tutorials/blob/master/Sklearn/PCA/PCA_to_Speed-up_Machine_Learning_Algorithms.ipynb

In [1]:
# Setup
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml


#Download and Load the Data


In [2]:
from sklearn.datasets import fetch_openml
#get MNIST data
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.target = mnist.target.astype(np.int8) # fetch_openml() returns targets as strings

X, y = mnist["data"], mnist["target"]

  warn(


# Split data into train/test

In [3]:
# Write a code to split your dataset into 80/20 dataset
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)


# View Data Dimension

In [4]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((56000, 784), (14000, 784), (56000,), (14000,))

#Standardizing the Data¶

Since PCA yields a feature subspace that maximizes the variance along the axes, it makes sense to standardize the data, especially, if it was measured on different scales.

Standardization of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual feature do not more or less look like standard normally distributed data

Notebook going over the importance of feature Scaling: http://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html#sphx-glr-auto-examples-preprocessing-plot-scaling-importance-py


In [5]:
from sklearn.preprocessing import StandardScaler
#define scaler
scaler = StandardScaler()

# Fit on training set only.
scaler.fit(X_train)

# Apply transform to both the training set and the test set.
X_train = scaler.transform(X_train)
X_test =  scaler.transform(X_test)

X_train.shape, X_test.shape, y_train.shape, y_test.shape


((56000, 784), (14000, 784), (56000,), (14000,))

In [6]:
# In case you want to see how the scaled number would look like, you can uncomment below lines
from scipy.stats import describe
describe(X_train)[1]

(array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        , -0.00457315, -0.00595681, -0.00422581,
        -0.00422581,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        , -0.00422581, -0.00422581, -0.00707984,
        -0.01027051, -0.01361367, -0.01817047, -0.02451248, -0.03029928,
        -0.03239954, -0.03280923, -0.03296474, -0.03065269, -0.03050481,
        -0.02756559, -0.02292475, -0.01887946, -0.01702333, -0.01081101,
        -0.00861707, -0.00453445,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        , -0.00422581,
        -0.0086208 , -0.00935916, -0.01782191, -0.02695858, -0.03689392,
        -0.05349357, -0.07037594, -0.08775445, -0.1

# Build PCA

In [7]:
from sklearn.decomposition import PCA

#define PCA with 0.9 variance to capture
pca = PCA(0.9)

#use it to fit X_train
pca.fit(X_train)

# Look at components

In [8]:
pca.n_components_


232

#Apply the mapping (transform) to both the training set and the test set.



In [18]:
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

ValueError: ignored

#Build a linear model and measure model fitting period.

In [13]:
# Build a linear model using below parameters
from sklearn.linear_model import LogisticRegression
#solver = 'lbfgs'
# multi_class ='auto'

#define the LogisticRegression model
logisticRegr = LogisticRegression(solver = 'lbfgs', multi_class ='auto', max_iter=1000)

import datetime
start= datetime.datetime.now()

# Fit the model
logisticRegr.fit(X_train,y_train)

end= datetime.datetime.now()

print(end-start)


0:01:45.211014


In [14]:
logisticRegr.predict(X_train[0].reshape(1,-1))


array([6], dtype=int8)

#Measuring Model Performance

In [17]:
score = logisticRegr.score(X_test, y_test)
print(score)

0.9225


In [16]:
#Replace numbers with yours
pd.DataFrame(data = [[1.00, 784, 48.94, .9158],
                     [.99, 541, 34.69, .9169],
                     [.95, 330, 13.89, .92],
                     [.90, 236, 10.56, .9168],
                     [.85, 184, 8.85, .9156]],
             columns = ['Variance Retained',
                      'Number of Components',
                      'Time (seconds)',
                      'Accuracy'])

Unnamed: 0,Variance Retained,Number of Components,Time (seconds),Accuracy
0,1.0,784,48.94,0.9158
1,0.99,541,34.69,0.9169
2,0.95,330,13.89,0.92
3,0.9,236,10.56,0.9168
4,0.85,184,8.85,0.9156


In [32]:
from tqdm import tqdm
records = []
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled =  scaler.transform(X_test)
for variance in tqdm([1,0.99,0.95,0.9,0.85]):
  # Apply transform to both the training set and the test set.

  pca = PCA(variance)
  #use it to fit X_train
  pca.fit(X_train_scaled)
  X_train_pca = pca.transform(X_train_scaled)
  X_test_pca  = pca.transform(X_test_scaled)
  start= datetime.datetime.now()
  logisticRegr.fit(X_train_pca, y_train)
  end= datetime.datetime.now()
  score = logisticRegr.score(X_test_pca, y_test)
  time = end-start
  records.append({'Variance Retained':variance,
                  'Number of Components':pca.n_components_,
                  'Time (seconds)':time.seconds,
                  'Accuracy': score})

100%|██████████| 5/5 [09:08<00:00, 109.75s/it]


In [33]:
pd.DataFrame.from_records(records)

Unnamed: 0,Variance Retained,Number of Components,Time (seconds),Accuracy
0,1.0,1,2,0.301786
1,0.99,538,206,0.921714
2,0.95,324,129,0.922857
3,0.9,231,79,0.923643
4,0.85,180,70,0.922571


In [31]:
time = end-start

In [30]:
time.seconds

75