# PCA on MNIST analysis.

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report    

In [2]:
df=load_digits()
x=df.data
y=df.target

In [3]:
df

{'data': array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ..., 10.,  0.,  0.],
        [ 0.,  0.,  0., ..., 16.,  9.,  0.],
        ...,
        [ 0.,  0.,  1., ...,  6.,  0.,  0.],
        [ 0.,  0.,  2., ..., 12.,  0.,  0.],
        [ 0.,  0., 10., ..., 12.,  1.,  0.]]),
 'target': array([0, 1, 2, ..., 8, 9, 8]),
 'target_names': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 'images': array([[[ 0.,  0.,  5., ...,  1.,  0.,  0.],
         [ 0.,  0., 13., ..., 15.,  5.,  0.],
         [ 0.,  3., 15., ..., 11.,  8.,  0.],
         ...,
         [ 0.,  4., 11., ..., 12.,  7.,  0.],
         [ 0.,  2., 14., ..., 12.,  0.,  0.],
         [ 0.,  0.,  6., ...,  0.,  0.,  0.]],
 
        [[ 0.,  0.,  0., ...,  5.,  0.,  0.],
         [ 0.,  0.,  0., ...,  9.,  0.,  0.],
         [ 0.,  0.,  3., ...,  6.,  0.,  0.],
         ...,
         [ 0.,  0.,  1., ...,  6.,  0.,  0.],
         [ 0.,  0.,  1., ...,  6.,  0.,  0.],
         [ 0.,  0.,  0., ..., 10.,  0.,  0.]],
 
        [[ 0

In [4]:
x,y

(array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ..., 10.,  0.,  0.],
        [ 0.,  0.,  0., ..., 16.,  9.,  0.],
        ...,
        [ 0.,  0.,  1., ...,  6.,  0.,  0.],
        [ 0.,  0.,  2., ..., 12.,  0.,  0.],
        [ 0.,  0., 10., ..., 12.,  1.,  0.]]), array([0, 1, 2, ..., 8, 9, 8]))

# Splitting the data

In [5]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.22,random_state=2)

## FEATURE SCALING

In [6]:
x=StandardScaler().fit_transform(x)
x

array([[ 0.        , -0.33501649, -0.04308102, ..., -1.14664746,
        -0.5056698 , -0.19600752],
       [ 0.        , -0.33501649, -1.09493684, ...,  0.54856067,
        -0.5056698 , -0.19600752],
       [ 0.        , -0.33501649, -1.09493684, ...,  1.56568555,
         1.6951369 , -0.19600752],
       ...,
       [ 0.        , -0.33501649, -0.88456568, ..., -0.12952258,
        -0.5056698 , -0.19600752],
       [ 0.        , -0.33501649, -0.67419451, ...,  0.8876023 ,
        -0.5056698 , -0.19600752],
       [ 0.        , -0.33501649,  1.00877481, ...,  0.8876023 ,
        -0.26113572, -0.19600752]])

## PERFORMING PCA

In [7]:
pca=PCA(n_components=40)
pca.fit(x)

PCA(copy=True, iterated_power='auto', n_components=40, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [8]:
pca.explained_variance_ratio_

array([0.12033916, 0.09561054, 0.08444415, 0.06498408, 0.04860155,
       0.0421412 , 0.03942083, 0.03389381, 0.02998221, 0.02932003,
       0.02781805, 0.02577055, 0.02275303, 0.0222718 , 0.02165229,
       0.01914167, 0.01775547, 0.01638069, 0.0159646 , 0.01489191,
       0.01347969, 0.01271931, 0.01165837, 0.01057646, 0.00975315,
       0.00944555, 0.00863012, 0.00836642, 0.00797693, 0.00746465,
       0.00725574, 0.00691893, 0.00653907, 0.00640769, 0.0059135 ,
       0.00571077, 0.00523474, 0.00481157, 0.00453369, 0.00422604])

In [9]:
principalComponents =pca.fit_transform(x)
principalComponents

array([[ 1.91421366, -0.95450156, -3.94603477, ...,  0.81135586,
         0.03938286,  0.30350434],
       [ 0.5889804 ,  0.92463572,  3.92475518, ...,  0.19483242,
         0.10284047, -0.49058005],
       [ 1.30203906, -0.31718883,  3.02333296, ..., -0.21145872,
        -1.25668732,  0.54494697],
       ...,
       [ 1.02259601, -0.14791085,  2.46997379, ...,  0.60227622,
         0.39553491,  1.23618473],
       [ 1.07605521, -0.38090617, -2.45548706, ...,  0.44055892,
        -0.69307254, -0.45828747],
       [-1.2577023 , -2.22759095,  0.28362806, ..., -0.38043626,
         0.11415556,  1.36764359]])

## LOGISTIC CLASSIFICATION

In [10]:
model=LogisticRegression()

In [11]:
model.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
pred=model.predict(x_test)
pred

array([4, 0, 9, 1, 8, 7, 1, 5, 1, 6, 6, 7, 6, 1, 5, 5, 1, 6, 2, 7, 4, 6,
       4, 1, 5, 2, 9, 5, 4, 6, 5, 6, 3, 4, 0, 9, 9, 8, 4, 6, 8, 1, 5, 7,
       8, 8, 9, 6, 1, 7, 0, 1, 9, 7, 3, 3, 1, 8, 8, 8, 9, 8, 5, 1, 4, 8,
       7, 5, 8, 4, 3, 9, 3, 8, 7, 3, 3, 0, 8, 7, 2, 8, 5, 3, 8, 7, 6, 4,
       6, 2, 2, 0, 1, 1, 5, 3, 5, 7, 6, 8, 2, 2, 6, 4, 6, 7, 3, 7, 3, 9,
       4, 7, 0, 3, 5, 1, 5, 0, 3, 9, 2, 7, 7, 2, 0, 8, 1, 9, 2, 1, 5, 1,
       0, 3, 4, 3, 0, 8, 3, 2, 2, 7, 3, 1, 6, 7, 2, 8, 3, 1, 1, 6, 4, 8,
       2, 1, 8, 4, 8, 3, 1, 1, 9, 5, 4, 9, 7, 4, 8, 9, 5, 7, 6, 9, 0, 0,
       4, 0, 0, 9, 0, 6, 5, 8, 8, 3, 7, 8, 2, 0, 8, 2, 7, 3, 0, 2, 1, 5,
       2, 7, 0, 6, 9, 3, 3, 1, 3, 5, 2, 8, 5, 2, 1, 2, 9, 4, 6, 5, 5, 5,
       9, 7, 1, 5, 7, 6, 3, 7, 1, 7, 5, 1, 7, 2, 7, 5, 5, 4, 8, 6, 6, 2,
       8, 7, 3, 7, 8, 0, 3, 5, 7, 4, 3, 4, 1, 0, 3, 3, 5, 4, 1, 3, 1, 2,
       5, 1, 4, 0, 3, 1, 5, 5, 7, 4, 0, 1, 0, 8, 5, 5, 5, 4, 0, 1, 8, 6,
       2, 1, 1, 1, 7, 9, 6, 7, 9, 7, 0, 4, 9, 6, 9,

## Score

In [13]:
model.score(x_test,y_test)

0.9393939393939394

## confusion matrix

In [14]:
matrix =confusion_matrix(y_test,pred)
print(matrix)

[[36  0  0  0  0  0  0  0  0  0]
 [ 0 42  0  1  0  0  0  0  2  1]
 [ 0  0 40  0  0  0  0  0  0  0]
 [ 0  0  0 36  0  0  0  3  2  0]
 [ 0  2  0  0 32  0  0  0  1  1]
 [ 0  0  0  0  0 45  0  0  0  0]
 [ 0  0  0  0  0  0 40  0  1  0]
 [ 0  0  0  0  0  0  0 41  1  0]
 [ 0  2  0  0  0  0  0  0 34  1]
 [ 0  0  0  1  0  1  0  1  3 26]]


## Classification Report

In [15]:
print(classification_report(y_test,pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        36
          1       0.91      0.91      0.91        46
          2       1.00      1.00      1.00        40
          3       0.95      0.88      0.91        41
          4       1.00      0.89      0.94        36
          5       0.98      1.00      0.99        45
          6       1.00      0.98      0.99        41
          7       0.91      0.98      0.94        42
          8       0.77      0.92      0.84        37
          9       0.90      0.81      0.85        32

avg / total       0.94      0.94      0.94       396

