### This notebook is used to learn following:
#### 1. Importance of Feature Selection
#### 2. Implementation of PCA
#### 3. Using two classifiers Logistic Regression and SVM for this

In [1]:
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split

#### Download and Load the Data

In [2]:
mnist = fetch_openml('mnist_784')

#### This is a Huge dataset

In [3]:

# These are the images
mnist.data.shape

(70000, 784)

In [4]:
# test_size: what proportion of original data is used for test set
train_img, test_img, train_lbl, test_lbl = train_test_split(
    mnist.data, mnist.target, test_size=1/7.0, random_state=0)
print(train_img.shape)
print(test_img.shape)

(60000, 784)
(10000, 784)


###  Standard Scaler is used to Standardize features by removing the mean and scaling to unit variance.

In [5]:

scaler = StandardScaler()

# Fit on training set only.
scaler.fit(train_img)

# Apply transform to both the training set and the test set.
train_img = scaler.transform(train_img)
test_img = scaler.transform(test_img)

### Implementing Principle Component Analysis
Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space. The input data is centered but not scaled for each feature before applying the SVD.

In [6]:
pca = PCA(.95)

In [7]:
pca.fit(train_img)

PCA(n_components=0.95)

In [8]:
pca.n_components_

327

#### Apply the mapping (transform) to both the training set and the test set.

In [9]:
train_img = pca.transform(train_img)
test_img = pca.transform(test_img)

#### Step 1: Import the model you want to use

In sklearn, all machine learning models are implemented as Python classes

In [10]:
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression(solver = 'lbfgs')

#### Step 2: Make an instance of the Model

In [11]:
logisticRegr.fit(train_img, train_lbl)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [12]:

# Predict for Multiple Observations (images) at Once
logisticRegr.predict(test_img[0:10])

array(['0', '4', '1', '2', '4', '7', '7', '1', '1', '7'], dtype=object)

#### Measuring Model Performance
accuracy (fraction of correct predictions): correct predictions / total number of data points

Basically, how the model performs on new data (test set)

In [13]:
score = logisticRegr.score(test_img, test_lbl)
print(score)

0.9201


In [17]:
# Import the `svm` model
from sklearn import svm

# Create the SVC model 
svc_model = svm.SVC(gamma=0.001, C=100., kernel='linear')

# Fit the data to the SVC model
svc_model.fit(train_img[:10000], train_lbl[:10000])

SVC(C=100.0, gamma=0.001, kernel='linear')

In [18]:
# Import `metrics`
from sklearn import metrics

predicted = svc_model.predict(test_img)
# Print the classification report of `y_test` and `predicted`
print(metrics.classification_report(predicted, test_lbl))

# Print the confusion matrix
print(metrics.confusion_matrix(predicted, test_lbl))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96      1020
           1       0.98      0.95      0.96      1182
           2       0.89      0.87      0.88      1073
           3       0.86      0.85      0.86      1032
           4       0.91      0.88      0.90       999
           5       0.83      0.84      0.84       855
           6       0.93      0.96      0.94       963
           7       0.91      0.91      0.91      1060
           8       0.82      0.89      0.85       888
           9       0.84      0.88      0.86       928

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

[[ 967    0    6    1    2   17   14    4    4    5]
 [   0 1117   21    9    5    5    0    3   19    3]
 [   6    6  930   34   12   11   14   18   33    9]
 [   3    2   19  876    2   65    0   13   44    8]
 [   0    1   12    3  879   