# Classification using MNIST Data Set

Classification practice using the MNIST dataset (which maps pixel images to numeric digits)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
import pandas as pd

In [None]:
# The old loader is deprecated:
# from sklearn.datasets import fetch_mldata
# mnist = fetch_mldata('MNIST original')
# Fetch from OpenML instead: https://stackoverflow.com/questions/53096977

from sklearn.datasets import fetch_openml

# as_frame=False returns NumPy arrays. Recent scikit-learn releases hand back
# pandas objects for this dataset by default, which breaks the positional
# indexing used in later cells (e.g. X[34].reshape(28,28), y[60000]).
mnist = fetch_openml('mnist_784', version=1, return_X_y=False, as_frame=False)

# wait a few minutes for download to complete...

In [None]:
# Number of entries along the first axis of the feature data.
len(mnist['data'])

### Visualization

In [None]:
# Pull the feature data and the labels out of the fetched dataset.
X = mnist['data']
y = mnist['target']

In [None]:
# Display the feature data.
X

In [None]:
# Label stored at index 60000.
y[60000]

In [None]:
# Feature values stored at index 60000.
X[60000]

In [None]:
# Dimensions of the feature data.
X.shape

In [None]:
# Dimensions of the label data.
y.shape

### Finding a particular digit

In [None]:
# Indices at which the label equals the string '4' (labels are compared as strings).
np.where(y=='4')

*- shows the dataset indices where the digit 4 can be found*

In [None]:
# Render the sample at index 69977 as a 28x28 image.
# Descriptive names are used instead of `_`, which IPython uses for the most
# recent cell output.
digit_pixels = X[69977]
digit_image = digit_pixels.reshape(28, 28)
plt.imshow(digit_image);

In [None]:
# Indices at which the label equals the string '0'.
np.where(y=='0')

In [None]:
# Render the sample at index 34 as a 28x28 image.
# Descriptive names are used instead of `_`, which IPython uses for the most
# recent cell output.
digit_pixels = X[34]
digit_image = digit_pixels.reshape(28, 28)
plt.imshow(digit_image);

### Split the dataset into two - 60K for training and 10K for testing

In [None]:
# Everything before index `split` is training data; the rest is test data.
split = 60000
Xtrain, ytrain = X[:split], y[:split]
Xtest, ytest = X[split:], y[split:]

### Shuffle

In [None]:
# Draw one random ordering of the training indices and apply it to both the
# features and the labels so the two stay aligned.
shuffle_index = np.random.permutation(split)
Xtrain = Xtrain[shuffle_index]
ytrain = ytrain[shuffle_index]

In [None]:
# Display the permutation used to reorder the training set.
shuffle_index

In [None]:
# Dimensions of the training features.
Xtrain.shape

### Training a binary classifier - 0 or not 0

In [None]:
# Boolean target for the binary task: True where the training label is '0'.
ytrain0 = (ytrain == '0')

In [None]:
# Display the boolean training target.
ytrain0

In [None]:
# Boolean target for the test labels: True where the label is '0'.
ytest0 = (ytest == '0')

## Using SGDClassifier

In [None]:
from sklearn.linear_model import SGDClassifier

# tol is the stopping tolerance and must be non-negative. The previous value
# of -3 looks like a typo for 1e-3 and is rejected by scikit-learn's
# parameter validation in recent releases.
sgd = SGDClassifier(random_state=0, max_iter=1000, tol=1e-3)
sgd.fit(Xtrain, ytrain0)

In [None]:
# Predict for the single sample at index 34, reshaped to one row.
sgd.predict(X[34].reshape(1,-1)) # index 34 is a 0

### Measure performance using cross-validation

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# random_state only takes effect when shuffle=True; current scikit-learn
# raises a ValueError if it is set while shuffling is left off.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=100)

In [None]:
# Display the splitter's configuration.
skf

In [None]:
# Manual cross-validation: for every split produced by skf, fit a fresh copy
# of the classifier on the training fold and print its accuracy on the
# held-out fold.
for train_index, test_index in skf.split(Xtrain, ytrain0):
    fold_clf = clone(sgd)  # unfitted copy with the same hyperparameters
    Xtrain_fold, ytrain_fold = Xtrain[train_index], ytrain0[train_index]
    Xtest_fold, ytest_fold = Xtrain[test_index], ytrain0[test_index]
    fold_clf.fit(Xtrain_fold, ytrain_fold)

    ypred_fold = fold_clf.predict(Xtest_fold)
    # Vectorised fraction of correct predictions; the builtin sum() would
    # iterate over the array element by element in Python.
    accuracy = np.mean(ypred_fold == ytest_fold)
    print("{0:.4f}".format(accuracy))

In [None]:
# Display the classifier's configuration.
sgd

### Cross Validation Score using K-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Predict for the single sample at index 34, reshaped to one row.
sgd.predict(X[34].reshape(1,-1))

In [None]:
# Display the boolean test target.
ytest0

In [None]:
# Score the binary "0 vs. not 0" task with 3-fold cross-validation.
# The boolean target ytrain0 is used; passing the full digit labels (ytrain)
# would evaluate a multiclass problem instead of the binary one.
cross_val_score(sgd, Xtrain, ytrain0, cv=3, scoring='accuracy')

### Confusion Matrix

In [None]:
from sklearn.model_selection import cross_val_predict

In [None]:
# Cross-validated (cv=3) predictions for the binary target on the training set.
ytrain_pred = cross_val_predict(sgd, Xtrain,ytrain0,cv=3)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix: true binary labels (first argument) vs. cross-validated predictions.
confusion_matrix(ytrain0, ytrain_pred)

### Precision - accuracy of positive predictions
TP - true positive, FP - false positive, TN - true negative, FN - false negative 

$$ precision = \frac{TP}{{TP}+{FP}} $$

$$ recall = \frac{TP}{{TP}+{FN}} $$

$$ f1 = \frac{TP}{{TP}+{\frac{FN+FP}{2}}} $$

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Precision of the cross-validated SGD predictions.
precision_score(ytrain0,ytrain_pred) 

In [None]:
# Recall of the cross-validated SGD predictions.
recall_score(ytrain0,ytrain_pred)

In [None]:
# F1 score of the cross-validated SGD predictions.
f1_score(ytrain0,ytrain_pred)

### Precision / Recall Tradeoff 
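$$ precision \propto 1 / recall $$

*(Rule of thumb, not an exact proportionality: raising the decision threshold usually increases precision and lowers recall, and vice versa.)*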

In [None]:
# Decision-function score for the single sample at index 1000 (reshaped to one row).
yscores = sgd.decision_function(X[1000].reshape(1,-1))
yscores

In [None]:
# Turn the score into a boolean prediction by comparing it with a threshold of 0.
threshold = 0
y_some_digits_pred = (yscores > threshold)
y_some_digits_pred

In [None]:
# Same comparison with a much higher threshold (4000).
threshold = 4000
y_some_digits_pred = (yscores > threshold)
y_some_digits_pred

In [None]:
# Cross-validated decision-function scores (rather than class predictions) for the training set.
# NOTE: this rebinds yscores, which previously held the score of a single sample.
yscores = cross_val_predict(sgd, Xtrain, ytrain0,cv=3,method='decision_function')

In [None]:
# Histogram of the decision scores using 100 bins.
plt.figure(figsize=(12,8))
plt.hist(yscores,bins=100)

In [None]:
from sklearn.metrics import precision_recall_curve

In [None]:
# precision_recall_curve returns (precision, recall, thresholds), in that order.
# Descriptive names are bound because later cells refer to `precisions`,
# `recalls` and `thresholds`. The short aliases p, c, r are kept because other
# cells use them (note: c holds the recalls and r holds the thresholds).
precisions, recalls, thresholds = precision_recall_curve(ytrain0, yscores)
p, c, r = precisions, recalls, thresholds

In [None]:
def plot_prec_rec_vs_thresh(precisions,recalls,thresholds):
    """Plot precision and recall against the decision threshold.

    The last element of ``precisions`` and ``recalls`` is dropped so that both
    series have one value per entry in ``thresholds``. Precision is drawn as
    a dashed blue line and recall as a dashed green line on the current
    matplotlib figure.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g--", "Recall"),
    )
    for values, line_format, legend_label in curves:
        plt.plot(thresholds, values[:-1], line_format, label=legend_label)
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([-0.5, 1.5])

In [None]:
# Plot precision and recall against the threshold.
# p, c, r are the precisions, recalls and thresholds from precision_recall_curve.
plt.figure(figsize=(12,8))
plot_prec_rec_vs_thresh(p,c,r)
plt.show()

In [None]:
# Precision/recall trade-off curve: recall on the x-axis, precision on the y-axis.
# p holds the precisions and c the recalls (r holds the thresholds), so the
# curve is c against p; plotting p against r would not match the axis labels.
plt.figure(figsize=(12,8))
plt.plot(c, p)
plt.xlabel('recalls')
plt.ylabel('precisions')
plt.title('P vs. R tradeoff')
plt.show()

In [None]:
# Display the precision values returned by precision_recall_curve.
p

### Setting high precisions

In [None]:
# Number of precision values.
len(p)

In [None]:
# Number of thresholds. The thresholds returned by precision_recall_curve are
# stored in r (third element of the `p,c,r` unpacking).
len(r)

In [None]:
# Precision as a function of the threshold.
# r holds the thresholds and p the precisions from precision_recall_curve.
# p has one extra trailing value with no matching threshold, so the last
# entry is dropped (p[:-1]); slicing with [1:] would shift every precision by
# one threshold.
plt.figure(figsize=(12,8))
plt.plot(r, p[:-1])

In [None]:
# p = precisions, r = thresholds (from precision_recall_curve).
# Pick the first threshold whose precision reaches 90%. argmax on a boolean
# mask returns the first True (or 0 if nothing qualifies). Counting the values
# below 0.90 only gives the same index when precision rises monotonically,
# which it need not. p[:-1] keeps the index within the range of r.
idx = np.argmax(p[:-1] >= 0.90)
threshold_90 = r[idx]
# precision[i] refers to predictions with score >= thresholds[i], hence >=.
ytrain_pred_90 = (yscores >= threshold_90)
precision_score(ytrain0,ytrain_pred_90), recall_score(ytrain0,ytrain_pred_90)

### Really high precision

In [None]:
# p = precisions, r = thresholds (from precision_recall_curve).
# Pick the first threshold whose precision reaches 99%. argmax on a boolean
# mask returns the first True (or 0 if nothing qualifies). Counting the values
# below 0.99 only gives the same index when precision rises monotonically,
# which it need not. p[:-1] keeps the index within the range of r.
idx = np.argmax(p[:-1] >= 0.99)
threshold_99 = r[idx]
# precision[i] refers to predictions with score >= thresholds[i], hence >=.
ytrain_pred_99 = (yscores >= threshold_99)
precision_score(ytrain0,ytrain_pred_99), recall_score(ytrain0,ytrain_pred_99)

### High recall

In [None]:
# c = recalls, r = thresholds (from precision_recall_curve).
# Recall falls as the threshold rises, so the values below 0.99 sit at the end
# of the array and counting them does not give a usable index. Instead take
# the last threshold before recall first drops under 99%; max(..., 0) guards
# against a negative index.
idx = max(np.argmax(c < 0.99) - 1, 0)
threshold_recall_99 = r[idx]
# NOTE: this reuses the name ytrain_pred_99 from the high-precision cell.
ytrain_pred_99 = (yscores >= threshold_recall_99)
precision_score(ytrain0,ytrain_pred_99), recall_score(ytrain0,ytrain_pred_99)

### Receiver Operating Characteristics (ROC) Curve

$$ FPR = \frac{FP}{FP+TN} $$


$$ \text{Specificity} = TNR  = 1 - {FPR} $$

In [None]:
from sklearn.metrics import roc_curve
# False-positive rates, true-positive rates and thresholds of the ROC curve,
# computed from the cross-validated decision scores.
# NOTE: `thresholds` is bound here to the ROC thresholds.
fpr, tpr, thresholds = roc_curve(ytrain0, yscores)

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Draw an ROC curve on the current matplotlib figure.

    fpr, tpr -- x and y values of the curve.
    label    -- optional legend label for the curve.

    A dashed black diagonal from (0, 0) to (1, 1) is added for reference and
    both axes are limited to the range [0, 1].
    """
    plt.plot(fpr, tpr, label=label, linewidth=2)
    diagonal = [0, 1]
    plt.plot(diagonal, diagonal, 'k--')
    plt.axis([0, 1, 0, 1])
    plt.title('ROC Curve')
    plt.xlabel('FPR')
    plt.ylabel('TPR')

In [None]:
# ROC curve of the SGD classifier.
plt.figure(figsize=(12,8))
plot_roc_curve(fpr, tpr)
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Area under the ROC curve for the SGD decision scores.
roc_auc_score(ytrain0, yscores)

**Use the PR curve when the positive class is rare or when you care more about false positives than about false negatives**

**Use the ROC curve when the negative class is rare or when you care more about false negatives than about false positives**

## Model Comparison

### Random Forest ##

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest classifier with a fixed seed (random_state=0).
rfc = RandomForestClassifier(random_state=0)

In [None]:
# Cross-validated (cv=3) class probabilities from the random forest.
yprobas_forest= cross_val_predict(rfc, Xtrain, ytrain0,cv=3, method='predict_proba')

In [None]:
# Use column 1 of the probability output as the score
# (presumably the probability of the positive class -- confirm via rfc.classes_).
yscores_forest = yprobas_forest[:,1]
# ROC curve values for the random forest scores.
fpr_forest, tpr_forest, threshold_forest = roc_curve(ytrain0,yscores_forest)

In [None]:
# Compare the ROC curves of the SGD classifier (dotted blue) and the random forest.
plt.figure(figsize=(12,8))
plt.plot(fpr,tpr,"b:", label="SGD")
plot_roc_curve(fpr_forest, tpr_forest,"Random Forest")
# Both curves carry labels, but they only appear once a legend is requested.
plt.legend(loc="lower right")
plt.show()

In [None]:
# Area under the ROC curve for the random forest scores.
roc_auc_score(ytrain0, yscores_forest)

In [None]:
# Cross-validated (cv=3) class predictions from the random forest.
ytrain_rf=cross_val_predict(rfc, Xtrain, ytrain0,cv=3)

In [None]:
# Precision of the cross-validated random forest predictions.
precision_score(ytrain0,ytrain_rf)

In [None]:
# Recall of the cross-validated random forest predictions.
recall_score(ytrain0,ytrain_rf)

In [None]:
# F1 score of the cross-validated random forest predictions.
f1_score(ytrain0,ytrain_rf)

In [None]:
# Confusion matrix: true binary labels (first argument) vs. random forest predictions.
confusion_matrix(ytrain0, ytrain_rf)