In [2]:
from IPython.core.display import display, HTML

display(HTML('<h1>MNIST</h1>'))

In [3]:
# Minst dataset consists of 70,000 small images, handwritten by high school students and 
# employees of US Census Bureau

# Fetch MINIST data
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original') 
mnist

{'COL_NAMES': ['label', 'data'],
 'DESCR': 'mldata.org dataset: mnist-original',
 'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 'target': array([ 0.,  0.,  0., ...,  9.,  9.,  9.])}

In [4]:
X,y = mnist["data"], mnist["target"]

#There are 70,000 images, each image has 784 features -> Each image is 28 * 28 pixels
#Each pixel is from 0(White) to 255(Black)
X[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  51, 159, 253,
       159,  50,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  48, 238,
       252, 252, 252, 237,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [5]:
#y has numbers from 0 to 9 -> Represents the class lables
y

array([ 0.,  0.,  0., ...,  9.,  9.,  9.])

In [None]:
# To view one digit from dataset, grab an instance's feature vector, reshape it to 28 * 28
# Use Matplotlib's imshow() function

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

digit = X[0]
digit_image = digit.reshape(28,28)

plt.imshow(digit_image, cmap = matplotlib.cm.binary, interpolation = "nearest")

In [6]:
y[0]

0.0

In [7]:
#MNIST dataset is already split into train and test set. 
#MNIST dataset is already split into training and test dataset , first 60000 for training
#and the rest for testing
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [8]:
#The dataset has to be shuffled so that all cross - validation folds will not be similar 
#i.e. we do not want one fold to be missing some digits

import numpy as np

shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]


In [9]:
display(HTML('<h1>Training A Binary Classifier</h1>'))

In [10]:
#5-detector, distinguishing between just two classes, 5 and not-5

#True for all 5's, and false for other 5's
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

In [11]:
#Stochastic Gradient Descent(SGD) classifier:

from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42,max_iter=1000)
sgd_clf.fit(X_train,y_train_5)

sgd_clf.predict([X_test[0]])

array([False], dtype=bool)

In [14]:
display(HTML('<h1>Performance Measures</h1>'))
display(HTML('<h3>1. Measure accuracy using Cross-Validation</h3>'))

In [20]:
#Implementing StratifiedKFold

from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
skfolds = StratifiedKFold(n_splits=3, random_state = 42)

for train_index, test_index in skfolds.split(X_train, y_train_5):
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_folds = X_train[test_index]
    y_test_folds = y_train_5[test_index]
    
    clone_clf.fit(X_train_folds,y_train_folds)
    y_pred  = clone_clf.predict(X_test_folds)
    n_correct = sum(y_pred == y_test_folds)
    
    print(n_correct/len(y_pred))    

0.94015
0.96275
0.9664


In [22]:
#Using cross_val_score()
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf,X_train,y_train,cv = 3,scoring="accuracy")


array([ 0.88617277,  0.8740937 ,  0.89188378])

In [None]:
#Though the accuracy is >90%, we cannot say the model is always right because 
#even a dumb classifier would give good results
#This is because only 10% of the data has class 5 and even if the model predics wrong
#class, the accuracy will always be 90% 

In [31]:
from sklearn.base import BaseEstimator

#np.zeros((5,1), dtype=bool)
#np.ones((5,1), dtype=bool)

class Never5Classifier(BaseEstimator):
    def fit(self,X,y=None):
        pass
    
    def predict(self,X):
        return np.zeros((len(X),1),dtype=bool)

#Guessing the models accuracy with the ablove class

never_5_clf = Never5Classifier()
cross_val_score(never_5_clf,X_train,y_train_5,cv=3,scoring="accuracy")

array([ 0.91225,  0.90715,  0.90955])

In [None]:
display(HTML('<h1>Confusiion Matrix</h1>'))

In [None]:
# The above method also gives an accuracy > 90%. So its not always prefferred to 
# use accuracy as a preformance measure. So we go for Confusion Matrix

In [33]:
#Using cross_val_predict() to predict values -> 
#To avoid using test data as of now for confusion matrix

from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
y_train_pred

array([ True, False, False, ..., False, False, False], dtype=bool)

In [36]:
#To find the confusion matrix
print(y_train_pred.shape)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train_5, y_train_pred)


(60000,)


array([[53020,  1559],
       [ 1055,  4366]])