# Setup

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare (major, minor) numerically: comparing the raw version strings is
# lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass the check.
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version is not None
assert tuple(int(part) for part in _sklearn_version.groups()) >= (0, 20)

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure inside IMAGES_PATH.

    The file is named ``<fig_id>.<fig_extension>``. When ``tight_layout`` is
    true, ``plt.tight_layout()`` is applied before saving. ``resolution`` is
    forwarded to ``plt.savefig`` as the dpi.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [None]:
from pathlib import Path as path
from sklearn.datasets import fetch_openml

# Keep the dataset cache inside the project instead of a user-specific
# absolute path, so the notebook runs on any machine.
data_home = path(PROJECT_ROOT_DIR) / "datasets" / CHAPTER_ID

# fetch_openml() stores what it downloads under data_home and reuses that
# cache on later calls, so no manual "already downloaded?" check is needed.
# ('mnist_784' version 1 is the same dataset as OpenML data_id 554.)
mnist = fetch_openml('mnist_784', version=1, as_frame=False, data_home=data_home)
mnist.keys()

In [None]:
X, y = mnist["data"], mnist["target"]
X.shape

In [None]:
mnist['details']

In [None]:
y.shape

In [None]:
import pandas as pd

pd.Series(y).value_counts().sort_index()

In [None]:
y_test = y[60000:]

pd.Series(y_test).value_counts().sort_index()

In [None]:
X.shape

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

some_digit = X[0]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap=mpl.cm.binary)
plt.axis("off")

save_fig("some_digit_plot")
plt.show()

In [None]:
y[0]

In [None]:
# convert the labels from strings to integers (uint8)
y = y.astype(np.uint8)

In [None]:
# function that plots single digit
def plot_digit(data):
    """Draw a single digit.

    `data` is reshaped to a 28×28 image and shown with the binary colormap,
    nearest-neighbour interpolation and the axes hidden.
    """
    pixels = data.reshape(28, 28)
    plt.imshow(pixels, interpolation="nearest", cmap=mpl.cm.binary)
    plt.axis("off")


In [None]:
# plot multiple digits

# EXTRA
def plot_digits(instances, images_per_row=10, **options):
    """Draw several digits tiled into one image.

    `instances` holds one flattened 28×28 image (784 values) per row. The
    images are laid out left to right, top to bottom, at most
    `images_per_row` per grid row; unused cells in the last row are left
    blank. Extra keyword arguments are forwarded to `plt.imshow`.
    """
    side = 28
    n_images = len(instances)
    images_per_row = min(n_images, images_per_row)
    # Ceiling division: number of grid rows needed to hold every image.
    n_rows = (n_images - 1) // images_per_row + 1

    # Pad with all-zero images so the grid is completely filled.
    n_blank = n_rows * images_per_row - n_images
    blanks = np.zeros((n_blank, side * side))
    padded = np.concatenate([instances, blanks], axis=0)

    # Axes are now (grid row, grid column, pixel row, pixel column).
    grid = padded.reshape((n_rows, images_per_row, side, side))

    # Swap the grid-column and pixel-row axes so the two vertical axes and the
    # two horizontal axes sit next to each other, then merge each pair to get
    # one large 2-D image.
    mosaic = grid.swapaxes(1, 2).reshape(n_rows * side, images_per_row * side)

    plt.imshow(mosaic, cmap=mpl.cm.binary, **options)
    plt.axis("off")
    

In [None]:
plt.figure(figsize=(9,9))
example_images = X[:100]
plot_digits(example_images, images_per_row=10)
save_fig("more_digits_plot")
plt.show()

In [None]:
y[0]

In [None]:
# The data is already split for us: the first 60000 rows are the training set
# and the remaining rows are the test set, so we only need to slice.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

# Training a Binary Classifier

##### We want to be able to identify if a digit is a 5 or not

In [None]:
y_train_5 = (y_train == 5) # all 5s evaluate to True
y_test_5 = (y_test == 5)

In [None]:
from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state= 42)
sgd_clf.fit(X_train, y_train_5)

In [None]:
sgd_clf.predict([some_digit])

##### The classifier predicts that "some_digit" is a 5 --> the plot below confirms that it really is one

In [None]:
plot_digit(some_digit)

In [None]:
sgd_clf.get_params()

# Performance Measures

##### Measuring accuracy using Cross-Validation

In [None]:
# Custom implementation of the cross validation method in sklearn 
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# Stratified 3-fold split; shuffle=True is required for random_state to have
# an effect (the shuffle was omitted in the book by mistake).
skfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for train_index, test_index in skfolds.split(X_train, y_train_5):
    # Train a fresh, unfitted copy of the classifier on each fold.
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # Vectorized count of correct predictions (the builtin sum() would loop
    # over the array one element at a time in Python).
    n_correct = (y_pred == y_test_fold).sum()
    print(n_correct / len(y_pred))

In [None]:
from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, X_train,y_train_5, cv=3, scoring='accuracy')

In [None]:
# But let's see how a dumb classifier performs when it classifies
# everything as not 5

from sklearn.base import BaseEstimator

class Never5Classifier(BaseEstimator):
    """Baseline classifier that predicts "not 5" (False) for every sample."""

    def fit(self, X, y=None):
        """Nothing to learn; return self, as the scikit-learn estimator API requires."""
        return self

    def predict(self, X):
        """Return a 1-D boolean array of False with one entry per row of X.

        The result has shape (n_samples,), matching what scikit-learn expects
        from predict(); a (n_samples, 1) column would broadcast incorrectly
        when compared directly against a 1-D label array.
        """
        return np.zeros(len(X), dtype=bool)

In [None]:
never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy")

In [None]:
''' The above makes sense as we expect roughly 90% of the digits to be not a 5
We can conclude that accuracy is not always the best metric to evaluate the
performance of classifier models, especially when dealing with skewed datasets'''

##### Confusion Matrix

In [None]:
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, n_jobs=-1)

In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_train_5, y_train_pred)

In [None]:
# this is what a perfect prediction would look like:
# every prediction equals its true label, so only the main diagonal is non-zero

y_train_perfect_pred = y_train_5

cm_perfect = confusion_matrix(y_train_5, y_train_perfect_pred)

# confusion matrix of the real cross-validated predictions (used by later cells)
cm = confusion_matrix(y_train_5, y_train_pred)

cm_perfect

##### Precision, Recall

In [None]:
from sklearn.metrics import precision_score, recall_score

print(f'Precision: {precision_score(y_train_pred,y_train_5)}')

print(f'Recall: {recall_score(y_train_pred,y_train_5)}')

In [None]:
'''According to the above, precision is 83% which means that of all the poisitively identified
samples only 65% were identified correct.

A recall of 83% means that of all the positive samples only 83% of them were identified correctly'''

In [None]:
from pprint import pprint

# print() renders the diagram line by line; pprint() would display the escaped
# repr of the string (quotes and "\n" sequences) instead of the layout.
print('''
          Prediction
       -----------------
      |   TN   |   FP   |
Actual -----------------
      |   FN   |   TP   |
       -----------------

C_{0,0} --> TN  
C_{1,0} --> FN  
C_{0,1} --> FP  
C_{1,1} --> TP  

Precision = TP/(TP+FP)
Recall = TP/(TP+FN)

''')

In [None]:
index = ['Negative','Positive']

cm_df = pd.DataFrame(cm,columns=index, index=index)

cm_df

In [None]:
# another way to calculate percision is
# TP/TP+FP
f'Precision: {cm[1,1]/(cm[1,1]+cm[1,0])}'

In [None]:
# another way to calculate recall is
# TP/TP+FN
f'Recall: {cm[1,1]/(cm[1,1]+cm[0,1])}'