In [3]:
import pandas as pd
import numpy as np

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Enlarge axis-label and tick-label fonts for every figure drawn below.
mpl.rcParams.update({
    "axes.labelsize": 14,
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
})

In [None]:
def plot_digit(data):
    """Draw one 28x28 digit on the current axes and hide the axis frame.

    `data` must expose ``reshape`` and hold exactly 784 values; a flat pixel
    row and an already-square image both qualify.
    """
    pixel_grid = data.reshape((28, 28))
    render_options = {"cmap": mpl.cm.binary, "interpolation": "nearest"}
    plt.imshow(pixel_grid, **render_options)
    plt.axis("off")

In [None]:
# EXTRA
def plot_digits(instances, images_per_row=10, **options):
    """Tile 28x28 digit images into one grid and draw it on the current axes.

    Parameters
    ----------
    instances : sequence of arrays
        Each item must expose ``reshape`` and hold 784 values.
    images_per_row : int, default 10
        Maximum number of digits per grid row; clamped to the range
        [1, len(instances)].
    **options
        Forwarded to ``plt.imshow``. ``cmap`` defaults to ``mpl.cm.binary``
        and may be overridden by the caller.

    Notes
    -----
    An empty ``instances`` draws nothing and only hides the axes; previously
    it raised ZeroDivisionError.
    """
    size = 28
    n_instances = len(instances)
    if n_instances == 0:
        # Nothing to tile: leave a blank panel instead of dividing by zero below.
        plt.axis("off")
        return
    images_per_row = max(1, min(n_instances, images_per_row))
    images = [instance.reshape(size, size) for instance in instances]
    n_rows = (n_instances - 1) // images_per_row + 1
    # Blank slots needed to make the last row as wide as the others.
    n_empty = n_rows * images_per_row - n_instances
    if n_empty:
        images.append(np.zeros((size, size * n_empty)))
    row_images = [
        np.concatenate(images[row * images_per_row:(row + 1) * images_per_row], axis=1)
        for row in range(n_rows)
    ]
    image = np.concatenate(row_images, axis=0)
    # setdefault keeps the original colormap unless the caller supplies one.
    options.setdefault("cmap", mpl.cm.binary)
    plt.imshow(image, **options)
    plt.axis("off")

In [None]:
from sklearn.datasets import fetch_openml

In [None]:
# Ask for plain NumPy arrays. From scikit-learn 0.24 the default (as_frame="auto")
# returns pandas objects for this dataset, which breaks the positional indexing
# used below (X[36000], X[:60000], X_train[shuffle_index]).
mnist = fetch_openml('mnist_784', version=1, return_X_y=False, as_frame=False)

In [None]:
# Pull the feature matrix and the label vector out of the fetched bunch,
# then report the dimensions of each on its own line.
X = mnist["data"]
y = mnist["target"]
print(X.shape, y.shape, sep="\n")

In [None]:
# Pick one example (row 36000) and fold its flat pixel vector into a 28x28 grid.
some_digit = X[36000]
some_digit_image = some_digit.reshape(28, 28)

In [None]:
# plot_digit (defined above) performs the same imshow + axis("off") calls;
# reshaping the already 28x28 image inside it is a no-op.
plot_digit(some_digit_image)
plt.show()

In [None]:
# Label stored for the example digit picked above.
y[36000]

In [None]:
# The first 60,000 rows become the training set, the remaining rows the test set.
X_train, y_train = X[:60000], y[:60000]
X_test, y_test = X[60000:], y[60000:]

In [None]:
# Shuffle the training rows with a fixed seed so every fresh run sees the same
# order (the original unseeded permutation changed on each kernel restart).
# The seed matches the random_state=32 used for the estimators in this notebook.
# NOTE: re-running this cell shuffles the already shuffled arrays again.
shuffle_index = np.random.RandomState(32).permutation(len(X_train))
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

In [None]:
# Boolean targets for the binary "is this digit a 9?" task; the labels are
# compared against the string '9'.
y_train_9, y_test_9 = (labels == '9' for labels in (y_train, y_test))

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
# Fit an SGD classifier on the binary "9 vs. not 9" target.
# random_state is fixed so repeated fits on the same data agree.
sgd_clf = SGDClassifier(random_state=32)
sgd_clf.fit(X_train, y_train_9)

In [None]:
# Predict for the example digit; wrapping it in a list makes a one-row batch.
sgd_clf.predict([some_digit])

In [None]:
from sklearn.model_selection import cross_val_score
# cross_val_score(sgd_clf, X_train, y_train_9, cv=3, scoring="accuracy")

In [None]:
from sklearn.base import BaseEstimator

In [None]:
class Never9Classifier(BaseEstimator):
    """Baseline estimator that answers "not a 9" (False) for every sample."""

    def fit(self, X, y=None):
        """Learn nothing and return ``self``.

        Returning the estimator (rather than None) follows the scikit-learn
        estimator convention, so calls can be chained and wrapping utilities
        that rely on the return value of ``fit`` keep working.
        """
        return self

    def predict(self, X):
        """Return a boolean column of shape (len(X), 1) filled with False."""
        return np.zeros((len(X), 1), dtype=bool)

In [None]:
# Instantiate the always-False baseline; the cross-validation call that would
# score it is left commented out below.
never_9_clf = Never9Classifier()
# cross_val_score(never_9_clf, X_train, y_train_9, cv=3, scoring="accuracy")

In [None]:
from sklearn.model_selection import cross_val_predict
# Cross-validated predictions (2 folds) of the binary SGD classifier for every
# training sample.
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_9, cv=2)

In [None]:
# Sanity check: predictions and targets are printed side by side to compare shapes.
print(y_train_pred.shape, y_train_9.shape)

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the binary task: true labels first, predictions second.
cm = confusion_matrix(y_train_9, y_train_pred)

In [None]:
from sklearn.utils.multiclass import unique_labels

In [None]:
# Labels that occur in either the targets or the predictions.
classes = unique_labels(y_train_9, y_train_pred)

In [None]:
classes

In [None]:
# Show the binary confusion matrix as raw counts and as a blue-shaded heat map.
print(cm)
plt.imshow(cm, cmap=plt.cm.Blues)
plt.show()

In [None]:
# Unpack the four cells of the flattened confusion matrix.
# Assumes cm is 2x2 with rows = true class, columns = predicted class and the
# negative class first (scikit-learn's layout), so the order is TN, FP, FN, TP.
TN, FP, FN, TP = cm.ravel()

In [None]:
# precision: share of predicted positives that are correct.
# recall: share of actual positives that were found.
precision = TP / (TP + FP)
recall    = TP / (TP + FN)

In [None]:
# Hand-computed precision and recall.
print(precision, recall)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
# The library versions of the same two metrics, for comparison with the values
# computed by hand above.
print(precision_score(y_train_9, y_train_pred), recall_score(y_train_9, y_train_pred))

In [None]:
# F1 is the harmonic mean of precision and recall; print the hand-computed
# value next to scikit-learn's f1_score.
f1_ = 2 * (precision * recall) / (precision + recall)
print(f1_, f1_score(y_train_9, y_train_pred))

In [None]:
# Same 2-fold cross-validation as before, but collecting decision_function
# scores instead of class predictions so that thresholds can be varied.
y_scores = cross_val_predict(sgd_clf, X_train, y_train_9, cv=2, method="decision_function")

In [None]:
from sklearn.metrics import precision_recall_curve
# Precision and recall of the SGD scores at each candidate threshold.
precisions, recalls, thresholds = precision_recall_curve(y_train_9, y_scores)

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision (dashed blue) and recall (solid green) against the threshold.

    The last entry of `precisions` and of `recalls` is dropped before plotting,
    so each is expected to be one element longer than `thresholds`. The y-axis
    is fixed to [0, 1].
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_style, legend_name in curves:
        plt.plot(thresholds, values[:-1], line_style, label=legend_name)
    plt.xlabel("Threshold")
    plt.legend(loc="center left")
    plt.ylim([0, 1])

In [None]:
# Draw precision and recall as functions of the decision threshold.
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

In [None]:
# Scratch check: show what [:-1] does on a small list, then print the lengths
# of thresholds, precisions and recalls (the plotting helper trims the last
# entry of the latter two).
data = [1, 2, 3]
print(data, data[:-1])
print(len(thresholds), len(precisions), len(recalls))

In [None]:
# Precision plotted directly against recall.
plt.plot(recalls, precisions)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.show()

In [None]:
from sklearn.metrics import roc_curve
# The ROC thresholds get their own name so they do not overwrite the
# `thresholds` array returned by precision_recall_curve; re-running the
# precision/recall plot after this cell would otherwise mix arrays of
# different lengths.
fpr, tpr, roc_thresholds = roc_curve(y_train_9, y_scores)

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve together with the dashed chance diagonal.

    `fpr` and `tpr` are the x and y values of the curve; `label` is passed on
    as the legend entry. Both axes are fixed to the range [0, 1].
    """
    unit_span = [0, 1]
    curve_style = {"linewidth": 2, "label": label}
    plt.plot(fpr, tpr, **curve_style)
    plt.plot(unit_span, unit_span, "k--")
    plt.axis(unit_span + unit_span)
    plt.xlabel("False positive rate")
    plt.ylabel("True positive rate")
    
# ROC curve of the SGD classifier scores.
plot_roc_curve(fpr, tpr)
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve for the SGD scores.
roc_auc_score(y_train_9, y_scores)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# A 10-tree random forest on the same binary task. It is asked for
# predict_proba output, which yields one probability column per class.
forest_clf = RandomForestClassifier(random_state=32, n_estimators=10)
y_scores_proba = cross_val_predict(forest_clf, X_train, y_train_9, cv=2, method="predict_proba")

In [None]:
# Use the second probability column (the positive class) as the score for the ROC curve.
y_scores_forest = y_scores_proba[:, 1] # probabilities of positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_9, y_scores_forest)

In [None]:
# Overlay both ROC curves: SGD as a dotted blue line, the forest through the
# shared helper (which also adds the diagonal and axis labels).
plt.plot(fpr, tpr, "b:", label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, label="Random Forest")
plt.legend()
plt.show()

In [None]:
# Area under the ROC curve for the random-forest scores.
roc_auc_score(y_train_9, y_scores_forest)

In [None]:
# Refit the same estimator object on the full digit labels (multiclass).
# NOTE: this replaces the binary "9 vs. not 9" model fitted earlier.
sgd_clf.fit(X_train, y_train)

In [None]:
# Predicted class and the per-class decision scores for the example digit.
prediction = sgd_clf.predict([some_digit])
some_digit_scores = sgd_clf.decision_function([some_digit])

In [None]:
# Index of the highest score, the raw scores, the prediction next to the
# stored label, and the class order the classifier uses for its score columns.
print(np.argmax(some_digit_scores))
print(some_digit_scores)
print(prediction, y[36000])
print(sgd_clf.classes_)

In [None]:
from sklearn.multiclass import OneVsOneClassifier
# Wrap a fresh SGD classifier in the one-vs-one multiclass strategy and fit it
# on the digit labels.
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=32))
ovo_clf.fit(X_train, y_train)

In [None]:
# Prediction for the example digit and the number of underlying binary
# estimators (presumably one per pair of classes; confirm against the output).
print(ovo_clf.predict([some_digit]))
print(len(ovo_clf.estimators_))

In [None]:
# Fit the random forest on the full digit labels (multiclass).
forest_clf.fit(X_train, y_train)

In [None]:
# Forest prediction and per-class probabilities for the example digit.
print(forest_clf.predict([some_digit]))
print(forest_clf.predict_proba([some_digit]))

In [None]:
# cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

In [None]:
# cross_val_score(forest_clf, X_train, y_train, cv=3, scoring="accuracy")

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

# One-step pipeline that standardizes the features.
# NOTE: `pl` is rebound to a MinMaxScaler pipeline in a later cell.
pl = Pipeline([
   ("scaler", StandardScaler())
])

In [None]:
# cross_val_score(sgd_clf, pl.fit_transform(X_train), y_train, cv=3, scoring="accuracy")

In [None]:
# Replace the pipeline with min-max scaling and produce the scaled training set.
pl = Pipeline([
   ("scaler", MinMaxScaler())
])
X_train_scaled = pl.fit_transform(X_train)

In [None]:

# cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")

In [None]:
# Cross-validated multiclass predictions on the scaled features.
# NOTE: this overwrites the binary y_train_pred computed earlier.
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=2)

In [None]:
# Multiclass confusion matrix (replaces the earlier binary `cm`).
cm = confusion_matrix(y_train, y_train_pred)

In [None]:
cm

In [None]:
# Grayscale image of the raw confusion counts.
plt.matshow(cm, cmap=plt.cm.gray)
plt.show()

In [None]:
# Divide every row by its total so each row holds rates instead of counts.
row_sums = cm.sum(axis=1, keepdims=True)
norm_cm = cm / row_sums
# A bare name in the middle of a cell produces no output; this line is a no-op.
norm_cm

# Zero the diagonal (correct predictions) so only the errors remain visible.
np.fill_diagonal(norm_cm, 0)
plt.matshow(norm_cm, cmap=plt.cm.gray)
plt.show()

In [None]:
# Split the training images of two classes by (true label, predicted label):
# X_aa and X_bb are classified correctly, X_ab and X_ba are confused with the
# other class.
# NOTE(review): the labels are compared as strings here, so this cell relies
# on y_train not yet having been cast to integers (a later cell does that).
cl_a, cl_b = '3', '5'
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]

In [None]:
# 2x2 panel of up to 25 digits per group, five per row. Top row: true class a
# (predicted a, predicted b); bottom row: true class b (predicted a, predicted b).
plt.figure(figsize=(8, 8))
plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)
plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)
plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)
plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)
plt.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Cast the labels to integers so they can be compared numerically.
# NOTE(review): this rebinds y_train; earlier cells that compare y_train with
# string labels will behave differently if re-run after this cell.
y_train = y_train.astype(np.int32)
# Two boolean targets per digit: "is 7 or larger" and "is odd".
y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
# Stack the two targets as columns to form a multilabel target matrix.
y_multilabel = np.c_[y_train_large, y_train_odd]
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)

In [None]:
# Multilabel prediction ("large", "odd") for the example digit.
pred = knn_clf.predict([some_digit])
print(pred)

In [None]:
# y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3)

In [None]:
# f1_score(y_multilabel, y_train_knn_pred, average="macro")

In [None]:
# f1_score(y_multilabel, y_train_knn_pred, average="weighted")

### Multioutput classification

In [None]:
# Build a denoising task: the inputs are the digits plus random integer noise
# in [0, 100), the targets are the original clean digits.
# A seeded generator replaces the global unseeded one so the noisy images are
# the same on every fresh run.
noise_rng = np.random.RandomState(32)
noise = noise_rng.randint(0, 100, X_train.shape)
X_train_mod = X_train + noise
noise = noise_rng.randint(0, 100, X_test.shape)
X_test_mod = X_test + noise
y_train_mod = X_train
y_test_mod = X_test

In [None]:
# Noisy test digit (left) next to its clean target (right).
plt.subplot(221); plot_digit(X_test_mod[6000])
plt.subplot(222); plot_digit(y_test_mod[6000])
plt.show()

In [None]:
# Refit the KNN estimator to map noisy images to clean images; every pixel of
# the target is one output. This replaces the multilabel fit above.
knn_clf.fit(X_train_mod, y_train_mod)

In [None]:
# Predicted clean image for one noisy test digit (left) next to the true clean
# image (right).
clean_digit = knn_clf.predict([X_test_mod[6000]])
plt.subplot(221); plot_digit(clean_digit)
plt.subplot(222); plot_digit(y_test_mod[6000])
plt.show()

### Exercise 1
Achieving over 97% accuracy

In [None]:
# Distance-weighted 4-nearest-neighbour classifier on the integer digit labels.
# n_jobs=-1 asks scikit-learn to use all available cores.
knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=4, n_jobs=-1)
knn_clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score
# Predict the held-out test set.
y_test_pred = knn_clf.predict(X_test)

In [None]:
# y_test is cast to int32 here to match the integer labels the model was trained on.
accuracy_score(y_test.astype(np.int32), y_test_pred)

In [None]:
#cross_val_score(knn_clf, X_train_scaled, y_train, cv=2, scoring="accuracy")

In [None]:
# from sklearn.model_selection import GridSearchCV

# # param_grid = [{
# #     "weights": ["uniform", "distance"], 
# #     "n_neighbors": [3, 5, 7, 9]
# # }]

# param_grid = [{
#     "weights": ["uniform"], 
#     "n_neighbors": [7]
# }]

# grid_search = GridSearchCV(knn_clf, param_grid, cv=3, scoring="accuracy", verbose=2, n_jobs=2)
# grid_search.fit(X_train, y_train)

In [None]:
# cv_results = grid_search.cv_results_
# print(cv_results)
# for mean_score, params in zip(cv_results["mean_test_score"], cv_results["params"]):
#     print(mean_score, params)  # scoring="accuracy": report the score as-is (no sqrt of a negated error)

In [None]:
# Unshifted reference image of the example digit.
plot_digit(X[36000])

In [None]:
def shift_image(image, direction, image_size=(28, 28), pixels=1):
    """Shift an image in one direction and return it flattened.

    Parameters
    ----------
    image : array
        Must expose ``reshape`` and hold image_size[0] * image_size[1] values.
    direction : {"left", "right", "up", "down"}
        Direction in which the image content moves.
    image_size : (rows, columns), default (28, 28)
        Shape the image is reshaped to before shifting.
    pixels : number, default 1
        Shift distance in pixels.

    Returns
    -------
    1-D array with the shifted image. Vacated pixels are filled according to
    the defaults of ``scipy.ndimage.shift``.

    Raises
    ------
    ValueError
        If `direction` is not one of the four supported names. Previously any
        unknown value was silently treated as "down".
    """
    # Imported locally because the notebook only imports scipy.ndimage in a
    # later cell than the first use of this function; a fresh "Run All" would
    # otherwise fail with NameError.
    from scipy import ndimage

    n_rows, n_cols = image_size
    # Offsets are [row shift, column shift]: negative rows move up, negative
    # columns move left.
    offsets = {
        "left": [0.0, -pixels],
        "right": [0.0, pixels],
        "up": [-pixels, 0.0],
        "down": [pixels, 0.0],
    }
    if direction not in offsets:
        raise ValueError(
            "direction must be one of 'left', 'right', 'up', 'down'; got %r" % (direction,)
        )
    shifted_image = ndimage.shift(image.reshape(n_rows, n_cols), offsets[direction])
    return shifted_image.ravel()

def shift_left(image):
    """Return `image` shifted one pixel to the left via shift_image."""
    return shift_image(image, direction="left")

def shift_right(image):
    """Return `image` shifted one pixel to the right via shift_image."""
    return shift_image(image, direction="right")

def shift_up(image):
    """Return `image` shifted one pixel up via shift_image."""
    return shift_image(image, direction="up")

def shift_down(image):
    """Return `image` shifted one pixel down via shift_image."""
    return shift_image(image, direction="down")

In [None]:
# vshift_left = np.vectorize(shift_left)
# vshift_right = np.vectorize(shift_right)
# vshift_up = np.vectorize(shift_up)
# vshift_down = np.vectorize(shift_down)

In [None]:
# One shifted copy of every training image per direction, each kept as a list
# of flat arrays.
up_1px = list(map(shift_up, X_train))
down_1px = list(map(shift_down, X_train))
left_1px = list(map(shift_left, X_train))
right_1px = list(map(shift_right, X_train))

In [None]:
# Augmented training set: the original images followed by the four shifted
# copies, with the labels repeated once per copy in the same order.
X_train_aug = np.concatenate(
    [X_train] + [np.array(batch) for batch in (up_1px, down_1px, left_1px, right_1px)]
)
y_train_aug = np.concatenate([y_train] * 5)

In [None]:
# Shapes of the augmented features and labels, for a quick size check.
print(X_train_aug.shape)
print(y_train_aug.shape)

In [None]:
# Same KNN configuration as before, now trained on the augmented set.
knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=4, n_jobs=-1)
knn_clf.fit(X_train_aug, y_train_aug)

In [None]:
# Test-set predictions of the model trained on augmented data (overwrites the
# earlier y_test_pred).
y_test_pred = knn_clf.predict(X_test)

In [None]:
# Test accuracy after augmentation; y_test is cast to int32 to match the
# integer training labels.
accuracy_score(y_test.astype(np.int32), y_test_pred)

In [None]:
import scipy.ndimage as ndimage

# Shift the example digit one pixel down. X[36000] is used (not X_train[36000])
# so this is the same digit that is plotted unshifted above and shifted up
# below; after shuffling, X_train[36000] is an unrelated sample.
shifted_img = ndimage.shift(X[36000].reshape(28,28), [1.0, 0.0])
plot_digit(shifted_img)

In [None]:
# Shift the example digit one pixel up (negative offset along the row axis).
shifted_img = ndimage.shift(X[36000].reshape(28,28), [-1.0, 0.0])
plot_digit(shifted_img)