In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

RS = 42

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [2]:
def sort_by_target(mnist):
    reorder_train = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[:40000])]))[:, 1]
    reorder_test = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[40000:])]))[:, 1]
    mnist.data[:40000] = mnist.data[reorder_train]
    mnist.target[:40000] = mnist.target[reorder_train]
    mnist.data[40000:] = mnist.data[reorder_test + 40000]
    mnist.target[40000:] = mnist.target[reorder_test + 40000]

In [3]:
def plot_digit(data):
    image = data.reshape(28, 28)
    plt.imshow(image, cmap = mpl.cm.binary,
               interpolation="nearest")
    plt.axis("off")

In [4]:
try:
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)
    mnist.target = mnist.target.astype(np.int8) # fetch_openml() returns targets as strings
    sort_by_target(mnist) # fetch_openml() returns an unsorted dataset
except ImportError:
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')

In [5]:
X, y = mnist["data"], mnist["target"]

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RS)

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipeline_1 = Pipeline([
    ('scaler', StandardScaler()),
])

X_train_standarted = pipeline_1.fit_transform(X_train)

In [8]:
from sklearn import neighbors # k near neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=10)

In [9]:
%%time
from sklearn.model_selection import RandomizedSearchCV
params = {
    "weights": ["uniform", "distance"], 
#     "algorithm":["auto", "ball_tree", "kd_tree", "brute"]
#     "algorithm":["auto", "kd_tree"]
}
rsearch = RandomizedSearchCV(estimator=knn, param_distributions=params, 
#                              cv=3, n_iter=8,random_state=RS, scoring="accuracy")
                             cv=2, n_iter=2,random_state=RS, scoring="accuracy")
rsearch.fit(X_train_standarted, y_train)

CPU times: user 1h 4min 7s, sys: 3.36 s, total: 1h 4min 11s
Wall time: 1h 4min 32s


RandomizedSearchCV(cv=2, error_score=nan,
                   estimator=KNeighborsClassifier(algorithm='auto',
                                                  leaf_size=30,
                                                  metric='minkowski',
                                                  metric_params=None,
                                                  n_jobs=None, n_neighbors=10,
                                                  p=2, weights='uniform'),
                   iid='deprecated', n_iter=2, n_jobs=None,
                   param_distributions={'weights': ['uniform', 'distance']},
                   pre_dispatch='2*n_jobs', random_state=42, refit=True,
                   return_train_score=False, scoring='accuracy', verbose=0)

In [10]:
print(rsearch.best_score_)
print(rsearch.best_estimator_)

0.9340892857142857
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='distance')


In [11]:
import pickle
filename = 'best_knn_model.sav'
pickle.dump(rsearch.best_estimator_, open(filename, 'wb'))

In [12]:
import pickle
filename = 'best_knn_model.sav'
best_knn_model = pickle.load(open(filename, 'rb'))

In [13]:
pipeline_2 = Pipeline([
    ('scaler', StandardScaler()),
])

X_test_standarted = pipeline_2.fit_transform(X_test)

In [14]:
y_pred_proba = best_knn_model.predict_proba(X_test_standarted)
y_pred = best_knn_model.predict(X_test_standarted)

In [15]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97      1393
           1       0.95      0.99      0.97      1534
           2       0.96      0.92      0.94      1439
           3       0.94      0.94      0.94      1469
           4       0.95      0.95      0.95      1328
           5       0.93      0.94      0.93      1243
           6       0.96      0.97      0.96      1351
           7       0.93      0.94      0.93      1465
           8       0.96      0.89      0.93      1392
           9       0.92      0.92      0.92      1386

    accuracy                           0.94     14000
   macro avg       0.94      0.94      0.94     14000
weighted avg       0.94      0.94      0.94     14000

