In [2]:
import numpy as np
import pandas as pd

In [2]:
# MNIST: load the `mnist_784` dataset (version 1) from OpenML
from sklearn.datasets import fetch_openml

mnist = fetch_openml(name='mnist_784', version=1)
# Show which fields the returned bunch exposes
mnist.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'details', 'categories', 'url'])

In [3]:
# Features and labels come straight out of the fetched bunch
x = mnist['data']
y = mnist['target']

# The first 60,000 rows form the training set; everything after is the test set
x_train, y_train = x[:60000], y[:60000]
x_test, y_test = x[60000:], y[60000:]

In [4]:
# Sanity check: shapes of the four splits, in (x_train, x_test, y_train, y_test) order
tuple(split.shape for split in (x_train, x_test, y_train, y_test))

((60000, 784), (10000, 784), (60000,), (10000,))

In [5]:
# Question 1

In [7]:
from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so construction and training chain into one statement
knn_clf = KNeighborsClassifier().fit(x_train, y_train)
# Display the fitted estimator and its (default) hyperparameters
knn_clf

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [10]:
# Performance validation: out-of-fold predictions on the training set, then the confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

# 5-fold cross-validation: every training sample is predicted by a model fitted on the other folds
y_train_pred = cross_val_predict(estimator=knn_clf, X=x_train, y=y_train, cv=5)
confusion_matrix(y_true=y_train, y_pred=y_train_pred)

array([[5883,    3,    5,    0,    1,    6,   16,    2,    2,    5],
       [   1, 6710,    8,    1,    2,    0,    3,   14,    1,    2],
       [  40,   60, 5713,   13,    8,    4,   10,   83,   18,    9],
       [   8,   15,   40, 5920,    1,   54,    3,   33,   34,   23],
       [   3,   52,    2,    1, 5639,    0,   14,   12,    1,  118],
       [  16,   13,    4,   75,    5, 5208,   64,    8,    5,   23],
       [  26,   13,    2,    0,    7,   21, 5848,    0,    1,    0],
       [   3,   68,   15,    2,   14,    1,    0, 6092,    3,   67],
       [  20,   72,   16,   85,   30,   95,   26,   18, 5429,   60],
       [  17,   11,    5,   40,   58,   11,    3,   78,   10, 5716]],
      dtype=int64)

In [14]:
from sklearn.metrics import precision_score, recall_score

# (precision, recall) on the cross-validated predictions, both micro-averaged
tuple(
    metric(y_train, y_train_pred, average='micro')
    for metric in (precision_score, recall_score)
)

(0.9693, 0.9693)

In [15]:
from sklearn.metrics import f1_score

# Micro-averaged F1 on the same cross-validated predictions
f1_score(y_true=y_train, y_pred=y_train_pred, average='micro')

0.9693

In [None]:
# Tuning: exhaustive search over the neighbour count and the vote weighting
from sklearn.model_selection import GridSearchCV

param_grid = dict(
    weights=['uniform', 'distance'],
    n_neighbors=[3, 5, 7, 9],
)

# Fresh, unfitted estimator handed to the search as its base estimator
knn_clf = KNeighborsClassifier()
knn_clf_grid = GridSearchCV(
    estimator=knn_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,          # 3 folds x 8 parameter combinations = 24 fits
    n_jobs=-1,     # use every available core
    verbose=3,
)

knn_clf_grid.fit(x_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
# Model inspection: the tuned model is stored on the fitted search object
# (`knn_clf_grid`); the plain `knn_clf` estimator has no `best_estimator_`.
knn_clf_grid.best_estimator_

In [None]:
# Test set evaluation
# Predict through the fitted grid search, which delegates to the best estimator
# refit on the full training set. `knn_clf` from the tuning cell is only the
# template the search cloned, so it was never fitted itself.
y_test_pred = knn_clf_grid.predict(x_test)

In [None]:
# Test set accuracy
from sklearn.metrics import accuracy_score

# Fraction of test digits whose predicted label matches the true label
accuracy_score(y_test, y_test_pred)

In [None]:
# Question 2

In [None]:
# Image augmentation
from scipy.ndimage import shift


def shift_images(images, dy, dx):
    """Shift a batch of flattened 28x28 images by whole pixels.

    Parameters
    ----------
    images : array-like of shape (n_samples, 784)
        Flattened images, one per row (ndarray or DataFrame).
    dy : int
        Rows to move the content down (negative moves it up).
    dx : int
        Columns to move the content right (negative moves it left).

    Returns
    -------
    numpy.ndarray of shape (n_samples, 784)
        Shifted copies; pixels vacated by the shift are filled with 0.
    """
    flat = np.asarray(images)
    # The shift must act on the 2-D pixel grid, not on the flattened 784-vector
    grid = flat.reshape(-1, 28, 28)
    # Leading 0 leaves the sample axis alone; order=0 skips spline interpolation,
    # which keeps whole-pixel shifts exact and fast
    moved = shift(grid, [0, dy, dx], order=0, mode='constant', cval=0)
    return moved.reshape(flat.shape)


# Shifting right, left, down, up 1 pixel
x_train_rs = shift_images(x_train, 0, 1)
x_train_ls = shift_images(x_train, 0, -1)
x_train_ds = shift_images(x_train, 1, 0)
x_train_us = shift_images(x_train, -1, 0)

# Concatenating to the training set (original images first, then the four shifted copies)
x_train_augmented = np.concatenate(
    [np.asarray(x_train), x_train_rs, x_train_ls, x_train_ds, x_train_us]
)
# A shift does not change the digit, so the labels are simply repeated in the same order
y_train_augmented = np.concatenate([np.asarray(y_train)] * 5)


In [1]:
# Question 3

In [3]:
import os
import tarfile
import urllib

In [None]:
# Location of the SpamAssassin public corpus archives and the local target directory
download_root = 'https://spamassassin.apache.org/old/publiccorpus/'
ham_url = download_root + '20030228_hard_ham.tar.bz2'
spam_url = download_root + '20050311_spam_2.tar.bz2'
spam_path = os.path.join('datasets', 'spam')

def fetch_spam_data(spam_url=spam_url, spam_path=spam_path, ham_url=ham_url):
    """Download (if missing) and extract the ham and spam archives.

    Parameters
    ----------
    spam_url : str
        URL of the spam ``.tar.bz2`` archive.
    spam_path : str
        Directory that receives the downloaded archives and their extracted
        contents; created if it does not exist.
    ham_url : str
        URL of the ham ``.tar.bz2`` archive.

    Notes
    -----
    An archive already present in ``spam_path`` is not downloaded again, but
    it is re-extracted on every call.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is loaded
    import urllib.request

    os.makedirs(spam_path, exist_ok=True)
    for filename, url in (('ham.tar.bz2', ham_url), ('2.tar.bz2', spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): extractall() trusts the member paths inside the archive;
        # only point this function at archives from a trusted source.
        with tarfile.open(path) as tar_bz2_file:
            tar_bz2_file.extractall(path=spam_path)

In [None]:
# Download and unpack both corpora into the default dataset directory
fetch_spam_data()

In [None]:
# Directories expected to hold the extracted ham and spam messages
ham_dir = os.path.join(spam_path, 'hard_ham')
spam_dir = os.path.join(spam_path, 'spam_2')
# Sorted listing keeps the file order deterministic across runs.
# NOTE(review): this includes every entry in the directory; filter the list
# if the archive ships non-message files alongside the emails.
ham_filenames = sorted(os.listdir(ham_dir))