In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

# MNIST

In [2]:
# Loading MNIST data

from six.moves import urllib
from sklearn.datasets import fetch_mldata
import requests
requests.packages.urllib3.disable_warnings()

"""
Adapted from the Github repo:
https://github.com/ageron/handson-ml
for the 03_classification notebook.
This implementation uses the 'requests' package instead of URLLIB
"""

try:
    mnist = fetch_mldata('MNIST original')
except urllib.error.HTTPError as ex:
    print("Could not download MNIST data from mldata.org, trying alternative...")

    # Alternative method to load MNIST, if mldata.org is down
    from scipy.io import loadmat
    mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
    mnist_path = "./mnist-original.mat"
    response = requests.get(mnist_alternative_url)
    with open(mnist_path, "wb") as f:
        content = response.content
        f.write(content)
    mnist_raw = loadmat(mnist_path)
    mnist = {
        "data": mnist_raw["data"].T,
        "target": mnist_raw["label"][0],
        "COL_NAMES": ["label", "data"],
        "DESCR": "mldata.org dataset: mnist-original",
    }
    print("Success!")

OSError: could not read bytes

In [11]:
mnist

Datasets(train=<tensorflow.contrib.learn.python.learn.datasets.mnist.DataSet object at 0x127390978>, validation=<tensorflow.contrib.learn.python.learn.datasets.mnist.DataSet object at 0x1273907b8>, test=<tensorflow.contrib.learn.python.learn.datasets.mnist.DataSet object at 0x1273909b0>)

In [12]:
X, Y = mnist["data"], mnist["target"]

TypeError: tuple indices must be integers or slices, not str

In [None]:
X.shape

In [None]:
Y.shape

In [None]:
import matplotlib
import matplotlib.pyplot as plt

some_digit = X[36000]
some_digit_image = some_digit.reshape(28,28)

plt.imshow(some_digit_image, cmap=matplotlib.cm.binary, interpolation="nearest")
plt.axis("off")
plt.show()

In [None]:
Y[36000]
X[36000]

In [None]:
# Separating into test and train sets

X_train, X_test, Y_train, Y_test = X[:60000], X[60000:], Y[:60000], Y[60000:]

In [None]:
import numpy as np

shuffle_index = np.random.permutation(60000)
X_train, Y_train = X_train[shuffle_index], Y_train[shuffle_index]

In [None]:
Y_train[36000]

In [None]:
Y_train_5 = (Y_train==5)
Y_test_5 = (Y_test==5)

In [None]:
Y_train_5[36000]

In [None]:
#SGD binary classifier (fitting and prediction)

from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(max_iter=5, random_state = 42)
sgd_clf.fit(X_train, Y_train_5)

In [None]:
sgd_clf.predict([X_train[36000]])

In [None]:
#Accuracy: Cross-validation of SGD binary classifier

from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, X_train, Y_train_5, cv=3, scoring="accuracy")

In [None]:
#Confusion Matrix: Precision & Recall

from sklearn.model_selection import cross_val_predict

Y_train_pred = cross_val_predict(sgd_clf, X_train, Y_train_5, cv=3)

from sklearn.metrics import confusion_matrix

confusion_matrix(Y_train_5, Y_train_pred)

In [None]:
#Precision & Recall for all trains

Y_scores = cross_val_predict(sgd_clf, X_train, Y_train_5, cv=3, method="decision_function")

from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(Y_train_5, Y_scores)

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="center left")
    plt.axis([-700000,700000,0,1])

plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

In [None]:
# Scores (Precision & Recall) by thresholds

from sklearn.metrics import precision_score, recall_score

Y_train_predic_90 = (Y_scores > 70000)

precision_score(Y_train_5, Y_train_predic_90)

In [None]:
recall_score(Y_train_5, Y_train_predic_90)

In [None]:
#ROC curve

from sklearn.metrics import roc_curve

fpr, tpr, threshold = roc_curve(Y_train_5, Y_scores)

def plot_roc_curve(fpr,tpr,label=None):
    plt.plot(fpr,tpr, linewidth=2, label=label)
    plt.plot([0,1],[0,1],'k--')
    plt.axis([0,1,0,1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    
plot_roc_curve(fpr,tpr)
plt.show()

In [None]:
#Area Under Curve

from sklearn.metrics import roc_auc_score

roc_auc_score(Y_train_5, Y_scores)

In [None]:
# Multiclass classifier  

sgd_clf.fit(X_train, Y_train)
sgd_clf.predict([some_digit])

In [None]:
# It trained 10 binary classifier for each number

some_digit_scores=sgd_clf.decision_function([some_digit])
some_digit_scores

In [None]:
np.argmax(some_digit_scores)

In [None]:
sgd_clf.classes_

In [None]:
#Random Forest Classifier (multi-class classifier)

from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train,Y_train)
forest_clf.predict([some_digit])

In [None]:
forest_clf.predict_proba([some_digit])

In [None]:
cross_val_score(sgd_clf, X_train, Y_train, cv=3, scoring="accuracy")

In [None]:
cross_val_score(forest_clf, X_train, Y_train, cv=3, scoring="accuracy")

In [None]:
# Scaling inputs

from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
X_train_scale = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scale, Y_train, cv=3, scoring="accuracy")

In [None]:
  #Scaling doesn't help RandomForest
cross_val_score(forest_clf, X_train_scale, Y_train, cv=3, scoring="accuracy")

In [None]:
#Errors for multiclass
Y_train_pred = cross_val_predict(sgd_clf, X_train_scale, Y_train, cv=3)
conf_mx = confusion_matrix(Y_train, Y_train_pred)
conf_mx

In [None]:
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()

In [None]:
row_sums = conf_mx.sum(axis=1, keepdims=True)
row_sums

In [None]:
norm_conf_mx = conf_mx / row_sums
np.fill_diagonal(norm_conf_mx,0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()

In [None]:
# multilabel classification

from sklearn.neighbors import KNeighborsClassifier

Y_train_large = (Y_train >=7)
Y_train_odd = (Y_train%2==1)
Y_multilabel = np.c_[Y_train_large, y_train_odd]

knn_clf = KNeighborsClassifier()
knn_clf.fit( X_train, Y_multilabel)

knn_clf.predict([some_digit])