# Chapter 7 Exercises

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

def save_fig(fig_id, tight_layout=True):
    path = os.path.join("images", fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

Load the MNIST data (introduced in Chapter 3), and split it into a training set, a validation set, and a test set (e.g., use the first 40,000 instances for training, the next 10,000 for validation, and the last 10,000 for testing). Then train various classifiers, such as a Random Forest classifier, an Extra-Trees classifier, and an SVM. Next, try to combine them into an ensemble that outperforms them all on the validation set, using a soft or hard voting classifier. Once you have found one, try it on the test set. How much better does it perform compared to the individual classifiers?

In [24]:
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original')

In [25]:
X, y = mnist["data"], mnist["target"]

In [33]:
X_train, X_val, X_test, y_train, y_val, y_test = X[:60000], X[60000:65000], X[65000:70000], y[:60000], y[60000:65000], y[65000:70000]

In [34]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=3)
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=3, p=2,
           weights='distance')

In [37]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)
forest_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [38]:
from sklearn.svm import LinearSVC
lin_clf = LinearSVC(random_state=42)
lin_clf.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)

In [39]:
from sklearn.metrics import accuracy_score

y_pred_knn = knn_clf.predict(X_test)
y_pred_lin = lin_clf.predict(X_test)
y_pred_forest = forest_clf.predict(X_test)
print(accuracy_score(y_test, y_pred_knn))
print(accuracy_score(y_test, y_pred_lin))
print(accuracy_score(y_test, y_pred_forest))

0.9652
0.852
0.9298


In [40]:
from sklearn.model_selection import cross_val_score
cross_val_score(knn_clf, X_val, y_val, cv=3, scoring="accuracy")

array([ 0.97182254,  0.98020396,  0.97957958])

In [41]:
cross_val_score(forest_clf, X_val, y_val, cv=3, scoring="accuracy")

array([ 0.96223022,  0.96760648,  0.95975976])

In [42]:
cross_val_score(lin_clf, X_val, y_val, cv=3, scoring="accuracy")

array([ 0.93645084,  0.95140972,  0.94714715])

In [43]:
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(
    estimators=[('lin', lin_clf), ('forest', forest_clf), ('knn', knn_clf)],
    voting='hard')
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lin', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0)), ('forest', RandomForestClassifier(bootstrap=True, class_weight=No...ski',
           metric_params=None, n_jobs=-1, n_neighbors=3, p=2,
           weights='distance'))],
         n_jobs=1, voting='hard', weights=None)

In [70]:
y_pred_voting = voting_clf.predict(X_test)
accuracy_score(y_test, y_pred_voting)

0.9476

In [45]:
cross_val_score(voting_clf, X_val, y_val, cv=3, scoring="accuracy")

array([ 0.97961631,  0.9760048 ,  0.97657658])

Run the individual classifiers from the previous exercise to make predictions on the validation set, and create a new training set with the resulting predictions: each training instance is a vector containing the set of predictions from all your classifiers for an image, and the target is the image’s class. Congratulations, you have just trained a blender, and together with the classifiers they form a stacking ensemble! Now let’s evaluate the ensemble on the test set. For each image in the test set, make predictions with all your classifiers, then feed the predictions to the blender to get the ensemble’s predictions. How does it compare to the voting classifier you trained earlier?

In [48]:
y_pred_val_knn = knn_clf.predict(X_val)
y_pred_val_lin = lin_clf.predict(X_val)
y_pred_val_forest = forest_clf.predict(X_val)

In [63]:
X_val.shape

(5000, 784)

In [53]:
X_new = np.zeros((5000, 3))

In [58]:
tmp_new = X_new.tolist()
tmp_knn = y_pred_val_knn.tolist()
tmp_lin = y_pred_val_lin.tolist()
tmp_forest = y_pred_val_forest.tolist()

In [68]:
for i in range(len(tmp_new)):
    tmp_new[i][0] = tmp_knn[i]
    tmp_new[i][1] = tmp_lin[i]
    tmp_new[i][2] = tmp_forest[i]
    
#tmp_new

In [66]:
X_train_new = np.array(tmp_new)

In [69]:
knn_blender = KNeighborsClassifier(n_jobs=-1, weights='distance', n_neighbors=3)
knn_blender.fit(X_train_new, y_val)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=3, p=2,
           weights='distance')

In [None]:
blender_preds = []

for image in X_test:
    pred_knn = knn_clf.predict(image)
    pred_lin = lin_clf.predict(image)
    pred_forest = forest_clf.predict(image)
    tmp_preds = np.array([pred_knn.item(0), pred_lin.item(0), pred_forest.item(0)])
    
    pred_blender = knn_blender.predict(tmp_preds.reshape(-1,1))
    blender_preds.append(pred_blender)
    
print('Finished')

























In [105]:
y_pred_blender = np.array(blender_preds)
accuracy_score(y_test, y_pred_blender)

AttributeError: 'float' object has no attribute 'item'

In [104]:
y_pred_blender

array([ 3.,  4.,  4., ...,  4.,  4.,  4.])

In [100]:
y_test

array([ 4.,  4.,  4., ...,  9.,  9.,  9.])