Load the MNIST data (introduced in Chapter 3), and split it into a training set, a validation set, and a test set (e.g., use 50,000 instances for training, 10,000 for val‐ idation, and 10,000 for testing). Then train various classifiers, such as a Random Forest classifier, an Extra-Trees classifier, and an SVM. Next, try to combine them into an ensemble that outperforms them all on the validation set, using a soft or hard voting classifier. Once you have found one, try it on the test set. How much better does it perform compared to the individual classifiers?

In [2]:
from sklearn.datasets import fetch_openml

# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)

In [41]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1/7, random_state=42)
#X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=1/6, random_state=42)

In [12]:
len(X_train), len(X_validation), len(X_test)

(50000, 10000, 10000)

In [43]:
from sklearn.metrics import f1_score
def predict(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train)
    y_predict = clf.predict(X_test)
    print(f1_score(y_test, y_predict, average='weighted'))
    return clf

In [42]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
predict(rfc, X_train, y_train, X_test, y_test)



0.9459936779061294


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Bagging

The BaggingClassifier automatically performs soft voting instead of hard voting if the base classifier can estimate class proba‐ bilities (i.e., if it has a predict_proba() method), which is the case with Decision Trees classifiers.

In [17]:
from sklearn.ensemble import BaggingClassifier 
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
        DecisionTreeClassifier(), n_estimators=500,
        max_samples=100, bootstrap=True, n_jobs=-1)
predict(bag_clf, X_train, y_train, X_test, y_test)

0.8427689844531249

## Adaboost

In [18]:
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(
         DecisionTreeClassifier(max_depth=1), n_estimators=200,
         algorithm="SAMME.R", learning_rate=0.5)
ada_clf.fit(X_train, y_train)
y_predict = ada_clf.predict(X_test)
f1_score(y_test, y_predict, average='weighted')

0.7837447093404468

In [36]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(loss='log')
predict(sgd, X_train, y_train, X_test, y_test)



0.8784474569801785


SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [40]:
dtree = DecisionTreeClassifier()
predict(dtree, X_train, y_train, X_test, y_test)

0.864187038584018


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [39]:
from sklearn.svm import LinearSVC

svc = LinearSVC()
predict(svc, X_train, y_train, X_test, y_test)

0.849671001944649




LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [44]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=20)

weak_clf = []
for train_index, test_index in kf.split(X_train):
    sgd = SGDClassifier(loss='log')
    predict(sgd, X_train[train_index], y_train[train_index], X_train[test_index], y_train[test_index])
    weak_clf.append(sgd)



0.8580884795623198




0.8787283878855383




0.8681340803250192




0.8604479367033855




0.8601427325108849




0.8635918464558112




0.8844429247910122




0.8775951808454374




0.8718222192430303




0.86109077860234




0.8680034386821255




0.8439826786686971




0.8737222821139586




0.8632642916057256




0.861690453336784




0.8465368911774732




0.8714934940530147




0.8613434008973295




0.8590489164682068




0.8688419254384881


In [66]:
def predict_proba(weak_clf, X_test):
    for clf in weak_clf:
        yield np.matrix(clf.predict_proba(X_test))
        
import functools
import numpy as np
al = list(predict_proba(weak_clf, X_test))
#functools.reduce(np.sum, )


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
  prob /= prob.sum(axis=1).reshape((prob.shape[0

In [70]:
weak_clf[1].predict_proba(X_test)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])