In [1]:
import numpy as np
import pandas as pd

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Enlarge the axis-label and tick-label fonts for every figure drawn below.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

In [2]:
# Download the `mnist_784` dataset from OpenML (cached locally after the first call).
from sklearn.datasets import fetch_openml

# as_frame=False pins `data` and `target` to NumPy arrays. Without it the
# container type depends on the installed scikit-learn version (newer releases
# may return pandas objects), which changes how the slicing below behaves.
mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)
mnist.target = mnist.target.astype(np.int8) # fetch_openml() returns targets as strings

# Show which fields the returned Bunch exposes.
mnist.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])

In [3]:
# Sanity check: (feature-matrix shape, label-vector shape).
tuple(mnist[field].shape for field in ('data', 'target'))

((70000, 784), (70000,))

In [12]:
# Positional split: first 50,000 rows for training, next 10,000 for
# validation, and everything from row 60,000 onward for testing.
X_train, y_train = mnist['data'][:50000], mnist['target'][:50000]
X_val, y_val = mnist['data'][50000:60000], mnist['target'][50000:60000]
X_test, y_test = mnist['data'][60000:], mnist['target'][60000:]

### Train a Random Forest, an Extra-Trees classifier, and an SVM model

##### Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Default hyper-parameters; n_jobs=-1 parallelises across all cores and the
# fixed seed makes the fitted forest repeatable.
forest = RandomForestClassifier(random_state=0, n_jobs=-1)

In [14]:
# Fit on the training split only; the validation split stays held out.
forest.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [17]:
# Random-forest class predictions for the validation split.
y_hat1 = forest.predict(X=X_val)

In [19]:
from sklearn.metrics import accuracy_score

# Validation accuracy of the random forest (keywords make the
# ground-truth / prediction roles explicit).
accuracy1 = accuracy_score(y_true=y_val, y_pred=y_hat1)
accuracy1

0.9733

##### Extra Trees

In [22]:
from sklearn.ensemble import ExtraTreesClassifier

# Same settings as the random forest: defaults, all cores, fixed seed.
ex_tree = ExtraTreesClassifier(random_state=0, n_jobs=-1)

In [23]:
# Fit the extra-trees ensemble on the training split.
ex_tree.fit(X=X_train, y=y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=0, verbose=0,
                     warm_start=False)

In [24]:
# Extra-trees predictions and accuracy on the validation split.
y_hat2 = ex_tree.predict(X=X_val)
accuracy2 = accuracy_score(y_true=y_val, y_pred=y_hat2)
accuracy2

0.9737

##### SVM

In [26]:
from sklearn.svm import SVC

# probability=True makes the SVM expose class probabilities (predict_proba),
# which a soft-voting ensemble needs from each of its members.
svc = SVC(random_state=0, probability=True)

In [27]:
# Fit the SVM on the training split.
svc.fit(X=X_train, y=y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [28]:
# SVM predictions and accuracy on the validation split.
y_hat3 = svc.predict(X=X_val)
accuracy3 = accuracy_score(y_true=y_val, y_pred=y_hat3)
accuracy3

0.9802

##### Voting

In [30]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the three base models defined above.
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[
        ('forest', forest),
        ('ex_tree', ex_tree),
        ('svc', svc),
    ],
)
voting_clf.fit(X=X_train, y=y_train)

VotingClassifier(estimators=[('forest',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
    

In [31]:
# Soft-voting ensemble predictions and accuracy on the validation split.
y_hat4 = voting_clf.predict(X=X_val)
accuracy4 = accuracy_score(y_true=y_val, y_pred=y_hat4)
accuracy4

0.9814

In [66]:
# see the score of a hard voting
# Switch the already-fitted ensemble to hard voting only for this
# measurement, then restore soft voting. Leaving it on 'hard' would make any
# re-run of the cells that predict with `voting_clf` silently report
# hard-voting results instead of the soft-voting ones.
voting_clf.voting = 'hard'
try:
    hard_voting_accuracy = voting_clf.score(X_val, y_val)
finally:
    voting_clf.voting = 'soft'
hard_voting_accuracy

0.9775

##### Blender (stacking)

In [52]:
# One column per base model (forest, extra-trees, SVM), one row per
# validation sample: these predicted labels are the blender's features.
y_hats = np.column_stack((y_hat1, y_hat2, y_hat3))

# oob_score=True keeps an out-of-bag accuracy estimate for the blender.
blender = RandomForestClassifier(random_state=42, n_estimators=200, oob_score=True)

In [53]:
# Train the blender to map the base models' validation predictions to the true labels.
blender.fit(X=y_hats, y=y_val)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [55]:
# Blender predictions on `y_hats`, i.e. the same rows it was fitted on, so
# these are in-sample predictions rather than a held-out estimate.
# NOTE(review): `y_pred` is not used by any cell visible here — confirm it is still needed.
y_pred = blender.predict(y_hats)

In [57]:
# Out-of-bag score of the blender (available because it was built with oob_score=True).
blender.oob_score_

0.9751

##### Use blender on test data

In [59]:
# Base models, listed in the same order as the columns the blender was trained on.
estimators = [forest, ex_tree, svc]

# Uninitialised buffer with one row per test sample and one column per base
# model; it is filled with the models' predictions afterwards.
X_test_predictions = np.empty(shape=(len(X_test), len(estimators)), dtype=np.float32)

In [60]:
# Expect (number of test samples, number of base models).
X_test_predictions.shape

(10000, 3)

In [64]:
# Fill the buffer in place: column i holds the test-set predictions of estimators[i].
X_test_predictions[:] = np.column_stack(
    [estimator.predict(X_test) for estimator in estimators]
)

In [65]:
# Final blended prediction on the test split and its accuracy.
y_test_pred = blender.predict(X=X_test_predictions)
accuracy_score(y_true=y_test, y_pred=y_test_pred)

0.9746