In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "ensembles"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into the IMAGES_PATH directory.

    The output file is named ``<fig_id>.<fig_extension>``.

    Parameters:
        fig_id: base name of the output file (without extension).
        tight_layout: when true, call ``plt.tight_layout()`` before saving.
        fig_extension: file extension, also passed to ``plt.savefig`` as ``format``.
        resolution: passed to ``plt.savefig`` as ``dpi``.
    """
    filename = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, filename)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

Fetching MNIST dataset

In [2]:
from sklearn.datasets import fetch_openml
# Load the OpenML dataset named "mnist_784", pinned to version 1.
mnist = fetch_openml(name='mnist_784', version=1)

In [3]:
# Feature matrix and label vector taken from the fetched dataset.
# The target of this dataset is one-dimensional, so the transpose that used to
# be applied here was a no-op and has been dropped.
X = mnist['data']
y = mnist['target']

In [4]:
from sklearn.model_selection import train_test_split


# Two-stage stratified split with a fixed seed:
#   1. keep 60,000 samples for train+dev, the remainder becomes the test set;
#   2. keep 50,000 of those for training, the remainder becomes the dev set.
X_train_dev, X_test, y_train_dev, y_test = train_test_split(
    X, y,
    train_size=60000,
    stratify=y,
    random_state=13,
)
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train_dev, y_train_dev,
    train_size=50000,
    stratify=y_train_dev,
    random_state=13,
)

In [41]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Base estimators used by the voting ensemble; verbose=3 turns on their
# training-progress output.
svc = SVC(kernel='poly', C=0.01, gamma='auto', verbose=3)
rfc = RandomForestClassifier(n_estimators=100, max_depth=500, verbose=3)
etc = ExtraTreesClassifier(n_estimators=100, max_depth=50, verbose=3)

In [None]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble over the three base classifiers, fitted on the training split.
hard_vote = VotingClassifier(
    estimators=[
        ('svc', svc),
        ('rfc', rfc),
        ('etc', etc),
    ],
    voting='hard',
)
hard_vote.fit(X_train, y_train)

[LibSVM]

# Single Classifier performance

In [None]:
# New, non-verbose instances of the three classifiers (same hyperparameters as
# before) for evaluating each model on its own.
svc = SVC(kernel='poly', C=0.01, gamma='auto')
rfc = RandomForestClassifier(n_estimators=100, max_depth=500)
etc = ExtraTreesClassifier(n_estimators=100, max_depth=50)

In [25]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Candidate grid for the SVC. GridSearchCV requires each grid value to be a
# list (or array) of candidates, as rf_params / et_params already are, so the
# single settings are wrapped in one-element lists.
sv_params = {
    'C': [0.01],
    'kernel': ['poly'],
    'gamma': ['auto'],
}

# The grid search stays disabled: the grid holds a single candidate, so the
# plain cross-validation below already yields its per-fold scores.
# Scoring is 'accuracy' because this is a classification task
# ('explained_variance' is a regression metric).
#sv_search = GridSearchCV(svc, sv_params, scoring='accuracy', verbose=5, n_jobs=-1)
#sv_search.fit(X_train, y_train)
sv_scores = cross_val_score(svc, X_train, y_train, scoring='accuracy', verbose=5, n_jobs=3)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 out of   5 | elapsed:  6.1min remaining:  9.1min
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed: 11.3min finished


In [10]:
# Display the per-fold accuracy scores that cross_val_score returned for the SVC.
sv_scores

array([0.94311025, 0.94261558, 0.94398849, 0.94472696, 0.94901882])

In [26]:
# Random-forest grid: 2 values of n_estimators x 3 values of max_depth = 6 candidates.
rf_params = dict(
    n_estimators=[80, 100],
    max_depth=[50, 100, 500],
)

# Cross-validated grid search scored on accuracy, using all available cores.
rf_search = GridSearchCV(
    rfc,
    rf_params,
    scoring='accuracy',
    verbose=5,
    n_jobs=-1,
)
rf_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  5.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  5.6min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              ra

## Results:


In [33]:
# Display the full cross-validation results recorded by the random-forest grid search.
rf_search.cv_results_

{'mean_fit_time': array([37.03110142, 46.07674499, 36.79545283, 45.98899193, 36.79847755,
        44.70077114]),
 'mean_score_time': array([0.68771267, 0.79062433, 0.64062462, 0.75312471, 0.62500072,
        0.69086728]),
 'mean_test_score': array([0.96472, 0.96586, 0.96538, 0.9658 , 0.96478, 0.9665 ]),
 'param_max_depth': masked_array(data=[50, 50, 100, 100, 500, 500],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[80, 100, 80, 100, 80, 100],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 50, 'n_estimators': 80},
  {'max_depth': 50, 'n_estimators': 100},
  {'max_depth': 100, 'n_estimators': 80},
  {'max_depth': 100, 'n_estimators': 100},
  {'max_depth': 500, 'n_estimators': 80},
  {'max_depth': 500, 'n_estimators': 100}],
 'rank_test_score': array([6, 2, 4, 3, 5, 1]),
 'split0_test_sc

### Random Forest results

```
{'mean_fit_time': array([37.03110142, 46.07674499, 36.79545283, 45.98899193, 36.79847755,
        44.70077114]),
 'mean_score_time': array([0.68771267, 0.79062433, 0.64062462, 0.75312471, 0.62500072,
        0.69086728]),
 'mean_test_score': array([0.96472, 0.96586, 0.96538, 0.9658 , 0.96478, 0.9665 ]),
 'param_max_depth': masked_array(data=[50, 50, 100, 100, 500, 500],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[80, 100, 80, 100, 80, 100],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 50, 'n_estimators': 80},
  {'max_depth': 50, 'n_estimators': 100},
  {'max_depth': 100, 'n_estimators': 80},
  {'max_depth': 100, 'n_estimators': 100},
  {'max_depth': 500, 'n_estimators': 80},
  {'max_depth': 500, 'n_estimators': 100}],
 'rank_test_score': array([6, 2, 4, 3, 5, 1]),
 'split0_test_score': array([0.9642, 0.9653, 0.9656, 0.9658, 0.9638, 0.9655]),
 'split1_test_score': array([0.9649, 0.9663, 0.9644, 0.9655, 0.9648, 0.968 ]),
 'split2_test_score': array([0.9657, 0.9661, 0.9676, 0.9667, 0.9669, 0.9667]),
 'split3_test_score': array([0.9625, 0.9641, 0.9629, 0.9637, 0.9629, 0.9648]),
 'split4_test_score': array([0.9663, 0.9675, 0.9664, 0.9673, 0.9655, 0.9675]),
 'std_fit_time': array([0.09219733, 0.08513294, 0.20909355, 0.22844071, 0.1843486 ,
        1.35948552]),
 'std_score_time': array([0.05071869, 0.05642338, 0.03562992, 0.0302981 , 0.03423336,
        0.04000423]),
 'std_test_score': array([0.00131818, 0.00112712, 0.00162037, 0.00122963, 0.00137899,
        0.00119833])}
```

In [27]:
# Extra-trees grid: 2 values of n_estimators x 3 values of max_depth = 6 candidates.
et_params = dict(
    n_estimators=[80, 100],
    max_depth=[50, 100, 500],
)

# Cross-validated grid search scored on accuracy, using all available cores.
et_search = GridSearchCV(
    etc,
    et_params,
    scoring='accuracy',
    verbose=5,
    n_jobs=-1,
)
et_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  7.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  7.5min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                            class_weight=None, criterion='gini',
                                            max_depth=None, max_features='auto',
                                            max_leaf_nodes=None,
                                            max_samples=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=100, n_jobs=None,
                                            oob_score=False, random_state=None,
                                            verbose=0, warm_start=False),
             iid='deprec

In [32]:
# Display the full cross-validation results recorded by the extra-trees grid search.
et_search.cv_results_

{'mean_fit_time': array([49.60879602, 61.71365976, 49.88942537, 62.16636343, 50.71077738,
        62.59396634]),
 'mean_score_time': array([0.74448681, 0.8501236 , 0.73153663, 0.85000038, 0.70165071,
        0.77920132]),
 'mean_test_score': array([0.96824, 0.96996, 0.96784, 0.9695 , 0.96838, 0.969  ]),
 'param_max_depth': masked_array(data=[50, 50, 100, 100, 500, 500],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[80, 100, 80, 100, 80, 100],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 50, 'n_estimators': 80},
  {'max_depth': 50, 'n_estimators': 100},
  {'max_depth': 100, 'n_estimators': 80},
  {'max_depth': 100, 'n_estimators': 100},
  {'max_depth': 500, 'n_estimators': 80},
  {'max_depth': 500, 'n_estimators': 100}],
 'rank_test_score': array([5, 1, 6, 2, 4, 3]),
 'split0_test_sc

### Extra trees results

```
{'mean_fit_time': array([49.60879602, 61.71365976, 49.88942537, 62.16636343, 50.71077738,
        62.59396634]),
 'mean_score_time': array([0.74448681, 0.8501236 , 0.73153663, 0.85000038, 0.70165071,
        0.77920132]),
 'mean_test_score': array([0.96824, 0.96996, 0.96784, 0.9695 , 0.96838, 0.969  ]),
 'param_max_depth': masked_array(data=[50, 50, 100, 100, 500, 500],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_n_estimators': masked_array(data=[80, 100, 80, 100, 80, 100],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 50, 'n_estimators': 80},
  {'max_depth': 50, 'n_estimators': 100},
  {'max_depth': 100, 'n_estimators': 80},
  {'max_depth': 100, 'n_estimators': 100},
  {'max_depth': 500, 'n_estimators': 80},
  {'max_depth': 500, 'n_estimators': 100}],
 'rank_test_score': array([5, 1, 6, 2, 4, 3]),
 'split0_test_score': array([0.9678, 0.969 , 0.9675, 0.9682, 0.9681, 0.9676]),
 'split1_test_score': array([0.9688, 0.9699, 0.9684, 0.9692, 0.9674, 0.9678]),
 'split2_test_score': array([0.9689, 0.9696, 0.9683, 0.9705, 0.9696, 0.9697]),
 'split3_test_score': array([0.9671, 0.9701, 0.9671, 0.9686, 0.9666, 0.9691]),
 'split4_test_score': array([0.9686, 0.9712, 0.9679, 0.971 , 0.9702, 0.9708]),
 'std_fit_time': array([0.16218854, 0.31503446, 0.44230415, 0.3279147 , 1.33778482,
        3.50093184]),
 'std_score_time': array([0.0500954 , 0.03626401, 0.05759633, 0.05191592, 0.04761933,
        0.04978237]),
 'std_test_score': array([0.00068877, 0.00072277, 0.00048826, 0.00108074, 0.00134224,
        0.00119499])}
```