# MNIST Random Forest, Extra-Trees, SVM and MLP Classifier Ensemble

## Create Train, Val, Test sets

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml

# Fetch the 'mnist_784' dataset (version 1) from OpenML.
# as_frame=False pins the return type to NumPy arrays; the library default
# became 'auto' in scikit-learn 0.24, which yields pandas objects for this
# dataset, so leaving it implicit makes the data type depend on the version.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

# First hold out 10,000 samples as the final test set ...
X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist.data, mnist.target, test_size=10000, random_state=42
)
# ... then hold out 10,000 of the remaining samples as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000, random_state=42
)

## Random Forest, Extra-Trees, SVM and MLP Classifiers

In [3]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

# The four base models that are trained individually and later combined in
# the voting ensemble. Every model is seeded with random_state=42.
mlp_clf = MLPClassifier(random_state=42)
svm_clf = LinearSVC(random_state=42)
extra_trees_clf = ExtraTreesClassifier(
    random_state=42,
    n_estimators=100,
)
random_forest_clf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

In [4]:
# Order matters: later cells report scores positionally in this same order.
estimators = [
    random_forest_clf,
    extra_trees_clf,
    svm_clf,
    mlp_clf,
]

# Fit each base model on the training split, announcing it first.
for estimator in estimators:
    print("Training ", estimator)
    estimator.fit(X_train, y_train)

Training  RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
Training  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_



Training  MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=42, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)


In [16]:
# Validation-set score of each base model, in the order of `estimators`.
[clf.score(X_val, y_val) for clf in estimators]

[0.9692, 0.9715, 0.8397, 0.9639]

## Create a VotingClassifier Ensemble from the above estimators

In [7]:
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs; the names are the keys later used with
# set_params to address a single member of the ensemble.
named_estimators = [
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]

# Build the ensemble with its default voting mode and fit it on the training split.
voting_clf = VotingClassifier(estimators=named_estimators)
voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('random_forest_clf',
                              RandomForestClassifier(bootstrap=True,
                                                     ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     max_samples=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_lea

In [8]:
# Score of the full four-model ensemble on the validation split.
voting_clf.score(X=X_val, y=y_val)

0.9706

### The ensemble (0.9706) does not beat the best individual model (Extra-Trees, 0.9715), and the SVM model's accuracy is significantly lower than the rest, so let's remove it and try again

In [9]:
# Exclude the SVM from any future refit of the ensemble. "drop" is the
# supported sentinel; using None for this was deprecated in scikit-learn 0.22
# and is no longer accepted by later releases.
voting_clf.set_params(svm_clf="drop")

# Also discard the already-fitted SVM so the ensemble can be re-scored without
# retraining. Filtering by identity (rather than `del ...estimators_[2]`)
# keeps this cell idempotent: running it a second time is a no-op instead of
# silently deleting whichever estimator now sits at index 2.
fitted_svm = voting_clf.named_estimators_["svm_clf"]
voting_clf.estimators_ = [
    fitted for fitted in voting_clf.estimators_ if fitted is not fitted_svm
]

# Re-evaluate without the SVM
voting_clf.score(X_val, y_val)

0.9736

In [18]:
# Display the ensemble's current `estimators_` list to confirm which fitted
# members remain after the SVM entry was removed.
voting_clf.estimators_

[RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=None, max_features='auto',
                        max_leaf_nodes=None, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=42, verbose=0,
                        warm_start=False),
 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                      criterion='gini', max_depth=None, max_features='auto',
                      max_leaf_nodes=None, max_samples=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs

### Let's now try soft voting rather than hard voting (the default). As we do not need to retrain to check this, it is always worth trying this step


In [11]:
# Switch the already-fitted ensemble to soft voting (no refit happens here)
# and score it on the validation split.
voting_clf.set_params(voting="soft")
voting_clf.score(X_val, y_val)

0.97

## Test set accuracy 

In [19]:
# Go back to hard voting, which scored higher on the validation split
# (0.9736 vs 0.97), and report the ensemble's score on the held-out test set.
voting_clf.set_params(voting="hard")
voting_clf.score(X_test, y_test)

0.9704

### Compared to each individual estimator's performance:


In [21]:
# Test-set score of each base model (SVM included), in the order of `estimators`.
[clf.score(X_test, y_test) for clf in estimators]

[0.9645, 0.9691, 0.8449, 0.9604]