Load the MNIST data,and split it in to a training set, a validation set, and a test set (e.g., use 50,000 instances for training, 10,000 for validation, and 10,000 for testing). Then train various classifiers, such as a Random Forest classifier, an Extra-Trees classifier, and an SVM classifier. Next, try to combine them into an ensemble that outperforms each individual classifier on the validation set, using soft or hard voting. Once you have found one, try it on the test set. How much better does it perform compared to the individual classifiers?

In [1]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'details', 'categories', 'url'])

In [2]:
X, y = mnist["data"], mnist["target"]

X.shape

(70000, 784)

In [3]:
X_train,X_val,X_test,y_train,y_val,y_test = X[:50000],X[50000:60000],X[60000:],y[:50000],y[50000:60000],y[60000:]

### RandomForest Classifier

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score 

randomf_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42) 
randomf_clf.fit(X_train, y_train)

y_randomf_pred = randomf_clf.predict(X_val)
accuracy_score(y_val, y_randomf_pred)

0.8374

### Extra-Trees classifier

In [5]:
from sklearn.ensemble import ExtraTreesClassifier

extra_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42) 
extra_clf.fit(X_train, y_train)

y_extra_clf = extra_clf.predict(X_val)
accuracy_score(y_val, y_extra_clf)

0.8374

In [9]:
extra_clf.score(X_val, y_val)

0.8374

### SVM Classifier

In [6]:
from sklearn.svm import LinearSVC

In [7]:
svm_clf = LinearSVC(random_state=42)
svm_clf.fit(X_train, y_train)

y_svm_clf = svm_clf.predict(X_val)
accuracy_score(y_val, y_svm_clf)



0.8749

In [10]:
svm_clf.score(X_val, y_val)

0.8749

### Voting Classifier

In [23]:
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(
    estimators=[('extra', extra_clf), 
                ('randomf', randomf_clf), 
                ('svc', svm_clf)], voting='hard')

voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('extra',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=None,
                                                     max_features='auto',
                                                     max_leaf_nodes=16,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=500,
                                                     n_jobs=-1, oob_score=F

In [24]:
voting_clf.score(X_val, y_val)

0.8374

In [25]:
voting_clf.estimators_

[RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=None, max_features='auto', max_leaf_nodes=16,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=500,
                        n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                        warm_start=False),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                        max_depth=None, max_features='auto', max_leaf_nodes=16,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=500,
                        n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                        warm_start=False),
 LinearSVC(C=1