In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
np.random.seed(42)
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.target = mnist.target.astype(np.int8) 
from sklearn.metrics import accuracy_score

In [2]:
X, y = mnist['data'], mnist['target']
X.shape, y.shape

((70000, 784), (70000,))

In [3]:
X_train, X_val, X_test, y_train, y_val, y_test = X[:50000], X[50000:60000], X[60000:], y[:50000], y[50000:60000], y[60000:]
X_train.shape, X_val.shape, X_test.shape, y_train.shape, y_val.shape, y_test.shape

((50000, 784), (10000, 784), (10000, 784), (50000,), (10000,), (10000,))

In [4]:
shuffle_index = np.random.permutation(50000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

In [5]:
# KNN Model
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_jobs=12, weights='distance', n_neighbors=4)
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(n_jobs=12, n_neighbors=4, weights='distance')

In [6]:
y_train_knn_pred = knn_clf.predict(X_val)
accuracy_score(y_val, y_train_knn_pred)

0.9741

In [None]:
# SVC Model
from sklearn.svm import SVC
svc_clf = SVC(probability=True, cache_size=2000)
svc_clf.fit(X_train, y_train)

In [None]:
y_train_svc_pred = svc_clf.predict(X_val)
accuracy_score(y_val, y_train_svc_pred)

In [7]:
# SGD Model
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state = 0, n_jobs=12)
sgd_clf.fit(X_train, y_train)

SGDClassifier(n_jobs=12, random_state=0)

In [8]:
y_train_sgd_pred = sgd_clf.predict(X_val)
accuracy_score(y_val, y_train_sgd_pred)

0.8685

In [10]:
# Random Forest Model#####
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
param_grid  ={
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [3,5,7,9]
}

rfc = RandomForestClassifier(random_state=0)
rfc_g = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5, verbose=2, n_jobs=12)
rfc_g.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  17 tasks      | elapsed:   31.4s
[Parallel(n_jobs=12)]: Done  80 out of  80 | elapsed:  5.0min finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0), n_jobs=12,
             param_grid={'max_depth': [3, 5, 7, 9],
                         'n_estimators': [50, 100, 200, 500]},
             verbose=2)

In [11]:
print (rfc_g.best_score_)
print (rfc_g.best_params_)
print (rfc_g.best_estimator_)

0.93668
{'max_depth': 9, 'n_estimators': 500}
RandomForestClassifier(max_depth=9, n_estimators=500, random_state=0)


In [12]:
from sklearn.ensemble import RandomForestClassifier
rfc_clf = RandomForestClassifier(max_depth=9, n_estimators=500, n_jobs=12)
rfc_clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=9, n_estimators=500, n_jobs=12)

In [13]:
y_train_rfc_pred = rfc_clf.predict(X_val)
accuracy_score(y_val, y_train_rfc_pred)

0.9461

In [16]:
#Ensemble
from sklearn.ensemble import VotingClassifier

voting_clf = VotingClassifier(estimators=[('lin', lin_clf), ('k', knn_clf), ('rf', rfc_clf)], voting='soft', n_jobs=12)
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lin',
                              LogisticRegression(max_iter=50000, n_jobs=12)),
                             ('k',
                              KNeighborsClassifier(n_jobs=12, n_neighbors=4,
                                                   weights='distance')),
                             ('rf',
                              RandomForestClassifier(max_depth=9,
                                                     n_estimators=500,
                                                     n_jobs=12))],
                 n_jobs=12, voting='soft')

In [17]:
y_svote_rfc_pred = voting_clf.predict(X_val)
accuracy_score(y_val, y_svote_rfc_pred)

0.9676

In [14]:
#logistic Regression
from sklearn.linear_model import LogisticRegression
lin_clf = LogisticRegression(max_iter=50000, n_jobs=12)
lin_clf.fit(X_train, y_train)

LogisticRegression(max_iter=50000, n_jobs=12)

In [15]:
y_train_lin_pred = lin_clf.predict(X_val)
accuracy_score(y_val, y_train_lin_pred)

0.9215

In [24]:
y_train_knn_pred.shape

(10000,)

In [25]:
y_train_sgd_pred.shape

(10000,)

In [32]:
y_train_rfc_pred

array([3, 8, 6, ..., 5, 6, 8], dtype=int8)

In [37]:
# Blender
# blend = pd.DataFrame({'a': y_train_knn_pred, 'b': y_train_sgd_pred,'c':y_train_rfc_pred})

# rfc_blen_clf.fit(blend, y_val)

estimators = [rfc_clf, sgd_clf, knn_clf]
X_val_predictions = np.empty((len(X_val), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_val_predictions[:, index] = estimator.predict(X_val)

In [38]:
X_val_predictions

array([[3., 3., 3.],
       [8., 8., 8.],
       [6., 5., 6.],
       ...,
       [5., 5., 5.],
       [6., 6., 6.],
       [8., 8., 8.]], dtype=float32)

In [40]:
rfc_blen_clf = RandomForestClassifier(max_depth=9, n_estimators=500, n_jobs=12, oob_score=True)
rfc_blen_clf.fit(X_val_predictions, y_val)

RandomForestClassifier(max_depth=9, n_estimators=500, n_jobs=12, oob_score=True)

In [41]:
rfc_blen_clf.oob_score_

0.972

In [42]:
X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    X_test_predictions[:, index] = estimator.predict(X_test)

In [43]:
y_pred = rfc_blen_clf.predict(X_test_predictions)

In [44]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9671