In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
# Seed NumPy's global RNG so later np.random calls are repeatable on a fresh kernel.
np.random.seed(42)
# as_frame=False pins the return type to NumPy arrays. Newer scikit-learn
# releases default to as_frame='auto', which yields pandas objects for this
# dataset and would break positional indexing such as X_train[index_array].
mnist = fetch_openml('mnist_784', version=1, cache=True, as_frame=False)
# Store the labels as int8 so they compare as numbers downstream.
mnist.target = mnist.target.astype(np.int8)
from sklearn.metrics import accuracy_score

In [2]:
# Pull the feature matrix and the label vector out of the fetched dataset.
X = mnist['data']
y = mnist['target']
# Show both shapes as the cell output.
(X.shape, y.shape)

((70000, 784), (70000,))

In [3]:
# Contiguous split by row position: the first 50,000 rows are used for
# training, rows 50,000-59,999 for validation, and everything from row
# 60,000 onward for testing.
X_train, y_train = X[:50000], y[:50000]
X_val, y_val = X[50000:60000], y[50000:60000]
X_test, y_test = X[60000:], y[60000:]
# Show the shape of every split as the cell output.
(
    X_train.shape, X_val.shape, X_test.shape,
    y_train.shape, y_val.shape, y_test.shape,
)

((50000, 784), (10000, 784), (10000, 784), (50000,), (10000,), (10000,))

In [4]:
# Shuffle the training rows and their labels with one shared permutation so
# each sample stays paired with its label. The permutation length is taken
# from X_train itself, so it cannot drift out of sync with the split size.
shuffle_index = np.random.permutation(len(X_train))
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

In [5]:
# KNN Model
from sklearn.neighbors import KNeighborsClassifier

# Configure the classifier first, then fit it on the training split; the
# fitted estimator returned by fit() is the cell output.
knn_clf = KNeighborsClassifier(
    n_neighbors=4,
    weights='distance',
    n_jobs=-1,
)
knn_clf.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=4, p=2,
                     weights='distance')

In [10]:
# Score the KNN model on the validation split.
# NOTE(review): despite the `y_train_` prefix, these are predictions for X_val.
y_train_knn_pred = knn_clf.predict(X_val)
accuracy_score(y_true=y_val, y_pred=y_train_knn_pred)

0.9741

In [8]:
# SVC Model
from sklearn.svm import SVC

# Support-vector classifier with library-default hyper-parameters, fitted on
# the training split; the fitted estimator is the cell output.
svc_clf = SVC()
svc_clf.fit(X=X_train, y=y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [9]:
# Score the SVC model on the validation split.
# NOTE(review): despite the `y_train_` prefix, these are predictions for X_val.
y_train_svc_pred = svc_clf.predict(X_val)
accuracy_score(y_true=y_val, y_pred=y_train_svc_pred)

0.9802

In [12]:
# SGD Model
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent; random_state is
# fixed so repeated runs use the same seed.
# NOTE(review): the features are passed in without any scaling step in the
# cells above; SGD-based models are typically sensitive to feature scale, so
# standardizing the inputs may be worth trying -- confirm before changing.
sgd_clf = SGDClassifier(random_state=0)
sgd_clf.fit(X=X_train, y=y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=0, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [13]:
# Score the SGD model on the validation split.
# NOTE(review): despite the `y_train_` prefix, these are predictions for X_val.
y_train_sgd_pred = sgd_clf.predict(X_val)
accuracy_score(y_true=y_val, y_pred=y_train_sgd_pred)

0.8685

In [15]:
# Random Forest Model
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter grid explored by the search below.
# 'auto' is intentionally not listed under max_features: for
# RandomForestClassifier it was an alias of 'sqrt', so it only duplicated
# fits, and recent scikit-learn releases no longer accept it.
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [3, 5, 7, 9],
    'criterion': ['gini', 'entropy'],
}

rfc = RandomForestClassifier(random_state=0)
# 4 * 2 * 4 * 2 = 64 candidates, each cross-validated on 5 folds (320 fits
# plus the final refit). n_jobs=-1 runs the fits in parallel, matching the
# n_jobs=-1 already used for the KNN model, instead of one at a time.
rfc_clf = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, n_jobs=-1)
rfc_clf.fit(X_train, y_train)

KeyboardInterrupt: 