In [1]:
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.base import clone
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

import utils

In [2]:
# Seed handed to randomized components (e.g. the SGD classifier's random_state)
# so that re-running the notebook reproduces the same results.
RANDOM_SEED = 42

In [3]:
# Load the train and test splits; the helper returns two (features, labels) pairs.
# NOTE(review): presumably MNIST with binary "is this digit a 5" labels, judging
# by the helper's name — confirm in utils.load_mnist_5.
(x_train, y_train), (x_test, y_test) = utils.load_mnist_5()
# Print the array shapes as a quick sanity check before any training.
print(f"x_train.shape = {x_train.shape}, y_train.shape = {y_train.shape}")
print(f"x_test.shape = {x_test.shape}, y_test.shape = {y_test.shape}")

x_train.shape = (60000, 784), y_train.shape = (60000,)
x_test.shape = (10000, 784), y_test.shape = (10000,)


In [4]:
# Linear classifier trained with stochastic gradient descent; seeded with
# RANDOM_SEED so the fit is reproducible.
sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=RANDOM_SEED)
# Fit on the full training set. This is the cell's last expression, so the
# notebook displays the fitted estimator's repr.
sgd_clf.fit(x_train, y_train)

SGDClassifier(random_state=42)

In [5]:
# Accuracy of sgd_clf over 3 cross-validation folds of the training set;
# the resulting array of per-fold scores is shown as the cell output.
cross_val_score(sgd_clf, x_train, y_train, cv=3, scoring='accuracy')

array([0.95035, 0.96035, 0.9604 ])

In [6]:
# Hand-rolled stratified 3-fold accuracy: the same idea as the cross_val_score
# call, but with explicit (and shuffled) folds so each step is visible.
# The shuffle is seeded with the notebook-wide RANDOM_SEED instead of a repeated
# literal, so changing the seed in one place affects every randomized step.
skfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)

for train_index, test_index in skfolds.split(x_train, y_train):
    # Fresh, unfitted copy with the same hyperparameters, so no fitted state
    # carries over from one fold to the next.
    clone_clf = clone(sgd_clf)
    x_train_fold, y_train_fold = x_train[train_index], y_train[train_index]
    x_test_fold, y_test_fold = x_train[test_index], y_train[test_index]

    clone_clf.fit(x_train_fold, y_train_fold)
    y_pred = clone_clf.predict(x_test_fold)
    # Vectorized count of correct predictions; the builtin sum() would iterate
    # over the boolean array one element at a time in Python.
    num_correct = np.sum(y_pred == y_test_fold)
    # Fraction of the held-out fold that was classified correctly.
    print(num_correct / len(y_pred))

0.9669
0.91625
0.96785


In [7]:
class NeverTrueClassifier(BaseEstimator):
    """Baseline classifier that predicts the negative class for every sample.

    It learns nothing from the data. Scoring it shows how much accuracy a
    model gets "for free" simply by always answering ``False``.
    """

    def fit(self, X, y=None):
        """Do nothing; there are no parameters to learn.

        Args:
            X: Training samples (ignored).
            y: Training labels (ignored).

        Returns:
            ``self``, as the scikit-learn estimator API requires, so that
            calls such as ``clf.fit(X).predict(X)`` can be chained.
        """
        return self

    def predict(self, X):
        """Predict ``False`` for every row of ``X``.

        Args:
            X: Samples to classify; only its length is used.

        Returns:
            Boolean numpy array of shape ``(len(X),)``, all ``False`` — one
            label per sample, in the 1-D layout scikit-learn expects from
            ``predict``.
        """
        return np.zeros(len(X), dtype=bool)

In [8]:
# Baseline: score the always-False classifier with the same 3-fold accuracy
# setup used for the SGD model, to put that model's accuracy in context.
never_true_clf = NeverTrueClassifier()
# NOTE(review): the high accuracy shown below presumably reflects class
# imbalance (few positive labels) — confirm against the distribution of y_train.
cross_val_score(never_true_clf, x_train, y_train, cv=3, scoring='accuracy')

array([0.91125, 0.90855, 0.90915])