In [1]:
import time
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import check_random_state

from rerf.rerfClassifier import rerfClassifier

from tqdm import tqdm

In [22]:
# Load data from https://www.openml.org/d/554
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)

In [6]:
print(X.shape)

(70000, 784)


In [26]:
train_samples = 100

In [40]:
random_state = check_random_state(0)
permutation = random_state.permutation(X.shape[0])
X = X[permutation]
y = y[permutation].astype(int)
X = X.reshape((X.shape[0], -1))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=train_samples, test_size=10000)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
#val_idx + test_idx
ncores=1
num_runs=1
n_est=500

feature_range = [10,30,60,100,150]
patch_max = [3,5,8,11]

In [18]:
clf = rerfClassifier(
            projection_matrix="S-RerF", 
            max_features = 60,
            n_estimators=n_est,
            n_jobs=ncores,
            image_height=28, 
            image_width=28, 
            patch_height_max=3,
            patch_width_max=3,
            patch_height_min=1,
            patch_width_min=1,
        )

In [None]:
params {'max_features':feature_range, 'patch_height_max': patch_max, 'patch_width_max':patch_max}]
skf = StratifiedShuffleSplit(n_splits=3)
    
gs = GridSearchCV(clf, params, cv=skf, verbose=1)
gs.fit(X_train, y_train)
clf_best = gs.best_estimator_

In [19]:
ns = [100]#, 200, 500, 1000, 4000, 8000]

In [41]:
write_path = f'mf_mnist.csv'

with open(write_path, 'w') as f:
    f.write("classifier,n,Lhat,trainTime,testTime,iterate\n")

#runList = [(n, clf, iterate) for n in ns                             for clf in zip(best_clfs, [key for key in names])
#                            for iterate in range(1, num_runs + 1)]

iterate = 1

for n in tqdm(ns):
    trainStartTime = time.time()
    clf.fit(X_train, y_train)
    trainEndTime = time.time()
    trainTime = trainEndTime - trainStartTime

    out = clf.predict(X_train)
    lhat = np.mean(np.not_equal(out, y_train).astype(int))
    print(np.bincount(out - y_train))

    print(lhat)
    
    testStartTime = time.time()
    out = clf.predict(X_test)
    testEndTime = time.time()
    testTime = testEndTime - testStartTime

    lhat = np.mean(np.not_equal(out, y_test).astype(int))
    
    print(lhat)

    ####("variable,Lhat,trainTime,testTime,iterate")
    #with open(write_path, 'a') as f:
    #    f.write(f"{clf[1]}, {n}, {lhat:2.9f}, {trainTime:2.9f}, {testTime:2.9f}, {iterate}\n")

  0%|          | 0/1 [00:00<?, ?it/s]

[0 3 8 3 5 4 8 3 8 3 8 4 0 9 1 6 4 0 7 7 1 1 0 9 4 5 8 4 1 5 4 9 5 7 4 4 0
 7 0 2 3 0 2 8 8 0 5 7 8 3 3 5 6 0 4 8 9 5 4 6 2 9 1 3 8 1 2 0 2 5 6 0 8 6
 5 3 6 3 8 5 9 8 3 6 2 2 8 5 2 3 2 8 1 6 1 1 1 4 3 6]
[100]
0.0


100%|██████████| 1/1 [00:03<00:00,  3.68s/it]

0.2967



