In [10]:
import requests

try:
    from tqdm import tqdm
except ImportError:
    tqdm = lambda x, total, unit: x  # If tqdm doesn't exist, replace it with a function that does nothing
    print('**** Could not import tqdm. Please install tqdm for download progressbars! (pip install tqdm) ****')

# Python2 compatibility
try:
    input = raw_input
except NameError:
    pass

download_dict = {
    '1) Kuzushiji-MNIST (10 classes, 28x28, 70k examples)': {
        '1) MNIST data format (ubyte.gz)':
            ['http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-images-idx3-ubyte.gz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-labels-idx1-ubyte.gz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-images-idx3-ubyte.gz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-labels-idx1-ubyte.gz'],
        '2) NumPy data format (.npz)':
            ['http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-imgs.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-labels.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-imgs.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-labels.npz'],
    },
    '2) Kuzushiji-49 (49 classes, 28x28, 270k examples)': {
        '1) NumPy data format (.npz)':
            ['http://codh.rois.ac.jp/kmnist/dataset/k49/k49-train-imgs.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-train-labels.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-test-imgs.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-test-labels.npz'],
    },
    '3) Kuzushiji-Kanji (3832 classes, 64x64, 140k examples)': {
        '1) Folders of images (.tar)':
            ['http://codh.rois.ac.jp/kmnist/dataset/kkanji/kkanji.tar'],
    }

}

# Download a list of files
def download_list(url_list):
    for url in url_list:
        path = url.split('/')[-1]
        r = requests.get(url, stream=True)
        with open(path, 'wb') as f:
            total_length = int(r.headers.get('content-length'))
            print('Downloading {} - {:.1f} MB'.format(path, (total_length / 1024000)))

            for chunk in tqdm(r.iter_content(chunk_size=1024), total=int(total_length / 1024) + 1, unit="KB"):
                if chunk:
                    f.write(chunk)
    print('All dataset files downloaded!')

# Ask the user about which path to take down the dict
def traverse_dict(d):
    print('Please select a download option:')
    keys = sorted(d.keys())  # Print download options
    for key in keys:
        print(key)

    userinput = input('> ').strip()

    try:
        selection = int(userinput) - 1
    except ValueError:
        print('Your selection was not valid')
        traverse_dict(d)  # Try again if input was not valid
        return

    selected = keys[selection]

    next_level = d[selected]
    if isinstance(next_level, list):  # If we've hit a list of downloads, download that list
        download_list(next_level)
    else:
        traverse_dict(next_level)     # Otherwise, repeat with the next level

traverse_dict(download_dict)

Please select a download option:
1) Kuzushiji-MNIST (10 classes, 28x28, 70k examples)
2) Kuzushiji-49 (49 classes, 28x28, 270k examples)
3) Kuzushiji-Kanji (3832 classes, 64x64, 140k examples)


>  1


Please select a download option:
1) MNIST data format (ubyte.gz)
2) NumPy data format (.npz)


>  2


Downloading kmnist-train-imgs.npz - 18.0 MB


100%|██████████| 17954/17954 [00:09<00:00, 1880.36KB/s]


Downloading kmnist-train-labels.npz - 0.0 MB


100%|██████████| 30/30 [00:00<00:00, 263.09KB/s]


Downloading kmnist-test-imgs.npz - 3.0 MB


100%|██████████| 3008/3008 [00:02<00:00, 1340.81KB/s]


Downloading kmnist-test-labels.npz - 0.0 MB


100%|██████████| 6/6 [00:00<?, ?KB/s]

All dataset files downloaded!





In [11]:
#Specify the directory for input data and output pickles to save the model parameters
#see comments for more detail on pickes in the last cell.. 
datadir='./'

#dataset = 1 for KMNIST, 2 for K49
dataset = 1

#Dataset 2(K49) may take longer computation time than one day).
#You may test with small dataset by setting the following variable to 'True'
small_dataset = False

#if the above parameter is set to be 'True', then only the following number of instances will be used for training.
#number of training data in original dataset 1 and 2 is 60000 and 232365 respectively.
number_of_instances_for_small_dataset = 1000 

#multi-class decision function. While an 'ovr' option is fast. an'ovo' optionis very slow, but, it classifties better for unbalanced numbers of instances among classes.  
decision_function_shape="ovr"

# Make sure that the following files are set in 'datadir' directory.

if dataset == 1:
    #Dataset 1 KMNNIST 10
    filename_train_img   = "kmnist-train-imgs"
    filename_train_label = "kmnist-train-labels"
    filename_test_img    = "kmnist-test-imgs"
    filename_test_label  = "kmnist-test-labels"
    pickles_id = "data1"
elif dataset == 2:
    #Dataset 2 K49
    filename_train_img   = "k49-train-imgs"
    filename_train_label = "k49-train-labels"
    filename_test_img    = "k49-test-imgs"
    filename_test_label  = "k49-test-labels"
    pickles_id = "data2"

In [12]:
import numpy as np
import os

np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pickle
from sklearn.svm import SVC
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

In [13]:
################################
### Data Input
################################

X_train = np.load(datadir+filename_train_img+'.npz')['arr_0']
y_train = np.load(datadir+filename_train_label+'.npz')['arr_0']
X_test = np.load(datadir+filename_test_img+'.npz')['arr_0']
y_test = np.load(datadir+filename_test_label+'.npz')['arr_0']


X_train = np.reshape(X_train,(-1,28*28))
X_test  = np.reshape(X_test, (-1,28*28))


ntrain = len(X_train)
ntest = len(X_test)

np.random.seed(42)
rnd_idx = np.random.permutation(ntrain)
X_train = X_train[rnd_idx]
y_train = y_train[rnd_idx]

X_train_rs = X_train[:1000]
y_train_rs = y_train[:1000]

if small_dataset:
    X_train = X_train[:number_of_instances_for_small_dataset]
    y_train = y_train[:number_of_instances_for_small_dataset]

#scaler = StandardScaler()
X_train_scaled = X_train/255
X_test_scaled = X_test/255

X_train_scaled_rs = X_train_rs/255

print (len(X_train),len(y_train))

60000 60000


In [14]:
################################
### Random Search to find best 'C' and 'gamma'　
### (This process can be skipped. These parameters are given in the next cell.
################################ 

svm_clf = SVC(decision_function_shape=decision_function_shape)
param_distributions = {"gamma": reciprocal(0.001, 0.1), "C": uniform(1, 10)}
rnd_search_cv = RandomizedSearchCV(svm_clf, param_distributions, n_iter=10, verbose=2)
rnd_search_cv.fit(X_train_scaled_rs, y_train_rs)
print (rnd_search_cv.best_estimator_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END ...C=3.5988934774124326, gamma=0.001513186272679838; total time=   0.1s
[CV] END ...C=3.5988934774124326, gamma=0.001513186272679838; total time=   0.1s
[CV] END ...C=3.5988934774124326, gamma=0.001513186272679838; total time=   0.0s
[CV] END ...C=3.5988934774124326, gamma=0.001513186272679838; total time=   0.1s
[CV] END ...C=3.5988934774124326, gamma=0.001513186272679838; total time=   0.1s
[CV] END .....C=3.055855399572819, gamma=0.06719156480223124; total time=   0.2s
[CV] END .....C=3.055855399572819, gamma=0.06719156480223124; total time=   0.2s
[CV] END .....C=3.055855399572819, gamma=0.06719156480223124; total time=   0.2s
[CV] END .....C=3.055855399572819, gamma=0.06719156480223124; total time=   0.2s
[CV] END .....C=3.055855399572819, gamma=0.06719156480223124; total time=   0.2s
[CV] END ....C=1.107005900704854, gamma=0.001848939794318145; total time=   0.1s
[CV] END ....C=1.107005900704854, gamma=0.001848

In [15]:
################################
### SVM Definition with given parameters (C, gamma)
################################ 
if dataset == 1:
    #Dataset 1 KMNNIST 10
    C = 10.572971700825061
    gamma = 0.01381957639278581
elif dataset == 2:
    #Dataset 2 K49
    C = 4.152705171689228
    gamma = 0.006783091541660457
    
svm_clf = SVC(C=C, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=decision_function_shape, degree=3, gamma=gamma,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [16]:
#SVM Training (fitting)
svm_clf.fit(X_train_scaled, y_train)
print ('fitting done')
with open(datadir+'Kuzushi_SVC_class_after_fitting_'+pickles_id+'.pickle', mode='wb') as f:
    pickle.dump(svm_clf,f)

#SVM Prediction for the training dataset    
y_pred = svm_clf.predict(X_train_scaled)
with open(datadir+'Kuzushi_y_predicted_for_training_data_'+pickles_id+'.pickle', mode='wb') as f2:
    pickle.dump(y_pred,f2)
acc_train=accuracy_score(y_train, y_pred)
print ('accuracy on training data(not class average)',acc_train)

#SVM Prediction for the test dataset
y_pred_test = svm_clf.predict(X_test_scaled)
with open(datadir+'Kuzushi_y_predicted_for_test_data_'+pickles_id+'.pickle', mode='wb') as f3:
    pickle.dump(y_pred_test,f3)
acc_test=accuracy_score(y_test, y_pred_test)
print ('accuracy on test data(not class average)',acc_test)

fitting done
accuracy on training data(not class average) 1.0
accuracy on test data(not class average) 0.929


In [17]:
if dataset==2:
    accuracy_train = 0
    accuracy_test = 0
    for i in range(0,49):
        y_index_train = np.where(y_train==i)
        y_index_test = np.where(y_test==i)
        acc_train=accuracy_score(y_train[y_index_train], y_pred[y_index_train])
        acc_test=accuracy_score(y_test[y_index_test], y_pred_test[y_index_test])
        accuracy_train+=acc_train/49
        accuracy_test+=acc_test/49
    print ('accuracy on training data(class averaged)',accuracy_train)
    print ('accuracy on test data(class averaged)',accuracy_test)

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

rforest_clf = RandomForestClassifier(n_estimators=300, criterion='gini', random_state=0)


In [38]:
#Random Forest Training (fitting)
rforest_clf.fit(X_train_scaled, y_train)
print ('fitting done')

fitting done


In [39]:
with open(datadir+'Kuzushi_random_forest_class_after_fitting_'+pickles_id+'.pickle', mode='wb') as f4:
    pickle.dump(rforest_clf,f4)

#SVM Prediction for the training dataset    
y_pred = rforest_clf.predict(X_train_scaled)
with open(datadir+'Kuzushi_y_predicted_for_training_data_'+pickles_id+'.pickle', mode='wb') as f5:
    pickle.dump(y_pred,f5)
acc_train=accuracy_score(y_train, y_pred)
print ('accuracy on training data(not class average)',acc_train)

#SVM Prediction for the test dataset
y_pred_test = rforest_clf.predict(X_test_scaled)
with open(datadir+'Kuzushi_y_predicted_for_test_data_'+pickles_id+'.pickle', mode='wb') as f6:
    pickle.dump(y_pred_test,f6)
acc_test=accuracy_score(y_test, y_pred_test)
print ('accuracy on test data(not class average)',acc_test)

accuracy on training data(not class average) 1.0
accuracy on test data(not class average) 0.8623


In [57]:
from sklearn.ensemble import AdaBoostClassifier

adaboost_clf = AdaBoostClassifier(n_estimators=100, learning_rate= 1, random_state=0)

In [58]:
#Adaboost Training (fitting)
adaboost_clf.fit(X_train_scaled, y_train)
print ('fitting done')

fitting done


In [59]:
with open(datadir+'Kuzushi_adaboost_class_after_fitting_'+pickles_id+'.pickle', mode='wb') as f7:
    pickle.dump(adaboost_clf,f7)

#SVM Prediction for the training dataset    
y_pred = adaboost_clf.predict(X_train_scaled)
with open(datadir+'Kuzushi_y_predicted_for_training_data_'+pickles_id+'.pickle', mode='wb') as f8:
    pickle.dump(y_pred,f8)
acc_train=accuracy_score(y_train, y_pred)
print ('accuracy on training data(not class average)',acc_train)

#SVM Prediction for the test dataset
y_pred_test = adaboost_clf.predict(X_test_scaled)
with open(datadir+'Kuzushi_y_predicted_for_test_data_'+pickles_id+'.pickle', mode='wb') as f9:
    pickle.dump(y_pred_test,f9)
acc_test=accuracy_score(y_test, y_pred_test)
print ('accuracy on test data(not class average)',acc_test)

accuracy on training data(not class average) 0.6052
accuracy on test data(not class average) 0.5216


In [68]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier(n_neighbors=3)

In [69]:
#Adaboost Training (fitting)
knn_clf.fit(X_train_scaled, y_train)
print ('fitting done')

fitting done


In [71]:
with open(datadir+'Kuzushi_knn_class_after_fitting_'+pickles_id+'.pickle', mode='wb') as f10:
    pickle.dump(knn_clf,f10)

#SVM Prediction for the training dataset    
y_pred = adaboost_clf.predict(X_train_scaled)
with open(datadir+'Kuzushi_y_predicted_for_training_data_'+pickles_id+'.pickle', mode='wb') as f11:
    pickle.dump(y_pred,f11)
acc_train=accuracy_score(y_train, y_pred)
print ('accuracy on training data(not class average)',acc_train)

#SVM Prediction for the test dataset
y_pred_test = knn_clf.predict(X_test_scaled)
with open(datadir+'Kuzushi_y_predicted_for_test_data_'+pickles_id+'.pickle', mode='wb') as f12:
    pickle.dump(y_pred_test,f12)
acc_test=accuracy_score(y_test, y_pred_test)
print ('accuracy on test data(not class average)',acc_test)

accuracy on training data(not class average) 0.6052
accuracy on test data(not class average) 0.9143
