In [None]:
import os
import struct
import sys
import time
from array import array
from os.path import join

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline


In [None]:
os.listdir(join(input_path))

In [None]:
input_path ="../../data/mnist/raw" 
training_images_filepath = join(input_path, 'train-images-idx3-ubyte')
training_labels_filepath = join(input_path, 'train-labels-idx1-ubyte')
test_images_filepath = join(input_path, 't10k-images-idx3-ubyte')
test_labels_filepath = join(input_path, 't10k-labels-idx1-ubyte') 

In [None]:
def read_images_labels(images_filepath, labels_filepath):
    """Read one MNIST split stored as a pair of IDX binary files.

    Parameters
    ----------
    images_filepath : str
        Path of the image file; its header must start with magic number 2051,
        followed by the image count, row count and column count.
    labels_filepath : str
        Path of the label file; its header must start with magic number 2049.

    Returns
    -------
    images : list of numpy.ndarray
        One 2-D ``uint8`` array of shape ``(rows, cols)`` per image, in file order.
    labels : array.array
        Type code ``"B"``; every byte that follows the label header.

    Raises
    ------
    ValueError
        If either magic number is wrong, or the image file holds fewer pixel
        bytes than its header declares.
    """
    with open(labels_filepath, 'rb') as file:
        # Header: two big-endian unsigned 32-bit ints (magic number, item count).
        magic, _label_count = struct.unpack(">II", file.read(8))
        if magic != 2049:
            raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
        labels = array("B", file.read())

    with open(images_filepath, 'rb') as file:
        # Header: four big-endian unsigned 32-bit ints (magic, count, rows, cols).
        magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
        if magic != 2051:
            raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
        image_data = array("B", file.read())

    expected_bytes = size * rows * cols
    if len(image_data) < expected_bytes:
        raise ValueError(
            'Image file is truncated, expected {} pixel bytes, got {}'.format(
                expected_bytes, len(image_data)))

    # Use the dimensions from the header rather than assuming 28x28 images.
    pixels = np.array(image_data[:expected_bytes], dtype=np.uint8).reshape(size, rows, cols)
    images = list(pixels)

    return images, labels

In [None]:
# Load both raw MNIST splits.
X_train, y_train = read_images_labels(training_images_filepath, training_labels_filepath)
X_test, y_test = read_images_labels(test_images_filepath, test_labels_filepath)

# The reader returns one 2-D image per sample, but scikit-learn estimators need
# a 2-D (n_samples, n_features) matrix, so flatten every image into a single row.
X_train = np.asarray(X_train).reshape(len(X_train), -1)
X_test = np.asarray(X_test).reshape(len(X_test), -1)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)

In [None]:
# Candidate values for the RBF-kernel SVM; every (gamma, C) pair is evaluated.
hyper_params = [
    {
        'gamma': [1e-2, 1e-3, 1e-4],
        'C': [5, 10, 15],
    }
]

# Shuffled 5-fold split with a fixed seed so the folds are repeatable.
folds = KFold(n_splits=5, shuffle=True, random_state=1)

start = time.time()
model = svm.SVC(kernel="rbf")

# Cross-validated grid search scored on accuracy; n_jobs=-1 runs fits in parallel.
search_options = dict(
    scoring='accuracy',
    cv=folds,
    n_jobs=-1,
    return_train_score=True,
    verbose=10,
)
model_cv = GridSearchCV(model, hyper_params, **search_options)

# Run the search and report the wall-clock time it took.
model_cv.fit(X_train, y_train)
elapsed_minutes = (time.time() - start) / 60
print("time in minutes is: ", elapsed_minutes)

# Keep the winning configuration and its mean cross-validated accuracy.
best_score, best_hyperparams = model_cv.best_score_, model_cv.best_params_
 

In [None]:
# Print the entries of the raw MNIST directory.
raw_path = "../../data/mnist/raw"
raw_entries = os.listdir(raw_path)
print(raw_entries)

In [None]:
# Directory holding one sub-folder per imputation experiment setting.
root = "../../data/mnist/imputed"
# os.listdir returns entries in arbitrary order; sort them so that positional
# access such as experiment_settings[0] picks the same folder on every run.
experiment_settings = sorted(os.listdir(root))

In [None]:
# Print each experiment-setting folder name on its own line.
for setting_name in experiment_settings:
    print(setting_name)

In [None]:
# Inspect the files stored for the first listed experiment setting.
first_setting_dir = join(root, experiment_settings[0])
os.listdir(first_setting_dir)

In [None]:
def svm_pipeline():
    """Unfinished placeholder: binds an experiment folder name locally and returns None.

    NOTE(review): the body does no work yet; the name suggests it was meant to
    wrap the SVM experiment -- confirm whether it should be completed or removed.
    """
    experiment_folder = 'threshold_50_deletedWidthHeightPc_4040_noImagePc_50'
    return None


In [None]:
# Experiment folder to evaluate.
sub_folder = 'threshold_50_deletedWidthHeightPc_4040_noImagePc_50'
# The empty last component keeps the trailing separator that the original
# string concatenation produced, so `path` has the same value as before.
path = os.path.join(root, sub_folder, '')


def get_Xpath(train_test, algo):
    """Return the path of the '<split>_<algo>.csv' feature file inside `path`."""
    return os.path.join(path, '{}_{}.csv'.format(train_test, algo))


def get_ypath(train_test):
    """Return the path of the 'y_<split>.csv' label file inside `path`."""
    return os.path.join(path, 'y_{}.csv'.format(train_test))


# Each algorithm must pair train features with train labels and test features
# with test labels; the softImpute paths previously mixed the splits up.
softImpute_Xtrain_path = get_Xpath('train', 'softImpute')
softImpute_Xtest_path = get_Xpath('test', 'softImpute')
softImpute_ytrain_path = get_ypath('train')
softImpute_ytest_path = get_ypath('test')

impDi_Xtrain_path = get_Xpath('train', 'impDi')
impDi_Xtest_path = get_Xpath('test', 'impDi')
impDi_ytrain_path = get_ypath('train')
impDi_ytest_path = get_ypath('test')


start_reading = time.time()
softImpute_Xtrain = pd.read_csv(softImpute_Xtrain_path)
softImpute_ytrain = pd.read_csv(softImpute_ytrain_path)
softImpute_Xtest = pd.read_csv(softImpute_Xtest_path)
softImpute_ytest = pd.read_csv(softImpute_ytest_path)

impDi_Xtrain = pd.read_csv(impDi_Xtrain_path)
impDi_ytrain = pd.read_csv(impDi_ytrain_path)
impDi_Xtest = pd.read_csv(impDi_Xtest_path)
impDi_ytest = pd.read_csv(impDi_ytest_path)

# Guard against the features and labels of a split getting out of step.
assert len(softImpute_Xtrain) == len(softImpute_ytrain), "softImpute train X/y row counts differ"
assert len(softImpute_Xtest) == len(softImpute_ytest), "softImpute test X/y row counts differ"
assert len(impDi_Xtrain) == len(impDi_ytrain), "impDi train X/y row counts differ"
assert len(impDi_Xtest) == len(impDi_ytest), "impDi test X/y row counts differ"

print("complete reading imputed file after: {} second".format(time.time()-start_reading))

In [None]:
# start_reading = time.time()
# softImpute_Xtrain = pd.read_csv(softImpute_Xtrain_path)
# softImpute_ytrain = pd.read_csv(softImpute_ytrain_path)
# softImpute_Xtest = pd.read_csv(softImpute_Xtest_path)
# softImpute_ytest = pd.read_csv(softImpute_ytest_path)

# impDi_Xtrain = pd.read_csv(impDi_Xtrain_path)
# impDi_ytrain = pd.read_csv(impDi_ytrain_path)
# impDi_Xtest = pd.read_csv(impDi_Xtest_path)
# impDi_ytest = pd.read_csv(impDi_ytest_path)  

# print("complete reading imputed file after: {} second".format(time.time()-start_reading))  

In [None]:
# Hyper-parameter search on the softImpute-imputed training split.

# Labels are read as a one-column frame; ravel() turns them into a flat vector.
X_train, y_train = softImpute_Xtrain, softImpute_ytrain.values.ravel()

# Candidate values for the RBF-kernel SVM; every (gamma, C) pair is evaluated.
hyper_params = [
    {
        'gamma': [1e-3, 1e-4],
        'C': [5, 10],
    }
]

# Shuffled 5-fold split with a fixed seed so the folds are repeatable.
folds = KFold(n_splits=5, shuffle=True, random_state=1)

start = time.time()
model = svm.SVC(kernel="rbf")

# Cross-validated grid search scored on accuracy; n_jobs=-1 runs fits in parallel.
search_options = dict(
    scoring='accuracy',
    cv=folds,
    n_jobs=-1,
    return_train_score=True,
    verbose=10,
)
model_cv = GridSearchCV(model, hyper_params, **search_options)

# Run the search and report the wall-clock time it took.
model_cv.fit(X_train, y_train)
elapsed_minutes = (time.time() - start) / 60
print("time in minutes is: ", elapsed_minutes)

In [None]:
# Parameter combination selected by the fitted grid search `model_cv`;
# the bare name on the last line displays it as the cell output.
best_hyperparams = model_cv.best_params_ 
best_hyperparams

In [None]:
# Report the winning configuration and its mean cross-validated score.
best_score, best_hyperparams = model_cv.best_score_, model_cv.best_params_

summary_template = "The best test score is {0} corresponding to hyperparameters {1}"
print(summary_template.format(best_score, best_hyperparams))

In [None]:
# Hard-coded hyper-parameters; running this cell replaces whatever the grid
# search stored in `best_hyperparams`.
# NOTE(review): presumably a shortcut to skip the slow search -- confirm it
# still matches the search result.
best_hyperparams = dict(C=10, gamma=0.001)

In [None]:
# Cross-validate an SVM built from the selected hyper-parameters. `model` is
# still the default-parameter SVC created for the grid search, so scoring it
# would not reflect the tuned configuration.
tuned_model = svm.SVC(kernel="rbf", **best_hyperparams)
scores = cross_val_score(tuned_model, X_train, y_train, cv=5, n_jobs=-1, verbose=10)
print(scores)

In [None]:
# Mean accuracy across the cross-validation folds.
scores.mean()

In [None]:
# Spread (population standard deviation) of the per-fold accuracies.
scores.std()

In [None]:
# Train on the impDi-imputed training split and score on its test split.
# The timer covers fitting, prediction and the accuracy computation.
start_prediction = time.time()

# Labels are read as one-column frames; ravel() turns them into flat vectors.
X_train, y_train = impDi_Xtrain, impDi_ytrain.values.ravel()
X_test, y_test = impDi_Xtest, impDi_ytest.values.ravel()

# fit() returns the estimator itself, so construction and fitting can be chained.
model = svm.SVC(**best_hyperparams).fit(X_train, y_train)
y_pred = model.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred)

elapsed_minutes = (time.time() - start_prediction)/60
print("fitting time: {} mins".format(elapsed_minutes))

In [None]:
# Sanity check: (number of rows, number of columns) of the test features.
np.shape(X_test)

In [None]:
# Accuracy of the predictions on the test split.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# old version