In [1]:
import os
import sys  
import pandas as pd 
import numpy as np 
import time
import json 

from sklearn import svm 
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn import metrics
  
HYPERPARAMS = {'C': 5, 'gamma': 0.01}  

def svm_prediction(X_train, y_train, X_test, y_test):
    print("start prediction")
    model = svm.SVC(**HYPERPARAMS)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("done prediction")
    return y_pred 

def svm_prediction_pipeline(root, sub_folder):
    path = os.path.join(root+"imputed/"+sub_folder+'/')
    print("start reading data at path", path)

    get_Xpath = lambda train_test, algo: os.path.join(path+'{}_{}.csv'.format(train_test, algo))
    get_ypath = lambda train_test: os.path.join(path+'y_{}.csv'.format(train_test))

    softImpute_Xtrain_path = get_Xpath('train','softImpute')
    softImpute_Xtest_path = get_Xpath('train','softImpute')
    softImpute_ytrain_path = get_ypath('train')
    softImpute_ytest_path = get_ypath('test')

    impDi_Xtrain_path = get_Xpath('train','impDi') 
    impDi_Xtest_path = get_Xpath('train','impDi') 
    impDi_ytrain_path = get_ypath('train')
    impDi_ytest_path = get_ypath('test')

    start_reading = time.time()
    softImpute_Xtrain = pd.read_csv(softImpute_Xtrain_path)
    softImpute_ytrain = pd.read_csv(softImpute_ytrain_path)
    softImpute_Xtest = pd.read_csv(softImpute_Xtest_path)
    softImpute_ytest = pd.read_csv(softImpute_ytest_path)
    
    impDi_Xtrain = pd.read_csv(impDi_Xtrain_path)
    impDi_ytrain = pd.read_csv(impDi_ytrain_path)
    impDi_Xtest = pd.read_csv(impDi_Xtest_path)
    impDi_ytest = pd.read_csv(impDi_ytest_path)  

    softImpute_ytrain = softImpute_ytrain.values.ravel()
    softImpute_ytest = softImpute_ytest.values.ravel()
    impDi_ytrain = impDi_ytrain.values.ravel()
    impDi_ytest = impDi_ytest.values.ravel()
    print(softImpute_Xtrain.shape)
    print(softImpute_ytrain.shape)
    print("complete reading data in subfoler {} \n  after: {} second".format(
        sub_folder, 
        time.time()-start_reading)
    )

    # prediction 
    pred_file_path = lambda algo: os.path.join(root +'prediction_output/', sub_folder,'/',"".join([algo, '.csv']))
    acc_file_path = os.path.join(root +'accuracy/', ''.join([sub_folder, '.json']))
    print(pred_file_path("softImpute"))
    print(acc_file_path)

    
    start_prediction = time.time()
    softImpute_ypred = svm_prediction(
             softImpute_Xtrain, 
             softImpute_ytrain, 
             softImpute_Xtest, 
             softImpute_ytest
            )
    print("done prediction, start save ")
    softImpute_ypred.to_csv(prediction_path('softImpute'))
    softImputeAccuracy = metrics.accuracy_score(softImpute_ytest, softImpute_ypred)
    print("SoftImpute Predition Time {} mins".format((time.time()-start_prediction)/60))

    start_prediction = time.time()
    impDi_ypred = svm_prediction(
             impDi_Xtrain, 
             impDi_ytrain, 
             impDi_Xtest, 
             impDi_ytest
            )
    
    softImpute_ypred.to_csv(prediction_path('impDi'))
    impDiAccuracy = metrics.accuracy_score(impDi_ytest, impDi_ypred)

    print("Predition Time {} mins".format((time.time()-start_prediction)/60))

    acc = {
            sub_folder: {
                "softImpute": softImputeAccuracy, 
                "impDi" : impDiAccuracy
            }
        }

    print("acc", acc)
    with open(acc_file_path,'w') as f:
        json.dump(acc, f)

    return acc

  

In [None]:

root = '../../data/mnist/'
accuracies = {}
acc_path = '../../data/mnist/accuracy/v1.json'
sub_folders = os.listdir(os.path.join(root, 'imputed/'))
print(sub_folders)
acc = svm_prediction_pipeline(root, sub_folders[0]) 
# for sub_folder in sub_folders:
#     acc = svm_prediction_pipeline(root, sub_folder)
#     accuracies.update(acc)

# with open(acc_path,'w') as f:
#     json.dump(accuracies, f)

# print(accuracies)


['threshold_50_deletedWidthHeightPc_4040_noImagePc_50', 'threshold_50_deletedWidthHeightPc_8080_noImagePc_50', 'threshold_50_deletedWidthHeightPc_6060_noImagePc_50', 'threshold_30_deletedWidthHeightPc_4040_noImagePc_20', 'threshold_30_deletedWidthHeightPc_8080_noImagePc_20', 'threshold_30_deletedWidthHeightPc_6060_noImagePc_20', 'threshold_30_deletedWidthHeightPc_4040_noImagePc_50', 'threshold_30_deletedWidthHeightPc_8080_noImagePc_50', 'threshold_30_deletedWidthHeightPc_6060_noImagePc_50', 'threshold_50_deletedWidthHeightPc_4040_noImagePc_20', 'threshold_50_deletedWidthHeightPc_8080_noImagePc_20', 'threshold_50_deletedWidthHeightPc_6060_noImagePc_20']
start reading data at path ../../data/mnist/imputed/threshold_50_deletedWidthHeightPc_4040_noImagePc_50/
(60000, 784)
(60000,)
complete reading data in subfoler threshold_50_deletedWidthHeightPc_4040_noImagePc_50 
  after: 22.659607887268066 second
/softImpute.csv
../../data/mnist/accuracy/threshold_50_deletedWidthHeightPc_4040_noImagePc