In [1]:
import os, glob, pathlib, shutil, random
from config import Config
import pandas as pd
import numpy as np
import _pickle as pk
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import warnings
warnings.simplefilter("ignore")
import datetime

In [2]:
# Seed handed to every estimator below so repeated runs start from the same state.
mRS = 42

# Registry of untrained classifiers, keyed by the short name that is later
# used for checkpoint and result file names.
clfs = {
    # Logistic Regression
    "LR": LogisticRegression(random_state=mRS),
    # Support Vector Classification
    "SVC": SVC(random_state=mRS),
    # Stochastic Gradient Descent
    "SGD": SGDClassifier(random_state=mRS),
    # Perceptron
    "Perceptron": Perceptron(random_state=mRS),
    # Multi-layer Perceptron
    "MLP": MLPClassifier(random_state=mRS),
}

# Project configuration object; the cells below read its path attributes.
cfg = Config()

In [3]:
def getCurrentTime():
    """Return the current local date and time as a string.

    The format is the one produced by ``str()`` on a ``datetime`` object,
    i.e. ISO 8601 with a space between the date and the time.
    """
    now = datetime.datetime.now()
    # str(datetime) is defined as isoformat with a space separator.
    return now.isoformat(sep=" ")

def fix_parent_path(rootPath):
    """Make sure the directory that will contain ``rootPath`` exists.

    Missing intermediate directories are created as well; an already
    existing directory is left untouched. ``rootPath`` itself is not created.
    """
    parent_dir = pathlib.Path(os.path.dirname(rootPath))
    parent_dir.mkdir(parents=True, exist_ok=True)
    
def save_raw_clf(kind, clf, cfg):
    """Pickle ``clf`` to ``<cfg.Checkpoint>/<kind>.pk``.

    Args:
        kind: Short classifier name; used as the file stem.
        clf: Object to serialize.
        cfg: Configuration object exposing a ``Checkpoint`` directory path.
    """
    checkpoint_file = os.path.join(cfg.Checkpoint, kind + '.pk')
    # The checkpoint directory may not exist yet on a first run.
    fix_parent_path(checkpoint_file)
    with open(checkpoint_file, 'wb') as out_stream:
        pk.dump(clf, out_stream)

def load_raw_clf(kind, cfg):
    """Load and return the object pickled at ``<cfg.Checkpoint>/<kind>.pk``.

    Args:
        kind: Short classifier name; used as the file stem.
        cfg: Configuration object exposing a ``Checkpoint`` directory path.

    Returns:
        Whatever object was stored in the checkpoint file.
    """
    checkpoint_file = os.path.join(cfg.Checkpoint, kind + '.pk')
    # NOTE: unpickling executes code embedded in the file; only load
    # checkpoints that this notebook wrote itself.
    with open(checkpoint_file, 'rb') as in_stream:
        return pk.load(in_stream)

In [4]:
print("\nSaving checkpoints for ML classifiers...\n")
# Persist every untrained classifier so each evaluation can reload a fresh copy.
for kind in clfs:
    save_raw_clf(kind, clfs[kind], cfg)


Saving checkpoints for ML classifiers...



In [5]:
def load_dataset(filePath):
    """Read a CSV file and split it into features and labels.

    The last column of the file is taken as the label column; every other
    column is a feature.

    Args:
        filePath: Path of the CSV file to read.

    Returns:
        A pair ``(X_all, y_all)`` where ``X_all`` is a NumPy array of the
        feature columns and ``y_all`` is the pandas Series of the last column.
    """
    frame = pd.read_csv(filePath, index_col=None)
    label_column = frame.columns[-1]
    y_all = frame.iloc[:, -1]
    X_all = frame.drop(columns=label_column).to_numpy()
    return X_all, y_all

In [6]:
def evaluate_model(cfg, kind, dataPath):
    """Fit the ``kind`` classifier on a dataset and return its accuracy.

    A fresh copy of the classifier is loaded from its checkpoint, fitted on
    the whole dataset at ``dataPath`` and then scored on that same data.

    Args:
        cfg: Configuration object passed through to ``load_raw_clf``.
        kind: Short classifier name identifying the checkpoint to load.
        dataPath: Path of the CSV dataset.

    Returns:
        The accuracy score of the fitted classifier on the data it was
        fitted on.
    """
    features, labels = load_dataset(dataPath)
    model = load_raw_clf(kind, cfg)
    model.fit(features, labels)
    # NOTE(review): predictions are made on the training data itself, so
    # this is training accuracy, not a held-out estimate -- confirm intended.
    predictions = model.predict(features)
    return accuracy_score(labels, predictions)

In [7]:
def evaluate_transformation(cfg, rootPath, kind, dataset):
    """Evaluate one classifier on every transformed variant of a dataset.

    Each sub-directory of ``rootPath`` is treated as one transformation and
    is expected to contain a file named ``dataset``.

    Args:
        cfg: Configuration object passed through to ``evaluate_model``.
        rootPath: Directory whose sub-directories hold the transformed data.
        kind: Short classifier name identifying the checkpoint to load.
        dataset: File name of the dataset inside each transformation directory.

    Returns:
        A dict mapping each transformation directory name to the score
        returned by ``evaluate_model``, ordered by directory path.
    """
    traDict = {}
    # glob returns entries in a filesystem-dependent order; sorting keeps the
    # key order (and so the column order of the saved results) reproducible.
    for traPath in sorted(glob.glob(os.path.join(rootPath, "*"))):
        # Stray files next to the transformation folders cannot contain a
        # dataset, so skip them instead of failing on a bogus path.
        if not os.path.isdir(traPath):
            continue
        # basename copes with mixed or alternate path separators, unlike a
        # manual split on os.path.sep.
        traKind = os.path.basename(traPath)
        traDict[traKind] = evaluate_model(cfg, kind, os.path.join(traPath, dataset))
    return traDict

In [8]:
print("\nComputing results on transformations...\n")
# File names of the original datasets, leaving out three that are not evaluated.
datasets = [
    d
    for d in map(os.path.basename, glob.glob(os.path.join(cfg.ORG_DATA_PATH, "*.csv")))
    if d not in ['Census_Income.csv', 'Occupancy.csv', 'Communities_Crime.csv']
]

# One pair of result tables (row transformations / column transformations)
# is written per classifier.
for kind in clfs:
    rowList, colList = [], []
    for dataset in datasets:
        # Baseline score on the untransformed data, shared by both tables.
        orgResult = evaluate_model(cfg, kind, os.path.join(cfg.ORG_DATA_PATH, dataset))

        rowDict = {
            'dataset': dataset,
            'original': orgResult,
            **evaluate_transformation(cfg, cfg.MM_ROWS_PATH, kind, dataset),
        }
        rowList.append(rowDict)

        colDict = {
            'dataset': dataset,
            'original': orgResult,
            **evaluate_transformation(cfg, cfg.MM_COLS_PATH, kind, dataset),
        }
        colList.append(colDict)

        print("[{}] Completed {} on {}".format(getCurrentTime(), kind, dataset))

    # Row-transformation results for this classifier.
    rowResultPath = os.path.join(cfg.RLT_TRA_ROWS, kind + ".csv")
    fix_parent_path(rowResultPath)
    pd.DataFrame(rowList).to_csv(rowResultPath, index=False)

    # Column-transformation results for this classifier.
    colResultPath = os.path.join(cfg.RLT_TRA_COLS, kind + ".csv")
    fix_parent_path(colResultPath)
    pd.DataFrame(colList).to_csv(colResultPath, index=False)


Computing results on transformations...

[2020-04-15 23:08:14.248643] Completed LR on SE_Process.csv
[2020-04-15 23:08:14.286416] Completed LR on Immuno_Therapy.csv
[2020-04-15 23:08:14.400304] Completed LR on German_Credit.csv
[2020-04-15 23:08:14.464546] Completed LR on Lung_Cancer.csv
[2020-04-15 23:08:14.841010] Completed LR on Voice_Rehabilitation.csv
[2020-04-15 23:08:14.895318] Completed LR on Breast_Cancer.csv
[2020-04-15 23:08:16.156220] Completed SVC on SE_Process.csv
[2020-04-15 23:08:16.223467] Completed SVC on Immuno_Therapy.csv
[2020-04-15 23:08:16.970560] Completed SVC on German_Credit.csv
[2020-04-15 23:08:17.040241] Completed SVC on Lung_Cancer.csv
[2020-04-15 23:08:17.406920] Completed SVC on Voice_Rehabilitation.csv
[2020-04-15 23:08:17.590046] Completed SVC on Breast_Cancer.csv
[2020-04-15 23:08:17.821168] Completed SGD on SE_Process.csv
[2020-04-15 23:08:17.854709] Completed SGD on Immuno_Therapy.csv
[2020-04-15 23:08:17.957486] Completed SGD on German_Credit.csv
