In [1]:
import os, glob, pathlib, shutil, random
from config import Config
import pandas as pd
import numpy as np
import _pickle as pk
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn import model_selection
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.simplefilter("ignore")
import datetime

In [2]:
# Project configuration object; supplies the data/result paths and the
# random seed (cfg.RS) used below.
cfg = Config()

# Unfitted classifiers, keyed by the short name that is also used for the
# checkpoint file and the per-classifier result CSV.
clfs = {
    "LR": LogisticRegression(random_state=cfg.RS),  # Logistic Regression
    "SVC": SVC(random_state=cfg.RS),  # Support Vector Classification
    "SGD": SGDClassifier(random_state=cfg.RS),  # Stochastic Gradient Descent
    "PERCEPTRON": Perceptron(random_state=cfg.RS),  # Perceptron
    "MLP": MLPClassifier(random_state=cfg.RS),  # Multi-layer Perceptron
    "KNN": KNeighborsClassifier(n_neighbors=3),  # k-Nearest Neighbors
    "DT": DecisionTreeClassifier(criterion='entropy', random_state=cfg.RS),  # Decision Tree
    "GNB": GaussianNB(),  # Gaussian Naive Bayes
    "RF": RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=cfg.RS),  # Random Forest
    "ADB": AdaBoostClassifier(n_estimators=100, random_state=cfg.RS),  # AdaBoost
}

# File names (not full paths) of every CSV under the original-data directory.
# glob.glob() gives no ordering guarantee (it depends on the file system), so
# the list is sorted to keep the row order of the result files reproducible.
datasets = sorted(
    os.path.basename(dataPath)
    for dataPath in glob.glob(os.path.join(cfg.ORG_DATA_PATH, "*.csv"))
)

# (transformed-data root, result directory) pairs: element 0 is scanned for
# transformed datasets, element 1 is where the result CSVs are written.
transformations = [
    (cfg.MM_ROWS_PATH, cfg.RLT_PER_ROWS),
    (cfg.MM_COLS_PATH, cfg.RLT_PER_COLS),
    (cfg.UB_DATA_PATH, cfg.RLT_UB_DATA),
    (cfg.NE_DATA_PATH, cfg.RLT_NE_DATA),
]

In [3]:
def getCurrentTime():
    """Return the current local date and time as a string.

    The text is identical to ``str(datetime.datetime.now())``, i.e. the ISO
    form with a space between the date and the time.
    """
    now = datetime.datetime.now()
    return now.isoformat(sep=' ')

def fix_parent_path(rootPath):
    """Ensure the directory that contains ``rootPath`` exists.

    Missing ancestor directories are created as well; an already existing
    directory is left untouched. ``rootPath`` itself is not created.
    """
    parent_dir = pathlib.Path(os.path.dirname(rootPath))
    parent_dir.mkdir(parents=True, exist_ok=True)
    
def save_raw_clf(kind, clf, cfg):
    """Pickle ``clf`` to ``<cfg.MODEL_CKPOINT>/<kind>.pk``.

    The checkpoint directory is created first if it does not exist yet.
    """
    file_name = kind + '.pk'
    checkpoint_file = os.path.join(cfg.MODEL_CKPOINT, file_name)
    fix_parent_path(checkpoint_file)
    with open(checkpoint_file, 'wb') as sink:
        pk.dump(clf, sink)

def load_raw_clf(kind, cfg):
    """Load and return the object pickled at ``<cfg.MODEL_CKPOINT>/<kind>.pk``."""
    checkpoint_file = os.path.join(cfg.MODEL_CKPOINT, kind + '.pk')
    with open(checkpoint_file, 'rb') as source:
        return pk.load(source)

In [4]:
# Persist every classifier in its unfitted state so that each evaluation can
# reload it from the checkpoint.
print("\nSaving checkpoints for ML classifiers...\n")
for kind, rawClf in clfs.items():
    save_raw_clf(kind, rawClf, cfg)


Saving checkpoints for ML classifiers...



In [5]:
def load_dataset(filePath):
    """Read a CSV file and split it into features and labels.

    The last column is taken as the labels; all remaining columns are the
    features.

    Returns:
        (X_all, y_all): the features as a NumPy array and the labels as a
        pandas Series.
    """
    frame = pd.read_csv(filePath, index_col=None)
    y_all = frame.iloc[:, -1]
    X_all = frame.drop(columns=frame.columns[-1]).to_numpy()
    return X_all, y_all

In [6]:
def evaluate_model(cfg, kind, dataPath):
    """Fit the checkpointed classifier ``kind`` on a dataset and score it.

    The classifier is reloaded from its pickle checkpoint, fitted on the
    whole dataset at ``dataPath`` and then asked to predict those same
    samples.

    Returns:
        The accuracy score of the predictions against the true labels.
    """
    features, labels = load_dataset(dataPath)
    model = load_raw_clf(kind, cfg)
    model.fit(features, labels)
    # NOTE: predictions are made on the very samples used for fitting, so the
    # returned value is a training-set accuracy, not a held-out estimate.
    predicted = model.predict(features)
    return accuracy_score(labels, predicted)

In [7]:
def evaluate_transformation(cfg, rootPath, kind, dataset):
    """Evaluate classifier ``kind`` on every transformed variant of ``dataset``.

    Each sub-directory of ``rootPath`` is treated as one transformation
    variant and is expected to contain a file named ``dataset``.

    Args:
        cfg: configuration object forwarded to ``evaluate_model``.
        rootPath: directory whose sub-directories hold the transformed data.
        kind: short classifier name (checkpoint key).
        dataset: dataset file name looked up inside each sub-directory.

    Returns:
        dict mapping each sub-directory name to the accuracy returned by
        ``evaluate_model``, in alphabetical order of the directory names.
    """
    traDict = {}
    # glob.glob() returns entries in a file-system dependent order; sorting
    # keeps the key order (and therefore the CSV column order) reproducible.
    for traPath in sorted(glob.glob(os.path.join(rootPath, "*"))):
        # Only directories can hold a transformed copy of the dataset; skip
        # stray files instead of failing when joining the dataset name.
        if not os.path.isdir(traPath):
            continue
        traKind = os.path.basename(traPath)
        traDict[traKind] = evaluate_model(cfg, kind, os.path.join(traPath, dataset))
    return traDict

In [8]:
def get_result_as_dict(cfg, kind, dataset, rootPath):
    """Build one result row for ``dataset`` and classifier ``kind``.

    The row holds the dataset name under ``'dataset'``, the accuracy on the
    original data under ``'original'``, followed by one entry per
    transformation variant found under ``rootPath``.
    """
    original_path = os.path.join(cfg.ORG_DATA_PATH, dataset)
    row = {
        'dataset': dataset,
        'original': evaluate_model(cfg, kind, original_path),
    }
    for variant, score in evaluate_transformation(cfg, rootPath, kind, dataset).items():
        row[variant] = score
    return row

In [9]:
# For every classifier and every transformation family, evaluate all datasets
# and write one CSV per (family, classifier) pair.
print("\nComputing results on transformations...\n")
for kind in clfs:
    for traPaths in transformations:
        # Each entry is (transformed-data root, result directory).
        traRoot, rltRoot = traPaths
        traKind = traRoot.split(os.path.sep)[-1]

        # One row per dataset: original accuracy plus one value per variant.
        resultList = []
        for dataset in datasets:
            curResult = get_result_as_dict(cfg, kind, dataset, traRoot)
            resultList.append(curResult)

        resultPath = os.path.join(rltRoot, kind + ".csv")
        fix_parent_path(resultPath)

        # Put the identifying columns first, keep the rest in their order.
        colOrder = ['dataset', 'original']
        df = pd.DataFrame(resultList)
        df = df[colOrder + [c for c in df.columns if c not in colOrder]]
        df.to_csv(resultPath, index=False)

        print("[{}] Completed {} for {}".format(getCurrentTime(), kind, traKind))


Computing results on transformations...

[2020-04-20 14:17:15.644984] Completed LR for rows_permutation
[2020-04-20 14:17:16.355026] Completed LR for cols_permutation
[2020-04-20 14:17:16.978836] Completed LR for unbalanced
[2020-04-20 14:17:17.777895] Completed LR for nonequivalent
[2020-04-20 14:17:19.455908] Completed SVC for rows_permutation
[2020-04-20 14:17:20.643970] Completed SVC for cols_permutation
[2020-04-20 14:17:21.748970] Completed SVC for unbalanced
[2020-04-20 14:17:23.197720] Completed SVC for nonequivalent
[2020-04-20 14:17:23.756968] Completed SGD for rows_permutation
[2020-04-20 14:17:24.126610] Completed SGD for cols_permutation
[2020-04-20 14:17:24.497007] Completed SGD for unbalanced
[2020-04-20 14:17:25.146422] Completed SGD for nonequivalent
[2020-04-20 14:17:25.678378] Completed PERCEPTRON for rows_permutation
[2020-04-20 14:17:26.016519] Completed PERCEPTRON for cols_permutation
[2020-04-20 14:17:26.349987] Completed PERCEPTRON for unbalanced
[2020-04-20 14