In [1]:
import os, glob, pathlib, shutil, random
from config import Config
import pandas as pd
import numpy as np
mRS = 42

In [2]:
def save_file(df, kind, filePath, fileName):
    df = df.reset_index(drop=True)
    newFilePath = os.path.join(filePath, kind, fileName)
    pathlib.Path(os.path.dirname(newFilePath)).mkdir(parents=True, exist_ok=True)
    df.to_csv(newFilePath, index=False)

In [3]:
def apply_unbalanced(cfg, df_org, fileName):
    
    def ub_window_size(df, filename):
        df = df.tail(int(len(df)/2))
        save_file(df, "window_size", cfg.UB_DATA_PATH, fileName)          
    ub_window_size(df_org.copy(), fileName)
    print("\tCompleted UbWs: window_size Unbalanced.")
    
    def ub_random_selection(df, fileName):
        cols = list(df.columns)
        randNcol = random.Random(mRS).randint(2, len(cols[:-1])-2)
        cols = random.Random(mRS).sample(cols[:-1], randNcol) + [cols[-1]]
        df = df[cols]
        save_file(df, "random_selection", cfg.UB_DATA_PATH, fileName)
    ub_random_selection(df_org.copy(), fileName)
    print("\tCompleted UbRs: Random Selection Unbalanced.")
    
    def ub_random_deletion(df, fileName):
        cols = list(df.columns)
        randNcol = random.Random(mRS).randint(2, len(cols[:-1])-2)
        cols = random.Random(mRS).sample(cols[:-1], randNcol)
        df = df.drop(cols, axis=1)
        save_file(df, "random_deletion", cfg.UB_DATA_PATH, fileName)
    ub_random_deletion(df_org.copy(), fileName)
    print("\tCompleted UbRd: Random Deleting Unbalanced.")
    
    def ub_remove_last_feature(df, fileName):
        cols = list(df.columns)
        lastfeature = cols[-2]
        df = df.drop(lastfeature, axis=1)
        save_file(df, "remove_last_feature", cfg.UB_DATA_PATH, fileName)          
    ub_remove_last_feature(df_org.copy(), fileName)
    print("\tCompleted UnBa: Removing Last Feature Unbalanced.")

In [4]:
cfg = Config()
fileNames = [os.path.basename(filePath) for filePath in glob.glob(os.path.join(cfg.ORG_DATA_PATH, "*.csv"))]
for fileName in fileNames:
    orgFilePath = os.path.join(cfg.ORG_DATA_PATH, fileName)
    df_org = pd.read_csv(orgFilePath, delimiter=",", index_col=False)
    print("Original Dataset={} [(rows, cols) = {}]".format(fileName, df_org.shape))
    print("Creating unbalanced dataset:")
    apply_unbalanced(cfg, df_org.copy(), fileName)
    print("\n")

Original Dataset=SE_Process.csv [(rows, cols) = (793, 85)]
Creating unbalanced dataset:
	Completed UbWs: window_size Unbalanced.
	Completed UbRs: Random Selection Unbalanced.
	Completed UbRd: Random Deleting Unbalanced.
	Completed UnBa: Removing Last Feature Unbalanced.


Original Dataset=Immuno_Therapy.csv [(rows, cols) = (90, 8)]
Creating unbalanced dataset:
	Completed UbWs: window_size Unbalanced.
	Completed UbRs: Random Selection Unbalanced.
	Completed UbRd: Random Deleting Unbalanced.
	Completed UnBa: Removing Last Feature Unbalanced.


Original Dataset=German_Credit.csv [(rows, cols) = (1000, 25)]
Creating unbalanced dataset:
	Completed UbWs: window_size Unbalanced.
	Completed UbRs: Random Selection Unbalanced.
	Completed UbRd: Random Deleting Unbalanced.
	Completed UnBa: Removing Last Feature Unbalanced.


Original Dataset=Lung_Cancer.csv [(rows, cols) = (27, 57)]
Creating unbalanced dataset:
	Completed UbWs: window_size Unbalanced.
	Completed UbRs: Random Selection Unbalanced.
