In [3]:
import os
import glob
import copy
import shutil
import numpy as np
import pandas as pd

import importlib
import GANS
import xgboost as xgb
# Reload BEFORE the star-import: `from GANS import *` copies the module's
# current attributes into this namespace, so reloading afterwards would leave
# every star-imported name bound to the stale, pre-reload objects.
importlib.reload(GANS)
from GANS import *

from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, roc_auc_score

In [6]:
def dataWrangling(fname, dropCols, catFea, label):
    """Load a CSV, drop unwanted columns and one-hot encode categorical ones.

    Parameters
    ----------
    fname : str
        Path of the CSV file to read.
    dropCols : list of str
        Columns removed before any encoding takes place. They are dropped in
        both modes (explicit ``catFea`` and automatic detection).
    catFea : list of str
        Columns to treat as categorical. Each value is converted with ``str``
        before encoding, so a missing value becomes its own ``nan`` category.
        When empty (or None), every ``object``-dtype column that is left
        after dropping is encoded instead.
    label : str
        Name of the target column; it is excluded from the returned feature
        list but kept in the returned frame.

    Returns
    -------
    (list of str, pandas.DataFrame)
        The feature column names (every column except ``label``) and a copy
        of the processed frame. Dummy columns are float-typed and appended
        after the untouched columns.

    Raises
    ------
    ValueError
        If a name in ``catFea`` is not a column of the file.
    KeyError
        If a name in ``dropCols`` is not a column of the file.
    """
    df = pd.read_csv(fname)

    dropCols = list(dropCols) if dropCols else []
    requested = list(catFea) if catFea else []

    missing = [col for col in requested if col not in df.columns]
    if missing:
        raise ValueError(
            'catFea columns not found in %s: %s' % (fname, missing))

    if dropCols:
        df = df.drop(dropCols, axis=1)

    if requested:
        # Walk the frame (not the argument) so the encoded columns and their
        # prefixes are guaranteed to line up, whatever order catFea was
        # written in. Columns that were just dropped are skipped.
        wanted = set(requested)
        catCols = [col for col in df.columns if col in wanted]

        if catCols:
            cats = df[catCols].copy()
            for col in catCols:
                cats[col] = cats[col].apply(str)

            encoded = pd.get_dummies(
                cats, columns=catCols, prefix=catCols, dtype=float)
            df = pd.concat([df.drop(catCols, axis=1), encoded], axis=1)

    else:
        # Detect text columns only AFTER dropping, otherwise a dropped
        # object column would still be counted as something to encode.
        catCols = df.select_dtypes(include=['object']).columns.tolist()

        if catCols:
            df = pd.get_dummies(
                df, columns=catCols, prefix=catCols, dtype=float)

    indepCols = [col for col in df.columns if col != label]
    return list(indepCols), df.copy()

def meanNormalization(df, indepCols, mndenom):
    """Centre the feature columns on their mean and scale them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features. The ``indepCols`` columns are overwritten
        in place; pass a copy if the caller's frame must stay untouched.
    indepCols : list of str
        Columns to normalise; every other column is left as is.
    mndenom : {'range', 'std'}
        Scale used as the denominator: ``max - min`` or the standard
        deviation of each column.

    Returns
    -------
    pandas.DataFrame
        A copy of the normalised frame.

    Raises
    ------
    ValueError
        If ``mndenom`` is not one of the supported names.
    """
    features = df[indepCols]

    if mndenom == 'range':
        denom = features.max() - features.min()
    elif mndenom == 'std':
        denom = features.std()
    else:
        raise ValueError(
            "mndenom must be 'range' or 'std', got %r" % (mndenom,))

    # A constant column has zero spread; dividing would turn it into 0/0 = NaN.
    # Dividing by 1 instead leaves the (already centred) column at 0.
    denom = denom.replace(0, 1)

    df[indepCols] = (features - features.mean()) / denom
    return df.copy()

def dataIngestPreProp(folderName, label, catFea, dropCols, mntype, mndenom):
    """Build (or reload) the normalised train/test split for one dataset.

    The raw CSV is looked up in ``<cwd>/data/<folderName>/`` and the results
    are cached in its ``processed/`` sub-folder as
    ``<folderName> (<mntype> <mndenom>) train.csv`` / ``... test.csv``.
    When the cached train file already exists it is returned directly and
    nothing is recomputed.

    Parameters
    ----------
    folderName : str
        Dataset folder name under ``data/``; also used in the output names.
    label : str
        Target column, used for stratified splitting.
    catFea, dropCols : list of str
        Forwarded to ``dataWrangling``.
    mntype : {'all', 'split'}
        ``'all'`` normalises the whole frame and then splits 70/30;
        ``'split'`` splits 80/20 first and normalises each part on its own.
    mndenom : str
        Forwarded to ``meanNormalization``.

    Returns
    -------
    (str, pandas.DataFrame)
        The output file-name prefix and the training frame.

    Raises
    ------
    ValueError
        If ``mntype`` is not supported (only checked when data must be built).
    FileNotFoundError
        If the dataset folder contains no CSV file.
    """
    print('Ingesting data within memory, and performing pre-processing')

    DATA_DIR = '%s/data/%s/'%(os.getcwd(), folderName)
    PROCESSED_DATA_DIR = '%sprocessed/'%DATA_DIR
    os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

    outFname = '%s%s (%s %s)'%(PROCESSED_DATA_DIR, folderName, mntype, mndenom)
    trainFname = '%s train.csv'%outFname

    if os.path.isfile(trainFname):
        return outFname, pd.read_csv(trainFname)

    # Fail fast, before the expensive load, instead of hitting an undefined
    # `train` further down.
    if mntype not in ('all', 'split'):
        raise ValueError(
            "mntype must be 'all' or 'split', got %r" % (mntype,))

    # Sorted so the chosen file does not depend on directory listing order.
    candidates = sorted(glob.glob('%s/*.csv'%DATA_DIR))
    if not candidates:
        raise FileNotFoundError('no CSV file found in %s' % DATA_DIR)

    indepCols, df = dataWrangling(candidates[0], dropCols, catFea, label)

    if mntype == 'all':

        df = meanNormalization(df.copy(), list(indepCols), mndenom)

        train, test = train_test_split(df, test_size=0.3, random_state=0,
            stratify = df[label])

    else:

        # NOTE(review): this branch holds out 20% while 'all' holds out 30%,
        # and the test part is scaled with its own statistics rather than
        # the training ones -- confirm both are intended for the comparison.
        train, test = train_test_split(df, test_size=0.2, random_state=0,
            stratify = df[label])

        train = meanNormalization(train.copy(), list(indepCols), mndenom)
        test = meanNormalization(test.copy(), list(indepCols), mndenom)

    train.to_csv(trainFname, index = False)
    test.to_csv('%s test.csv'%outFname, index = False)
    del test

    return outFname, train.copy()

def trainGANS(train, label, rand_dim, base_n_count, epochs,
    batch_size, learning_rate, d_pre_train_steps, model,
    folderName):
    """Train the requested GAN variant unless its final weights are cached.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows, including the ``label`` column.
    label : str
        Name of the label column.
    rand_dim, base_n_count, epochs, batch_size, learning_rate,
    d_pre_train_steps
        Hyper-parameters forwarded unchanged to the training routine.
    model : {'gan', 'cgan', 'wgan', 'wcgan'}
        Variant to train. The conditional variants ('cgan', 'wcgan') receive
        the label column, the others only the feature columns.
    folderName : str
        ``'<dataset>/<run name>/'``; the first path component names the cache
        folder under ``<cwd>/cache/`` and the second prefixes the weight file.

    Returns
    -------
    (list of str, list of str, str)
        Feature column names, ``[label]`` and the path of the generator
        weights written at step ``epochs - 1``.

    Raises
    ------
    ValueError
        If training is required and ``model`` is not a supported variant.
    """
    print('Training GANS')

    label_cols = [label]
    data_cols = [col for col in train.columns if col != label_cols[0]]

    folderParts = folderName.split('/')
    MODEL_CACHE_DIR = '%s/cache/%s/'%(os.getcwd(), folderParts[0])

    modelPath = '%s%s_generator_model_weights_step_%d.h5'%(
        MODEL_CACHE_DIR, folderParts[1], (epochs - 1))

    if not os.path.exists(modelPath):

        # Reject unknown variants up front: silently training nothing would
        # hand back a path to weights that were never written.
        if model not in ('gan', 'cgan', 'wgan', 'wcgan'):
            raise ValueError(
                "model must be one of 'gan', 'cgan', 'wgan', 'wcgan'; got %r"
                % (model,))

        k_d = 1  # number of critic network updates per adversarial training step
        k_g = 1  # number of generator network updates per adversarial training step

        os.makedirs(MODEL_CACHE_DIR, exist_ok=True)

        train_no_label = train[data_cols]

        # Positional argument list consumed by the GANS training routines;
        # the meaning of each slot (including the literal 1000 and the
        # None/True placeholders) is defined by that module.
        arguments = [rand_dim, epochs, batch_size,  k_d, k_g, d_pre_train_steps,
            1000, learning_rate, base_n_count, MODEL_CACHE_DIR, None, None, None, True, folderName]

        if model == 'gan':
            adversarial_training_GAN(arguments, train_no_label, data_cols)

        elif model == 'cgan':
            adversarial_training_GAN(arguments, train, data_cols=data_cols,
                label_cols=label_cols)

        elif model == 'wgan':
            adversarial_training_WGAN(arguments, train_no_label, data_cols=data_cols)

        else:  # 'wcgan'
            adversarial_training_WGAN(arguments, train, data_cols=data_cols,
                label_cols=label_cols)

    return data_cols, label_cols, modelPath

def findSynthesizedData(rand_dim, data_cols, label_cols,
    base_n_count, model_name, train, outFname, modelPath):
    """Top up the minority class with generated rows until classes balance.

    The balanced set is cached as ``<outFname> <model_name>-balanced.csv``;
    when that file already exists it is read back and nothing is generated.

    Parameters
    ----------
    rand_dim : int
        Width of the noise vector fed to the generator.
    data_cols, label_cols : list of str
        Feature and label column names; together they name the columns of
        the generator output.
    base_n_count : int
        Forwarded to ``define_models_CGAN``.
    model_name : str
        Used only in the cache file name.
    train : pandas.DataFrame
        Training set whose label column holds the classes 0 and 1.
    outFname : str
        File-name prefix of the processed dataset.
    modelPath : str
        Generator weight file to load.

    Returns
    -------
    pandas.DataFrame
        ``train`` followed by ``count(0) - count(1)`` generated rows. If
        class 1 is not the smaller class, ``train`` is returned unchanged.
    """
    print('Producing generated examples for the minority class')
    balancedFname = '%s %s-balanced.csv'%(outFname, model_name)

    if os.path.isfile(balancedFname):
        return pd.read_csv(balancedFname).copy()

    # .get() keeps a class that is absent from `train` from raising KeyError.
    label_counts = train[label_cols[0]].value_counts()
    gen_samples = int(label_counts.get(0, 0) - label_counts.get(1, 0))

    if gen_samples <= 0:
        # Nothing to generate: class 1 already matches or exceeds class 0.
        trainBalanced = train.copy()

    else:
        data_dim = len(data_cols)
        label_dim = len(label_cols)

        # NOTE(review): the conditional architecture is built regardless of
        # which variant produced the weights -- confirm that unconditional
        # 'gan'/'wgan' weights are never routed here.
        generator_model, discriminator_model, combined_model = define_models_CGAN(
            rand_dim, data_dim, label_dim, base_n_count)

        generator_model.load_weights(modelPath)
        z = np.random.normal(size=(gen_samples, rand_dim))
        labels = np.ones((gen_samples, 1))  # condition every sample on class 1
        g_z = generator_model.predict([z, labels])
        dfTrueGen = pd.DataFrame(g_z, columns = (data_cols + label_cols))

        # DataFrame.append was removed in pandas 2.0; concat aligns on column
        # names in the same way and works on every pandas version.
        trainBalanced = pd.concat([train, dfTrueGen], sort=False)

    trainBalanced.to_csv(balancedFname, index = False)

    return trainBalanced.copy()

def recall(preds, dtrain):
    """Evaluation callback returning ``('recall', score)``.

    ``preds`` are rounded with ``np.round`` to obtain hard predictions, which
    are scored against the labels stored in ``dtrain``.
    """
    y_true = dtrain.get_label()
    y_hat = np.round(preds)
    score = recall_score(y_true, y_hat)
    return 'recall', score

def precision(preds, dtrain):
    """Evaluation callback returning ``('precision', score)``.

    ``preds`` are rounded with ``np.round`` to obtain hard predictions, which
    are scored against the labels stored in ``dtrain``.
    """
    y_true = dtrain.get_label()
    y_hat = np.round(preds)
    score = precision_score(y_true, y_hat)
    return 'precision', score

def roc_auc(preds, dtrain):
    """Evaluation callback returning ``('roc_auc', score)``.

    The raw (un-rounded) ``preds`` are scored against the labels stored in
    ``dtrain``.
    """
    y_true = dtrain.get_label()
    score = roc_auc_score(y_true, preds)
    return 'roc_auc', score

def BaseMetrics(y_pred,y_true):
    """Count the confusion-matrix cells for binary 0/1 predictions.

    Parameters
    ----------
    y_pred, y_true : array-like
        Predicted and actual labels, compared position by position. Any
        sequence is accepted; it is converted to a NumPy array first so that
        plain lists are compared element-wise instead of as a whole.

    Returns
    -------
    (TP, TN, FP, FN)
        Counts of true positives, true negatives, false positives and
        false negatives. Entries that are neither 0 nor 1 are not counted.
    """
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)

    pred_pos, pred_neg = (y_pred == 1), (y_pred == 0)
    true_pos, true_neg = (y_true == 1), (y_true == 0)

    TP = np.sum(pred_pos & true_pos)
    TN = np.sum(pred_neg & true_neg)
    FP = np.sum(pred_pos & true_neg)
    FN = np.sum(pred_neg & true_pos)

    return TP, TN, FP, FN

def applyModel(train_df, folderName, mntype, mndenom, model_name, outFname, y_col):
    """Fit XGBoost on ``train_df`` and append its test scores to a report.

    An empty ``train_df`` selects the baseline run: the model name becomes
    ``'N/A'`` and the un-augmented ``<outFname> train.csv`` is used instead.
    The test set is always read from ``<outFname> test.csv``. Results are
    appended to ``<folderName> performance.txt`` in the working directory.
    """
    if len(train_df) == 0:
        model_name = 'N/A'
        train_df = pd.read_csv('%s train.csv'%outFname)

    title = 'GAN ALGORITHM: %s | MEAN NORM TYPE: %s | MEAN NORM DENOM: %s'%(
        model_name.title(), mntype.title(), mndenom.title())
    print('Applying XGBoost to perform performance audit of %s'%title)

    test_df = pd.read_csv('%s test.csv'%outFname)

    # Every test column except the target is a feature.
    feature_names = test_df.columns.tolist()
    feature_names.remove(y_col)

    dtrain = xgb.DMatrix(
        train_df[feature_names], train_df[y_col], feature_names=feature_names)
    dtest = xgb.DMatrix(
        test_df[feature_names], test_df[y_col], feature_names=feature_names)

    booster_params = {
        'objective': 'binary:logistic',
        'random_state': 0,
        'eval_metric': 'auc'
    }
    watchlist = [(dtrain,'train'),(dtest,'test')]

    booster = xgb.train(booster_params, dtrain, num_boost_round=100,
                        verbose_eval=False,
                        early_stopping_rounds=20,
                        evals=watchlist,
                        evals_result = {},
                        feval = recall, maximize=True)

    # Predict with the trees up to and including the best iteration.
    y_pred = booster.predict(dtest, ntree_limit=booster.best_iteration+1)
    y_true = test_df[y_col].values

    TP, TN, FP, FN = BaseMetrics(np.round(y_pred),y_true)
    ACC = ( TP + TN ) / ( TP + TN + FP + FN )

    # (template, value getter) pairs; the getters run one by one while the
    # file is open, in the order the lines are written.
    report = (
        ('\n%s', lambda: title),
        ("\n\nnbest iteration: %d", lambda: booster.best_iteration),
        ("\nrecall: %f", lambda: recall(y_pred, dtest)[1]),
        ("\nprecision: %f", lambda: precision(y_pred, dtest)[1]),
        ("\nroc auc: %f", lambda: roc_auc(y_pred, dtest)[1]),
        ("\nTP: %d, TN: %d, FP: %d, FN: %d", lambda: (TP, TN, FP, FN)),
        ("\naccuracy: %f\n\n\n\n", lambda: ACC),
    )

    with open("%s performance.txt"%folderName, "a") as myfile:
        for template, getValue in report:
            myfile.write(template%getValue())

def main(folderName = 'mortgage', label = 'TARGET', catFea = ['NAME_CONTRACT_TYPE',
    'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START',
    'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE',
    'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'],
    dropCols = [], rand_dim = 32, base_n_count = 128,
    epochs = 50001, batch_size = 64, learning_rate = 0.0001,
    d_pre_train_steps = 100, model = 'wcgan', mntype = 'split',
    mndenom = 'range', skip =False):
    """Run one experiment: preprocess, optionally oversample, then audit.

    With ``skip`` set, the GAN stage is bypassed and an empty training set is
    handed to ``applyModel``, which then scores the un-augmented data.
    Otherwise a GAN is trained on the rows whose label equals 1 and its
    output is used to balance the training set before scoring.
    """
    outFname, train = dataIngestPreProp(
        folderName, label, catFea, dropCols, mntype, mndenom)

    if not skip:
        # Only the minority class (label == 1) is shown to the GAN.
        minorityRows = train[train[label] == 1].copy()
        ganFolder = '%s/%s-%s-%s/'%(folderName, model, mntype, mndenom)

        data_cols, label_cols, modelPath = trainGANS(
            minorityRows, label, rand_dim, base_n_count, epochs, batch_size,
            learning_rate, d_pre_train_steps, model, ganFolder)

        trainBalanced = findSynthesizedData(rand_dim, data_cols, label_cols,
            base_n_count, model.upper(), train.copy(), outFname, modelPath)
    else:
        trainBalanced = []

    applyModel(trainBalanced.copy(), folderName, mntype, mndenom, model, outFname, label)

def iterateModels():
    """Run every normalisation setting twice: baseline first, then with GAN.

    For each (model, mntype, mndenom) combination ``main`` is called once
    with ``skip=True`` and once with the GAN stage enabled.
    """
    combos = [(model, mntype, mndenom)
              for model in ['wcgan']
              for mntype in ['all', 'split']
              for mndenom in ['range', 'std']]

    for model, mntype, mndenom in combos:
        for skipGan in (True, False):
            main(mntype = mntype, mndenom = mndenom, model = model, skip = skipGan)

In [1]:
# Bring every public name of the local `script` module into this cell.
# NOTE(review): presumably `script` is the .py export of the definitions
# above and is where `main` comes from -- confirm.
from script import *
# dataIngestPreProp()
# Run the pipeline once with main's default arguments.
main()