In [1]:
import xgboost
import warnings
import itertools

import numpy as np
import pandas as pd

from datetime import datetime

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from costcla.metrics import cost_loss, savings_score
from costcla.models import BayesMinimumRiskClassifier, ThresholdingOptimization
from costcla.models import CostSensitiveDecisionTreeClassifier, CostSensitiveLogisticRegression
from costcla.models import CostSensitiveRandomForestClassifier, CostSensitiveBaggingClassifier, CostSensitivePastingClassifier, CostSensitiveRandomPatchesClassifier

RANDOM_STATE = 42
N_JOBS = -1

warnings.filterwarnings('ignore')
np.random.seed(RANDOM_STATE)



In [2]:
def create_cost_matrix(df, fp_cost, fn_cost, tp_cost, tn_cost):
    # false positives, false negatives, true positives, true negatives
    def generate_cost(df, cost):
        return df[cost] if type(cost) == str else cost
    
    cost_matrix = np.zeros((df.shape[0], 4))
    
    cost_matrix[:, 0] = generate_cost(df, fp_cost)
    cost_matrix[:, 1] = generate_cost(df, fn_cost)
    cost_matrix[:, 2] = generate_cost(df, tp_cost)
    cost_matrix[:, 3] = generate_cost(df, tn_cost)
    
    return cost_matrix


def generate_rf_models():

    max_depth = [None, 1, 2, 3, 4, 5]
    n_estimatiors = [10, 50, 100, 200, 500]

    rf_models = {
        f'CI-GS_RandomForest-n_est_{n_est}_md_{md}': RandomForestClassifier(
            random_state=RANDOM_STATE,
            n_estimators=n_est,
            max_depth=md,
            n_jobs=N_JOBS
        )
        for n_est, md in itertools.product(n_estimatiors, max_depth)
    }
    return rf_models


def generate_xgb_models():
    
    max_depth = [0, 1, 2, 3, 4, 5]
    subsample = [0.5, 0.75, 1]
    colsample_bytree = [0.5, 0.75, 1]

    xgb_models = {
        f'CI-GS_XGBoost-md_{md}_subs_{subs}_cs_bt_{cs_bt}': xgboost.XGBClassifier(
            random_state=RANDOM_STATE,
            verbosity=0,
            max_depth=md,
            subsample=subs,
            colsample_bytree=cs_bt,
            n_jobs=N_JOBS
        )
        for md, subs, cs_bt in itertools.product(max_depth, subsample, colsample_bytree)
    }
    return xgb_models


def generate_cost_sensitive_ensemble(model, name):
    
    combinations = ['majority_voting', 'weighted_voting', 'stacking', 'stacking_proba', 
                    'stacking_bmr', 'stacking_proba_bmr', 'majority_bmr', 'weighted_bmr']
    n_estimatiors = [10, 20, 30]
    
    cs_ensemble_models = {
        f'ECSDT-GS_{name}_{combination}-n_est_{n_est}': model(
            n_estimators=n_est,
            combination=combination,
            n_jobs=N_JOBS
        )
        for n_est, combination in itertools.product(n_estimatiors, combinations)
    }
    return cs_ensemble_models
    


def generate_models():
    
    csrfc = generate_cost_sensitive_ensemble(CostSensitiveRandomForestClassifier, 
                                             'CostSensitiveRandomForestClassifier')
    csbc = generate_cost_sensitive_ensemble(CostSensitiveBaggingClassifier,
                                           'CostSensitiveBaggingClassifier')
    cspc = generate_cost_sensitive_ensemble(CostSensitivePastingClassifier,
                                           'CostSensitivePastingClassifier')
    csrpc = generate_cost_sensitive_ensemble(CostSensitiveRandomPatchesClassifier,
                                            'CostSensitiveRandomPatchesClassifier')

    gs_rf_models = generate_rf_models()
    gs_xgb_models = generate_xgb_models()
    
    models = {
        'CI-LogisticRegression': LogisticRegression(), 
        'CI-DecisionTree': DecisionTreeClassifier(random_state=RANDOM_STATE), 
        'CI-RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
        'CI-XGBoost': xgboost.XGBClassifier(random_state=RANDOM_STATE,verbosity=0),
        'CST-CostSensitiveLogisticRegression': CostSensitiveLogisticRegression(),
        'CST-CostSensitiveDecisionTreeClassifier': CostSensitiveDecisionTreeClassifier()
    }
    #models.update(csrfc)
    #models.update(csbc)
    #models.update(cspc)
    #models.update(csrpc)
    #models.update(gs_rf_models)
    #models.update(gs_xgb_models)
    
    return models


def create_model_summary(model, name, X, y, cost_matrix):
    print(name)
    if name.startswith('BMR'):
        # BMR Model
        model, bmr = model
        y_hat_proba = model.predict_proba(X)
        y_hat = bmr.predict(y_hat_proba, cost_matrix)
    elif name.startswith('TO'):
        # Threshold Optimized Model
        model, threshold_opt = model
        y_hat_proba = model.predict_proba(X)
        y_hat = threshold_opt.predict(y_hat_proba)
    elif name.startswith('ECSDT'):
        y_hat = model.predict(X, cost_matrix)
    else:
        y_hat = model.predict(X)
        
        
    return {
        'Name': name,
        'Accuracy': accuracy_score(y, y_hat),
        'Precision': precision_score(y, y_hat),
        'Recall': recall_score(y, y_hat),
        'F1': f1_score(y, y_hat),
        'Cost': cost_loss(y, y_hat, cost_matrix),
        'Savings': savings_score(y, y_hat, cost_matrix)
    }


def create_bmr_model(model, name, X_val, y_val, calibration = True):
    
    y_hat_val_proba = model.predict_proba(X_val)

    bmr = BayesMinimumRiskClassifier(calibration = calibration)
    bmr.fit(y_val, y_hat_val_proba)
    
    prefix = 'BMR' + '_calibration_' if calibration else 'BMR_'
    name = prefix + name
    
    return (name, (model, bmr))


def create_threshold_optimized_model(model, name, X_train, y_train, cost_matrix_train, calibration = True):
        
    y_hat_train_proba = model.predict_proba(X_train)

    threshold_opt = ThresholdingOptimization(calibration = calibration)
    threshold_opt.fit(y_hat_train_proba, cost_matrix_train, y_train)
    
    prefix = 'TO' + '_calibration_' if calibration else 'TO_'
    name = prefix + name
    
    return (name, (model, threshold_opt))

In [3]:
OPERATIONAL_COST = 2.5

In [4]:
df = pd.read_csv('data/creditcard.csv')
cost_matrix = create_cost_matrix(df, OPERATIONAL_COST, 'Amount', OPERATIONAL_COST, 0)

X = df.drop(['Time', 'Amount', 'Class'], axis = 1)
y = df['Class']

In [5]:
X_train, X_test, y_train, y_test, cost_matrix_train, cost_matrix_test = train_test_split(X, y, cost_matrix, train_size = 0.5, stratify = y, random_state = RANDOM_STATE)
X_val, X_test, y_val, y_test, cost_matrix_val, cost_matrix_test = train_test_split(X_test, y_test, cost_matrix_test, train_size = 0.33, stratify = y_test, random_state = RANDOM_STATE)

In [6]:
# Downsampling
"""
X_train = pd.concat([X_train.reset_index(), pd.DataFrame(cost_matrix_train)], axis = 1).set_index('index')
X_train['Class'] = y_train

X_train = pd.concat([
    X_train[X_train['Class'] == 0].sample(frac = 0.05, random_state=RANDOM_STATE),
    X_train[X_train['Class'] == 1]
])

y_train = X_train['Class']
cost_matrix_train = X_train[[0, 1, 2, 3]].values
X_train = X_train.drop(['Class', 0, 1, 2, 3], axis = 1)
"""
pass

In [7]:
models = generate_models()

In [8]:
def filter_model_names(models, value):
    return [name for name in models.keys() if value in name]
    

standard_model_names = filter_model_names(models, 'GS_RandomForest') + ['CI-LogisticRegression', 'CI-DecisionTree', 'CI-RandomForest']
cost_sensitive_model_names = filter_model_names(models, 'CST') + filter_model_names(models, 'ECSDT')
xgb_model_names = filter_model_names(models, 'XGBoost')
calibration_model_names = standard_model_names + xgb_model_names

# Standard model training

for name in standard_model_names:
    print(name)
    models[name].fit(X_train.values, y_train.values)

    
for name in xgb_model_names:
    print(name)
    models[name].fit(
        X_train.values, y_train.values, 
        eval_set = [(X_val.values, y_val.values), (X_train.values, y_train.values)],
        eval_metric = 'aucpr',
        early_stopping_rounds = 50,
        verbose = False
    )       
    
    
for name in cost_sensitive_model_names:
    print(name)
    models[name].fit(X_train.values, y_train.values, cost_matrix_train)
    

CI-LogisticRegression
CI-DecisionTree
CI-RandomForest
CI-XGBoost
CST-CostSensitiveLogisticRegression
CST-CostSensitiveDecisionTreeClassifier


In [9]:
    # Threshold Optimization training

    for name in calibration_model_names:
        for calibration in [True]:
            print(name)
            model = models[name]
            name_threshold_opt, model_threshold_opt = create_threshold_optimized_model(model, name, X_train.values, y_train.values, cost_matrix_train, calibration = calibration)
            models[name_threshold_opt] = model_threshold_opt

CI-LogisticRegression
CI-LogisticRegression


KeyboardInterrupt: 

In [10]:
# BMR training

for name in calibration_model_names:
    for calibration in [True, False]:
        print(name)
        model = models[name]
        name_bmr, model_bmr = create_bmr_model(model, name, X_val.values, y_val.values, calibration = calibration)
        models[name_bmr] = model_bmr

CI-LogisticRegression
CI-LogisticRegression
CI-DecisionTree
CI-DecisionTree
CI-RandomForest
CI-RandomForest
CI-XGBoost
CI-XGBoost


In [11]:
filepath = 'outputs/' + 'Training-results-' + datetime.now().isoformat('-', timespec = 'minutes') + '.csv'


temp = []
for name, model in models.items():
    try:
        temp.append(create_model_summary(model, name, X_test.values, y_test.values, cost_matrix_test))
    except:
        pass
results = pd.DataFrame(temp)


#results = pd.DataFrame([create_model_summary(model, name, X_test.values, y_test.values, cost_matrix_test) for name, model in models.items()])
#results.to_csv(filepath, index=False)
results

CI-LogisticRegression
CI-DecisionTree
CI-RandomForest
CI-XGBoost
CST-CostSensitiveLogisticRegression
CST-CostSensitiveDecisionTreeClassifier
TO_calibration_CI-LogisticRegression
BMR_calibration_CI-LogisticRegression
BMR_CI-LogisticRegression
BMR_calibration_CI-DecisionTree
BMR_CI-DecisionTree
BMR_calibration_CI-RandomForest
BMR_CI-RandomForest
BMR_calibration_CI-XGBoost
BMR_CI-XGBoost


Unnamed: 0,Accuracy,Cost,F1,Name,Precision,Recall,Savings
0,0.999182,6169.57,0.734694,CI-LogisticRegression,0.837209,0.654545,0.594864
1,0.999151,5708.01,0.750769,CI-DecisionTree,0.7625,0.739394,0.625174
2,0.999465,5017.6,0.827119,CI-RandomForest,0.938462,0.739394,0.670511
3,0.999403,6621.24,0.809365,CI-XGBoost,0.902985,0.733333,0.565205
4,0.337204,163631.29,0.003121,CST-CostSensitiveLogisticRegression,0.001565,0.6,-9.745133
5,0.998952,4859.59,0.640288,CST-CostSensitiveDecisionTreeClassifier,0.787611,0.539394,0.680887
6,0.999141,6077.9,0.738854,TO_calibration_CI-LogisticRegression,0.778523,0.70303,0.600884
7,0.997432,4345.56,0.366925,BMR_calibration_CI-LogisticRegression,0.31982,0.430303,0.714641
8,0.996908,4499.39,0.312354,BMR_CI-LogisticRegression,0.253788,0.406061,0.70454
9,0.998669,5585.96,0.501961,BMR_calibration_CI-DecisionTree,0.711111,0.387879,0.633188


In [12]:
results.sort_values('Savings')

Unnamed: 0,Accuracy,Cost,F1,Name,Precision,Recall,Savings
4,0.337204,163631.29,0.003121,CST-CostSensitiveLogisticRegression,0.001565,0.6,-9.745133
3,0.999403,6621.24,0.809365,CI-XGBoost,0.902985,0.733333,0.565205
0,0.999182,6169.57,0.734694,CI-LogisticRegression,0.837209,0.654545,0.594864
6,0.999141,6077.9,0.738854,TO_calibration_CI-LogisticRegression,0.778523,0.70303,0.600884
1,0.999151,5708.01,0.750769,CI-DecisionTree,0.7625,0.739394,0.625174
9,0.998669,5585.96,0.501961,BMR_calibration_CI-DecisionTree,0.711111,0.387879,0.633188
10,0.998742,5568.46,0.516129,BMR_CI-DecisionTree,0.771084,0.387879,0.634337
2,0.999465,5017.6,0.827119,CI-RandomForest,0.938462,0.739394,0.670511
5,0.998952,4859.59,0.640288,CST-CostSensitiveDecisionTreeClassifier,0.787611,0.539394,0.680887
8,0.996908,4499.39,0.312354,BMR_CI-LogisticRegression,0.253788,0.406061,0.70454


### TODO:
- Second jupyter notebook with results analysis
- Cross Validation (?)
- Rewrite this notebook to script?
- Make whole experiment with respect to differenct Operational Cost