In [1]:
import xgboost
import warnings

import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from costcla.metrics import cost_loss, savings_score
from costcla.models import BayesMinimumRiskClassifier, ThresholdingOptimization
from costcla.models import CostSensitiveDecisionTreeClassifier, CostSensitiveLogisticRegression
from costcla.models import CostSensitiveRandomForestClassifier, CostSensitiveBaggingClassifier, CostSensitivePastingClassifier, CostSensitiveRandomPatchesClassifier

# Single seed for the notebook: seeds numpy's global generator here and is
# passed as random_state to the splits and seeded estimators further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')



In [31]:
def create_cost_matrix(df, fp_cost, fn_cost, tp_cost, tn_cost):
    """Build an example-dependent cost matrix with one row per row of ``df``.

    The four columns are, in order: false positive, false negative,
    true positive and true negative cost.

    Parameters
    ----------
    df : pandas.DataFrame
        Source of per-example costs; only ``df.shape[0]`` and the columns
        named by string costs are read.
    fp_cost, fn_cost, tp_cost, tn_cost : str or scalar
        A string is looked up as a column of ``df`` (one cost per example);
        anything else is broadcast to every row as a constant cost.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(df), 4)``.
    """
    def resolve_cost(cost):
        # isinstance (not an exact type comparison) so that str subclasses,
        # e.g. numpy.str_ column labels, are still treated as column names.
        return df[cost] if isinstance(cost, str) else cost

    cost_matrix = np.zeros((df.shape[0], 4))

    # Column order: false positives, false negatives, true positives, true negatives.
    for column, cost in enumerate((fp_cost, fn_cost, tp_cost, tn_cost)):
        cost_matrix[:, column] = resolve_cost(cost)

    return cost_matrix


def create_model_summary(model, name, X, y, cost_matrix):
    """Predict with ``model`` on ``X`` and return its metrics as a dict.

    How predictions are obtained is decided from ``name``:

    * contains ``'-bmr'``: ``model`` is a ``(base_model, bmr)`` pair; the base
      model's probabilities are passed to ``bmr.predict`` with the cost matrix.
    * contains ``'-threshold-opt'``: ``model`` is a ``(base_model, threshold_opt)``
      pair; the base model's probabilities are passed to ``threshold_opt.predict``.
    * contains ``'CostSensitive'``: ``model.predict(X, cost_matrix)``.
    * otherwise: ``model.predict(X)``.

    Returns a dict with keys Name, Accuracy, Precision, Recall, F1, Cost, Savings.
    """
    uses_bmr = '-bmr' in name
    uses_threshold = (not uses_bmr) and ('-threshold-opt' in name)

    if uses_bmr or uses_threshold:
        # Wrapped models are stored as (base estimator, post-processing step).
        base_model, post_processor = model
        probabilities = base_model.predict_proba(X)
        if uses_bmr:
            predictions = post_processor.predict(probabilities, cost_matrix)
        else:
            predictions = post_processor.predict(probabilities)
    elif 'CostSensitive' in name:
        predictions = model.predict(X, cost_matrix)
    else:
        predictions = model.predict(X)

    summary = {'Name': name}
    summary['Accuracy'] = accuracy_score(y, predictions)
    summary['Precision'] = precision_score(y, predictions)
    summary['Recall'] = recall_score(y, predictions)
    summary['F1'] = f1_score(y, predictions)
    summary['Cost'] = cost_loss(y, predictions, cost_matrix)
    summary['Savings'] = savings_score(y, predictions, cost_matrix)
    return summary


def create_bmr_model(model, name, X_val, y_val, calibration = True):
    """Fit a BayesMinimumRiskClassifier on ``model``'s probabilities for ``X_val``.

    ``model`` must expose ``predict_proba``. Returns ``(new_name, (model, bmr))``
    where ``new_name`` is ``name`` plus ``'-bmr'``, followed by ``'-calibration'``
    when ``calibration`` is truthy.
    """
    val_probabilities = model.predict_proba(X_val)

    bmr = BayesMinimumRiskClassifier(calibration = calibration)
    bmr.fit(y_val, val_probabilities)

    suffix = '-bmr-calibration' if calibration else '-bmr'
    return (name + suffix, (model, bmr))


def create_threshold_optimized_model(model, name, X_train, y_train, cost_matrix_train, calibration = True):
    """Fit a ThresholdingOptimization step on ``model``'s probabilities for ``X_train``.

    ``model`` must expose ``predict_proba``. Returns
    ``(new_name, (model, threshold_opt))`` where ``new_name`` is ``name`` plus
    ``'-threshold-opt'``, followed by ``'-calibration'`` when ``calibration``
    is truthy.
    """
    train_probabilities = model.predict_proba(X_train)

    threshold_opt = ThresholdingOptimization(calibration = calibration)
    threshold_opt.fit(train_probabilities, cost_matrix_train, y_train)

    suffix = '-threshold-opt-calibration' if calibration else '-threshold-opt'
    return (name + suffix, (model, threshold_opt))

In [3]:
# Constant cost passed to create_cost_matrix below as both the false-positive
# and the true-positive cost.
OPERATIONAL_COST = 2.5

In [4]:
df = pd.read_csv('data/creditcard.csv')

# Per-example costs: a fixed operational cost for false/true positives,
# the 'Amount' column for false negatives, nothing for true negatives.
cost_matrix = create_cost_matrix(
    df,
    fp_cost = OPERATIONAL_COST,
    fn_cost = 'Amount',
    tp_cost = OPERATIONAL_COST,
    tn_cost = 0,
)

# Features are every column except 'Time', 'Amount' and the 'Class' target.
X = df.drop(columns = ['Time', 'Amount', 'Class'])
y = df['Class']

In [5]:
# First split: half of the rows go to training, stratified on the target;
# the cost matrix is split alongside X and y so rows stay aligned.
X_train, X_test, y_train, y_test, cost_matrix_train, cost_matrix_test = train_test_split(X, y, cost_matrix, train_size = 0.5, stratify = y, random_state = RANDOM_STATE)
# Second split: 33% of the held-out half becomes the validation set and the
# remainder overwrites X_test / y_test / cost_matrix_test as the final test set.
X_val, X_test, y_val, y_test, cost_matrix_val, cost_matrix_test = train_test_split(X_test, y_test, cost_matrix_test, train_size = 0.33, stratify = y_test, random_state = RANDOM_STATE)

In [6]:
# Combination strategies tried for each cost-sensitive ensemble.
combinations = ['majority_voting', 'weighted_voting', 'stacking', 'stacking_proba', 'stacking_bmr', 'stacking_proba_bmr', 'majority_bmr', 'weighted_bmr']

# One ensemble instance per (ensemble class, combination) pair, keyed '<ClassName>-<combination>'.
csrfc = {f'CostSensitiveRandomForestClassifier-{combination}': CostSensitiveRandomForestClassifier(combination = combination) for combination in combinations}
csbc = {f'CostSensitiveBaggingClassifier-{combination}': CostSensitiveBaggingClassifier(combination = combination) for combination in combinations}
cspc = {f'CostSensitivePastingClassifier-{combination}': CostSensitivePastingClassifier(combination = combination) for combination in combinations}
csrpc = {f'CostSensitiveRandomPatchesClassifier-{combination}': CostSensitiveRandomPatchesClassifier(combination = combination) for combination in combinations}

# Registry of every model in the experiment, keyed by display name.
# Keys of cost-sensitive models MUST contain 'CostSensitive': both the training
# loop and create_model_summary select the cost-matrix code path by that
# substring. The previous 'CS...' keys were therefore never fitted and were
# later called without a cost matrix.
models = {
    'LogisticRegression': LogisticRegression(), 
    'DecisionTree': DecisionTreeClassifier(random_state=RANDOM_STATE), 
    'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
    'XGBoost': xgboost.XGBClassifier(random_state=RANDOM_STATE,verbosity=0),
    'CostSensitiveLogisticRegression': CostSensitiveLogisticRegression(),
    'CostSensitiveDecisionTreeClassifier': CostSensitiveDecisionTreeClassifier()
}
for ensemble_models in (csrfc, csbc, cspc, csrpc):
    models.update(ensemble_models)

In [7]:
# Regular training

# Plain scikit-learn estimators: fitted on features and labels only.
for name in ('LogisticRegression', 'DecisionTree', 'RandomForest'):
    print(name)
    models[name].fit(X_train.values, y_train.values)


# XGBoost is fitted separately so it can monitor AUC on the validation split
# and stop once it has not improved for 50 rounds.
print('XGBoost')
models['XGBoost'].fit(
    X_train.values,
    y_train.values,
    eval_set = [(X_val.values, y_val.values)],
    eval_metric = 'auc',
    early_stopping_rounds = 50,
    verbose = False,
)


# Cost-sensitive estimators additionally receive the per-example cost matrix.
for name in [key for key in models if 'CostSensitive' in key]:
    print(name)
    models[name].fit(X_train.values, y_train.values, cost_matrix_train)
     

LogisticRegression
DecisionTree
RandomForest
XGBoost
[0]	validation_0-auc:0.919494
Will train until validation_0-auc hasn't improved in 50 rounds.
[1]	validation_0-auc:0.919529
[2]	validation_0-auc:0.919542
[3]	validation_0-auc:0.919548
[4]	validation_0-auc:0.919549
[5]	validation_0-auc:0.919549
[6]	validation_0-auc:0.919549
[7]	validation_0-auc:0.919542
[8]	validation_0-auc:0.919546
[9]	validation_0-auc:0.919549
[10]	validation_0-auc:0.91955
[11]	validation_0-auc:0.91956
[12]	validation_0-auc:0.919584
[13]	validation_0-auc:0.919582
[14]	validation_0-auc:0.919585
[15]	validation_0-auc:0.919585
[16]	validation_0-auc:0.919593
[17]	validation_0-auc:0.919593
[18]	validation_0-auc:0.919612
[19]	validation_0-auc:0.919614
[20]	validation_0-auc:0.919574
[21]	validation_0-auc:0.919564
[22]	validation_0-auc:0.919566
[23]	validation_0-auc:0.919566
[24]	validation_0-auc:0.919514
[25]	validation_0-auc:0.919513
[26]	validation_0-auc:0.919481
[27]	validation_0-auc:0.919437
[28]	validation_0-auc:0.919

In [None]:
# Hyperparameter search for regular models

# XGBoost
# Cost Sensitive Ensembles

In [9]:
# Threshold Optimization training
# Registers a '-threshold-opt' variant of every base model, with and without calibration.
# NOTE(review): the threshold is fitted on the same training split the base
# models were fitted on, while the BMR variants use the validation split —
# confirm this is intended.

for name in ('LogisticRegression', 'DecisionTree', 'RandomForest', 'XGBoost'):
    model = models[name]
    for calibration in (True, False):
        name_threshold_opt, model_threshold_opt = create_threshold_optimized_model(
            model,
            name,
            X_train.values,
            y_train.values,
            cost_matrix_train,
            calibration = calibration,
        )
        models[name_threshold_opt] = model_threshold_opt

In [14]:
# BMR training
# Registers a '-bmr' variant of every base model, with and without calibration,
# fitted on the validation split.

for name in ('LogisticRegression', 'DecisionTree', 'RandomForest', 'XGBoost'):
    model = models[name]
    for calibration in (True, False):
        name_bmr, model_bmr = create_bmr_model(
            model,
            name,
            X_val.values,
            y_val.values,
            calibration = calibration,
        )
        models[name_bmr] = model_bmr

In [17]:
# Show every registered model name: base, cost-sensitive, threshold-optimised and BMR variants.
models.keys()

dict_keys(['LogisticRegression', 'DecisionTree', 'RandomForest', 'XGBoost', 'CostSensitiveLogisticRegression', 'CostSensitiveDecisionTreeClassifier', 'CostSensitiveRandomForestClassifier-majority_voting', 'CostSensitiveRandomForestClassifier-weighted_voting', 'CostSensitiveRandomForestClassifier-stacking', 'CostSensitiveRandomForestClassifier-stacking_proba', 'CostSensitiveRandomForestClassifier-stacking_bmr', 'CostSensitiveRandomForestClassifier-stacking_proba_bmr', 'CostSensitiveRandomForestClassifier-majority_bmr', 'CostSensitiveRandomForestClassifier-weighted_bmr', 'CostSensitiveBaggingClassifier-majority_voting', 'CostSensitiveBaggingClassifier-weighted_voting', 'CostSensitiveBaggingClassifier-stacking', 'CostSensitiveBaggingClassifier-stacking_proba', 'CostSensitiveBaggingClassifier-stacking_bmr', 'CostSensitiveBaggingClassifier-stacking_proba_bmr', 'CostSensitiveBaggingClassifier-majority_bmr', 'CostSensitiveBaggingClassifier-weighted_bmr', 'CostSensitivePastingClassifier-majori

In [32]:
# Evaluate every registered model on the held-out test split, one summary row per model.
results = pd.DataFrame([
    create_model_summary(fitted, label, X_test.values, y_test.values, cost_matrix_test)
    for label, fitted in models.items()
])
results

LogisticRegression
DecisionTree
RandomForest
XGBoost
CostSensitiveLogisticRegression
CostSensitiveDecisionTreeClassifier
CostSensitiveRandomForestClassifier-majority_voting
CostSensitiveRandomForestClassifier-weighted_voting
CostSensitiveRandomForestClassifier-stacking
CostSensitiveRandomForestClassifier-stacking_proba
CostSensitiveRandomForestClassifier-stacking_bmr
CostSensitiveRandomForestClassifier-stacking_proba_bmr
CostSensitiveRandomForestClassifier-majority_bmr
CostSensitiveRandomForestClassifier-weighted_bmr
CostSensitiveBaggingClassifier-majority_voting
CostSensitiveBaggingClassifier-weighted_voting
CostSensitiveBaggingClassifier-stacking
CostSensitiveBaggingClassifier-stacking_proba
CostSensitiveBaggingClassifier-stacking_bmr
CostSensitiveBaggingClassifier-stacking_proba_bmr
CostSensitiveBaggingClassifier-majority_bmr
CostSensitiveBaggingClassifier-weighted_bmr
CostSensitivePastingClassifier-majority_voting
CostSensitivePastingClassifier-weighted_voting
CostSensitivePastingC

Unnamed: 0,Accuracy,Cost,F1,Name,Precision,Recall,Savings
0,0.999182,6169.57,0.734694,LogisticRegression,0.837209,0.654545,0.594864
1,0.999151,5708.01,0.750769,DecisionTree,0.7625,0.739394,0.625174
2,0.999465,5017.6,0.827119,RandomForest,0.938462,0.739394,0.670511
3,0.999413,6622.74,0.813333,XGBoost,0.903704,0.739394,0.565106
4,0.337204,163631.29,0.003121,CostSensitiveLogisticRegression,0.001565,0.6,-9.745133
5,0.998952,4859.59,0.640288,CostSensitiveDecisionTreeClassifier,0.787611,0.539394,0.680887
6,0.998941,5535.61,0.610039,CostSensitiveRandomForestClassifier-majority_v...,0.840426,0.478788,0.636495
7,0.99892,5384.17,0.619926,CostSensitiveRandomForestClassifier-weighted_v...,0.792453,0.509091,0.646439
8,0.001729,238527.5,0.003453,CostSensitiveRandomForestClassifier-stacking,0.001729,1.0,-14.663323
9,0.001729,238527.5,0.003453,CostSensitiveRandomForestClassifier-stacking_p...,0.001729,1.0,-14.663323


In [34]:
# Ascending order: the models with the lowest savings are listed first.
results.sort_values(by = 'Savings')

Unnamed: 0,Accuracy,Cost,F1,Name,Precision,Recall,Savings
16,0.001279,246752.43,0.00222,CostSensitiveBaggingClassifier-stacking,0.001112,0.642424,-15.203427
25,0.001729,238527.5,0.003453,CostSensitivePastingClassifier-stacking_proba,0.001729,1.0,-14.663323
24,0.001729,238527.5,0.003453,CostSensitivePastingClassifier-stacking,0.001729,1.0,-14.663323
8,0.001729,238527.5,0.003453,CostSensitiveRandomForestClassifier-stacking,0.001729,1.0,-14.663323
9,0.001729,238527.5,0.003453,CostSensitiveRandomForestClassifier-stacking_p...,0.001729,1.0,-14.663323
4,0.337204,163631.29,0.003121,CostSensitiveLogisticRegression,0.001565,0.6,-9.745133
19,0.993282,16052.53,0.009274,CostSensitiveBaggingClassifier-stacking_proba_bmr,0.006224,0.018182,-0.054117
26,0.993407,15715.18,0.012559,CostSensitivePastingClassifier-stacking_bmr,0.008475,0.024242,-0.031965
33,0.998271,15228.41,0.0,CostSensitiveRandomPatchesClassifier-stacking_...,0.0,0.0,0.0
32,0.998271,15228.41,0.0,CostSensitiveRandomPatchesClassifier-stacking,0.0,0.0,0.0


In [35]:
# Write the summary table to disk; index=False leaves the DataFrame index out of the file.
results.to_csv('Training-results.csv', index=False)

### TODO:
- Second Jupyter notebook with results analysis
- Hyperparameter optimization (DT, RF, XGBoost, CS) - Grid Search that is not computationally expensive
- Cross Validation (?)
- Rewrite this notebook as a script?
- Run the whole experiment for different Operational Cost values