In [1]:
import xgboost
import warnings

import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from costcla.metrics import cost_loss, savings_score
from costcla.models import BayesMinimumRiskClassifier, ThresholdingOptimization
from costcla.models import CostSensitiveDecisionTreeClassifier, CostSensitiveLogisticRegression
from costcla.models import CostSensitiveRandomForestClassifier, CostSensitiveBaggingClassifier, CostSensitivePastingClassifier, CostSensitiveRandomPatchesClassifier

# Silence every Python warning for the rest of the notebook session.
warnings.filterwarnings('ignore')
# Seed shared by the data splits and the seeded estimators below.
RANDOM_STATE = 42



In [2]:
def create_cost_matrix(df, fp_cost, fn_cost, tp_cost, tn_cost):
    """Build a per-row cost matrix for example-dependent cost-sensitive learning.

    Each cost argument is either a constant (applied to every row) or the name
    of a column in ``df`` whose values are used row by row.

    Args:
        df: DataFrame with one row per example.
        fp_cost: Cost of a false positive (constant or column name).
        fn_cost: Cost of a false negative (constant or column name).
        tp_cost: Cost of a true positive (constant or column name).
        tn_cost: Cost of a true negative (constant or column name).

    Returns:
        A float ``numpy`` array of shape ``(len(df), 4)`` whose columns are, in
        order: false positives, false negatives, true positives, true negatives.
    """
    def generate_cost(df, cost):
        # isinstance (rather than an exact type comparison) also accepts str
        # subclasses such as numpy.str_ as column names.
        return df[cost] if isinstance(cost, str) else cost

    cost_matrix = np.zeros((df.shape[0], 4))

    # Column order: false positives, false negatives, true positives, true negatives
    for column, cost in enumerate((fp_cost, fn_cost, tp_cost, tn_cost)):
        cost_matrix[:, column] = generate_cost(df, cost)

    return cost_matrix


def create_model_summary(model, name, x, y, cost_matrix):
    """Predict with ``model`` on ``x`` and summarise the predictions against ``y``.

    Args:
        model: A fitted estimator, or an ``(estimator, wrapper)`` tuple as
            returned by ``create_bmr_model`` /
            ``create_threshold_optimized_model``.
        name: Display name of the model. For tuple models it also selects the
            prediction path: names containing ``'bmr'`` use the BMR wrapper,
            names containing ``'threshold-opt'`` use the threshold wrapper.
        x: Feature matrix to predict on.
        y: True labels for ``x``.
        cost_matrix: Per-row cost matrix aligned with ``x``; used for the BMR
            prediction and for the cost / savings metrics.

    Returns:
        dict with keys ``Name``, ``Accuracy``, ``Precision``, ``Recall``,
        ``F1``, ``Cost`` and ``Savings``.
    """
    # Only tuple models carry a wrapper. Requiring a tuple keeps plain
    # estimators whose name merely contains 'bmr' (e.g. an ensemble using a
    # '*_bmr' combination) on the regular predict path instead of failing
    # on tuple unpacking.
    is_wrapped = isinstance(model, tuple)

    if is_wrapped and 'bmr' in name:
        # BMR Model
        model, bmr = model
        y_hat_proba = model.predict_proba(x)
        # Use the cost matrix passed by the caller, not the notebook-level
        # cost_matrix_test global, so any split can be evaluated.
        y_hat = bmr.predict(y_hat_proba, cost_matrix)
    elif is_wrapped and 'threshold-opt' in name:
        # Threshold Optimized Model
        model, threshold_opt = model
        y_hat_proba = model.predict_proba(x)
        y_hat = threshold_opt.predict(y_hat_proba)
    else:
        y_hat = model.predict(x)

    return {
        'Name': name,
        'Accuracy': accuracy_score(y, y_hat),
        'Precision': precision_score(y, y_hat),
        'Recall': recall_score(y, y_hat),
        'F1': f1_score(y, y_hat),
        'Cost': cost_loss(y, y_hat, cost_matrix),
        'Savings': savings_score(y, y_hat, cost_matrix)
    }


def create_bmr_model(model, name, x_val, y_val, calibration = True):
    """Fit a Bayes Minimum Risk wrapper on top of an already fitted model.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        name: Base name of the model.
        x_val: Features the BMR wrapper is fitted on.
        y_val: True labels for ``x_val``.
        calibration: Forwarded to ``BayesMinimumRiskClassifier``.

    Returns:
        ``(new_name, (model, bmr))`` where ``new_name`` is ``name`` suffixed
        with ``'-bmr'`` and, when ``calibration`` is true, ``'-calibration'``.
    """
    # Use the x_val argument; the original read the notebook-level X_val
    # global and silently ignored whatever the caller passed in.
    y_hat_val_proba = model.predict_proba(x_val)

    bmr = BayesMinimumRiskClassifier(calibration = calibration)
    bmr.fit(y_val, y_hat_val_proba)

    name = name + '-bmr'
    if calibration:
        name = name + '-calibration'

    return (name, (model, bmr))


def create_threshold_optimized_model(model, name, X_train, y_train, cost_matrix_train, calibration = True):
    """Fit a ``ThresholdingOptimization`` wrapper on top of an already fitted model.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        name: Base name of the model.
        X_train: Features used to obtain the probabilities the wrapper is fitted on.
        y_train: True labels for ``X_train``.
        cost_matrix_train: Per-row cost matrix aligned with ``X_train``.
        calibration: Forwarded to ``ThresholdingOptimization``.

    Returns:
        ``(new_name, (model, threshold_opt))`` where ``new_name`` is ``name``
        suffixed with ``'-threshold-opt'`` and, when ``calibration`` is true,
        ``'-calibration'``.
    """
    probabilities = model.predict_proba(X_train)

    optimizer = ThresholdingOptimization(calibration = calibration)
    optimizer.fit(probabilities, cost_matrix_train, y_train)

    label = name + '-threshold-opt'
    if calibration:
        label = label + '-calibration'

    return (label, (model, optimizer))

In [3]:
# Flat cost used for both the false-positive and the true-positive columns of the cost matrix.
OPERATIONAL_COST = 2.5

In [4]:
# Load the transactions and derive the per-row cost matrix:
# flat cost for false/true positives, the 'Amount' column for false negatives, zero for true negatives.
df = pd.read_csv('data/creditcard.csv')
cost_matrix = create_cost_matrix(
    df,
    fp_cost = OPERATIONAL_COST,
    fn_cost = 'Amount',
    tp_cost = OPERATIONAL_COST,
    tn_cost = 0,
)

# Features exclude 'Time', 'Amount' (used as the false-negative cost above) and the label.
X = df.drop(columns = ['Time', 'Amount', 'Class'])
y = df['Class']

In [5]:
# First split: half of the data for training, stratified on the label.
X_train, X_test, y_train, y_test, cost_matrix_train, cost_matrix_test = train_test_split(
    X, y, cost_matrix,
    train_size = 0.5, stratify = y, random_state = RANDOM_STATE
)
# Second split: 33% of the remaining half becomes the validation set; the rest stays as the test set.
X_val, X_test, y_val, y_test, cost_matrix_val, cost_matrix_test = train_test_split(
    X_test, y_test, cost_matrix_test,
    train_size = 0.33, stratify = y_test, random_state = RANDOM_STATE
)

In [6]:
# Ensemble combination strategies to evaluate.
# Not enabled yet: 'weighted_voting', 'stacking', 'stacking_proba', 'stacking_bmr', 'stacking_proba_bmr', 'majority_bmr', 'weighted_bmr'
combinations = ['majority_voting']

# One cost-sensitive random forest per combination strategy, keyed by a descriptive name.
csrfc = {
    f'CostSensitiveRandomForestClassifier-{combination}': CostSensitiveRandomForestClassifier(combination = combination)
    for combination in combinations
}
# TODO: bagging, pasting and random-patches ensembles (csbc / cspc / csrpc) are not built yet.

# Registry of every model under evaluation; insertion order defines the row order of the results table.
models = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
    'XGBoost': xgboost.XGBClassifier(random_state=RANDOM_STATE),
    'CostSensitiveLogisticRegression': CostSensitiveLogisticRegression(),
    'CostSensitiveDecisionTreeClassifier': CostSensitiveDecisionTreeClassifier(),
    **csrfc,
}

In [7]:
# Regular training

# Cost-insensitive scikit-learn baselines.
for name in ['LogisticRegression', 'DecisionTree', 'RandomForest']:
    print(name)
    models[name].fit(X_train.values, y_train.values)

# Cost-sensitive models take the per-row cost matrix as a third fit argument.
# The cost-sensitive random forests registered in `csrfc` are fitted here as well;
# otherwise they would reach the evaluation cell unfitted.
for name in ['CostSensitiveLogisticRegression', 'CostSensitiveDecisionTreeClassifier', *csrfc]:
    print(name)
    models[name].fit(X_train.values, y_train.values, cost_matrix_train)

print('XGBoost')
models['XGBoost'].fit(
    X_train.values, y_train.values,
    # eval_set must be a list of (X, y) pairs, not a bare (X, y) tuple.
    eval_set = [(X_val.values, y_val.values)],
    early_stopping_rounds = 50
)

LogisticRegression
DecisionTree
RandomForest
CostSensitiveLogisticRegression
CostSensitiveDecisionTreeClassifier
CostSensitiveRandomForestClassifier-majority_voting


In [8]:
# Threshold Optimization training
# Wrap each probabilistic base model with a ThresholdingOptimization step,
# once with calibration and once without, and register the result under its new name.
for name in ['LogisticRegression', 'DecisionTree', 'RandomForest', 'XGBoost']:
    model = models[name]
    for calibration in [True, False]:
        name_threshold_opt, model_threshold_opt = create_threshold_optimized_model(
            model, name, X_train, y_train, cost_matrix_train,
            calibration = calibration
        )
        models[name_threshold_opt] = model_threshold_opt

In [9]:
# BMR training
# Wrap each probabilistic base model with a Bayes Minimum Risk step fitted on the validation split,
# once with calibration and once without, and register the result under its new name.
for name in ['LogisticRegression', 'DecisionTree', 'RandomForest', 'XGBoost']:
    model = models[name]
    for calibration in [True, False]:
        name_bmr, model_bmr = create_bmr_model(
            model, name, X_val, y_val,
            calibration = calibration
        )
        models[name_bmr] = model_bmr

In [None]:
# Hyperparameter search for regular models

# XGBoost
# Cost Sensitive Ensembles

In [10]:
# Evaluate every registered model on the test split; one summary row per model.
results = pd.DataFrame([
    create_model_summary(model, name, X_test.values, y_test.values, cost_matrix_test)
    for name, model in models.items()
])
results

Unnamed: 0,Accuracy,Cost,F1,Name,Precision,Recall,Savings
0,0.999182,6169.57,0.734694,LogisticRegression,0.837209,0.654545,0.594864
1,0.999151,5708.01,0.750769,DecisionTree,0.7625,0.739394,0.625174
2,0.999465,5017.6,0.827119,RandomForest,0.938462,0.739394,0.670511
3,0.470145,127966.55,0.005821,CostSensitiveLogisticRegression,0.00292,0.89697,-7.403146
4,0.998952,4859.59,0.640288,CostSensitiveDecisionTreeClassifier,0.787611,0.539394,0.680887
5,0.998952,5828.2,0.621212,CostSensitiveRandomForestClassifier-majority_v...,0.828283,0.49697,0.617281
6,0.999141,6077.9,0.738854,LogisticRegression-threshold-opt-calibration,0.778523,0.70303,0.600884
7,0.997086,4628.03,0.501792,LogisticRegression-threshold-opt,0.356234,0.848485,0.696092
8,0.999151,5708.01,0.750769,DecisionTree-threshold-opt-calibration,0.7625,0.739394,0.625174
9,0.999151,5708.01,0.750769,DecisionTree-threshold-opt,0.7625,0.739394,0.625174
