<a href="https://colab.research.google.com/github/mohitkhannanu/ml-01/blob/master/Credit_Card_Fraud_Bayesian_xgboost_3__Drop_Gini.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install bayesian-optimization

In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from datetime import datetime

import xgboost as xgb
from sklearn.metrics import roc_auc_score

# Bayesian Optimization
# Importing necessary libraries
from bayes_opt import BayesianOptimization
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [None]:
# Read the credit-card transactions CSV straight from its public URL.
data = pd.read_csv(
    filepath_or_buffer='https://storage.googleapis.com/download.tensorflow.org/data/creditcard.csv'
)

In [None]:
# Carve the frame into predictors and two targets, selected by column position.
# presumably column 0 is skipped on purpose, and columns 29 / 30 hold the
# continuous and the binary target respectively — TODO confirm against the CSV header
X, Y_cont, Y_class = (
    data.iloc[:, 1:29],
    data.iloc[:, 29],
    data.iloc[:, 30],
)

In [None]:
# Build three samples: in-sample (70%), hold-out (15%) and validation (15%).

# Step 1 - keep 70% as the in-sample set; the remaining 30% is split again below.
(X_train, X_2,
 Y_cont_train, Y_cont_2,
 Y_class_train, Y_class_2) = train_test_split(
    X, Y_cont, Y_class,
    test_size=0.3,
    random_state=7,
)

# Shapes noted from a previous run (targets are 1-D):
#   X_train        (199364, 28)    X_2        (85443, 28)
#   Y_cont_train   (199364,)       Y_cont_2   (85443,)
#   Y_class_train  (199364,)       Y_class_2  (85443,)

# Step 2 - halve the 30% remainder into the hold-out and validation sets.
(X_hold_out, X_val,
 Y_cont_hold_out, Y_cont_val,
 Y_class_hold_out, Y_class_val) = train_test_split(
    X_2, Y_cont_2, Y_class_2,
    test_size=0.5,
    random_state=7,
)

# Shapes noted from a previous run (targets are 1-D):
#   X_hold_out        (42721, 28)    X_val        (42722, 28)
#   Y_cont_hold_out   (42721,)       Y_cont_val   (42722,)
#   Y_class_hold_out  (42721,)       Y_class_val  (42722,)

In [None]:
# Gini definition
def gini(actual, pred):
    """Return the (un-normalised) Gini coefficient of `pred` as a ranking of `actual`.

    Observations are ordered by descending prediction; ties keep their
    original relative order. The score is derived from the cumulative share of
    `actual` captured along that ordering.

    Parameters
    ----------
    actual : sequence of numbers
        Observed values (e.g. 0/1 class labels).
    pred : sequence of numbers
        Predicted scores or labels, same length as `actual`.

    Returns
    -------
    float
        The Gini value. The computation divides by ``sum(actual)`` and by
        ``len(actual)``, so an all-zero or empty `actual` does not give a
        meaningful result.

    Raises
    ------
    AssertionError
        If `actual` and `pred` differ in length.
    """
    assert (len(actual) == len(pred))
    n_obs = len(actual)
    # Columns: actual value, prediction, original position.
    # The builtin `float` replaces the `np.float` alias, which newer NumPy
    # releases no longer provide.
    table = np.asarray(np.c_[actual, pred, np.arange(n_obs)], dtype=float)
    # lexsort uses the LAST key as primary: prediction descending, then the
    # original position ascending as the tie-breaker.
    ranked = table[np.lexsort((table[:, 2], -1 * table[:, 1]))]
    total_losses = ranked[:, 0].sum()
    gini_sum = ranked[:, 0].cumsum().sum() / total_losses

    gini_sum -= (n_obs + 1) / 2.
    return gini_sum / n_obs


def gini_normalized(actual, pred):
    """Scale the Gini of `pred` by the Gini of a perfect ordering.

    The denominator is ``gini(actual, actual)``, i.e. the score obtained when
    the observations are ranked by their own true values.
    """
    model_gini = gini(actual, pred)
    perfect_gini = gini(actual, actual)
    return model_gini / perfect_gini

In [None]:
# Bayesian Optimization function for xgboost
# Specify the parameters you want to tune as keyword arguments
# Wall-clock start of the tuning run; the elapsed time is printed after the
# optimiser finishes at the end of this cell.
starttime = datetime.now()
def bo_tune_xgb(max_depth, n_estimators, learning_rate):
    """Objective for the Bayesian optimiser: the negated in-sample to hold-out Gini drop.

    Fits an XGBClassifier on ``(X_train, Y_class_train)`` with the proposed
    hyper-parameters, scores normalised Gini on the training sample and on the
    hold-out sample, and returns ``-(train_gini - test_gini)``. Maximising the
    return value therefore minimises the drop in Gini between the two samples.

    Parameters
    ----------
    max_depth : float
        Proposed tree depth; truncated with ``int()`` before use.
    n_estimators : float
        Proposed number of boosting rounds; truncated with ``int()`` before use.
    learning_rate : float
        Proposed learning rate, passed through unchanged.

    Returns
    -------
    float
        Hold-out normalised Gini minus in-sample normalised Gini.
    """
    # Legacy keyword arguments that were passed as None (nthread, seed, silent)
    # are omitted; leaving them out keeps the library defaults.
    test_model = XGBClassifier(booster='gbtree',
                               learning_rate=learning_rate,
                               max_depth=int(max_depth),
                               n_estimators=int(n_estimators),
                               n_jobs=1,
                               objective='binary:logistic',
                               random_state=0,
                               verbosity=1)
    test_model.fit(X_train, Y_class_train)

    # predict() already yields class labels, so they are passed to the Gini
    # helper directly instead of being rounded one by one in a Python loop.
    # NOTE(review): Gini is scored on hard labels here, not on
    # predict_proba scores — confirm that this is the intended metric.

    # In-sample Gini
    train_gini = gini_normalized(Y_class_train, test_model.predict(X_train))

    # Hold-out sample Gini
    test_gini = gini_normalized(Y_class_hold_out, test_model.predict(X_hold_out))

    drop_gini = train_gini - test_gini  # drop in Gini across samples
    return drop_gini * -1
    
# Invoke the Bayesian optimiser over the search space below. The bounds are
# continuous; bo_tune_xgb truncates max_depth and n_estimators to integers.
# random_state makes the sequence of probed points repeatable between runs.
xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth': (2,10),
                                             #'gamma': (0, 1),
                                             'learning_rate':(0.05, 0.3),
                                             'n_estimators':(100,600)
                                            },
                              random_state=7)

# 10 random exploration points followed by 15 guided iterations, using the
# expected-improvement ('ei') acquisition function.
# NOTE(review): the `acq` keyword looks like the older bayesian-optimization
# API — confirm the installed version still accepts it.
xgb_bo.maximize(n_iter=15, init_points=10, acq='ei')

# Extract the best parameters found and report the elapsed wall-clock time
params = xgb_bo.max['params']
print(params)
print(datetime.now() - starttime) #0:57:46.245054

|   iter    |  target   | learni... | max_depth | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m-0.1677  [0m | [0m 0.07614 [0m | [0m 3.734   [0m | [0m 584.1   [0m |
| [95m 2       [0m | [95m-0.162   [0m | [95m 0.234   [0m | [95m 9.599   [0m | [95m 508.7   [0m |
| [0m 3       [0m | [0m-0.1805  [0m | [0m 0.0656  [0m | [0m 5.4     [0m | [0m 523.5   [0m |
| [0m 4       [0m | [0m-0.1805  [0m | [0m 0.1234  [0m | [0m 4.157   [0m | [0m 305.3   [0m |
| [0m 5       [0m | [0m-0.162   [0m | [0m 0.07687 [0m | [0m 6.034   [0m | [0m 372.0   [0m |
| [95m 6       [0m | [95m 0.007567[0m | [95m 0.08268 [0m | [95m 2.798   [0m | [95m 328.9   [0m |
| [0m 7       [0m | [0m-0.1704  [0m | [0m 0.2622  [0m | [0m 7.99    [0m | [0m 286.9   [0m |
| [0m 8       [0m | [0m-0.162   [0m | [0m 0.2694  [0m | [0m 9.298   [0m | [0m 325.7   [0m |
| [0m 9       [0m | [0m-0.154   [0m | [0m 0.2

In [None]:
# Fit the final model on the in-sample data with fixed hyper-parameters.
# presumably learning_rate / max_depth / n_estimators come from the tuning
# run above — TODO confirm, the captured optimiser output is truncated.
# The None-valued legacy arguments from the pasted estimator repr (missing,
# nthread, seed, silent) are left out so the library defaults apply; newer
# xgboost releases may warn about or reject them.
model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=2,
              min_child_weight=1, n_estimators=356, n_jobs=1,
              objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, verbosity=1)
model.fit(X_train, Y_class_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=None, n_estimators=356, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
# Score the validation sample with the fitted model.
# To evaluate another sample instead, substitute X_hold_out / Y_class_hold_out
# or X_train / Y_class_train for the validation pair used below.
y_pred = model.predict(X_val)
predictions = list(map(round, y_pred))

# Share of validation rows whose predicted label equals the true label.
accuracy = accuracy_score(Y_class_val, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 99.95%


In [None]:
# Gini on the validation sample: raw score, best attainable score, and their ratio.
# To report the hold-out or in-sample figures, use Y_class_hold_out or
# Y_class_train here together with predictions made on the matching sample.
gini_predictions, gini_max = (
    gini(Y_class_val, predictions),
    gini(Y_class_val, Y_class_val),
)
ngini = gini_normalized(Y_class_val, predictions)

print('Gini: %.3f, Max. Gini: %.3f, Normalized Gini: %.3f' % (gini_predictions, gini_max, ngini))

# Normalized Gini recorded from earlier runs
#
# Grid Search results
# In Sample------------------ 0.816
# Hold Out------------------- 0.842
# Val-------------------------0.783

# Bayesian
# Val-------------------------0.770

Gini: 0.385, Max. Gini: 0.499, Normalized Gini: 0.770
