<a href="https://colab.research.google.com/github/mohitkhannanu/ml-01/blob/master/Credit_Card_Fraud_Grid_Search_Extensive_xgboost_Drop_Gini.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Importing Libraries
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from datetime import datetime

In [3]:
# Load the dataset directly from its public CSV location.
DATA_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/creditcard.csv'
data = pd.read_csv(DATA_URL)

In [4]:
# Separate the predictors from the two target columns; everything is selected
# by position, so this relies on the column order of the loaded CSV.
feature_columns = slice(1, 29)      # positions 1..28 -> 28 predictor columns
X = data.iloc[:, feature_columns]
Y_cont = data.iloc[:, 29]           # presumably a continuous target -- verify against the CSV header
Y_class = data.iloc[:, 30]          # class label used by the classifier cells below

In [5]:
# Build three datasets: in-sample (70%), hold-out (15%) and validation (15%).

# Step 1: keep 70% of the rows in-sample; the other 30% go into a temporary pool (*_2).
(X_train, X_2,
 Y_cont_train, Y_cont_2,
 Y_class_train, Y_class_2) = train_test_split(
    X, Y_cont, Y_class, test_size=0.3, random_state=7)

# Shapes recorded from a previous run:
#   X_train       (199364, 28)    X_2        (85443, 28)
#   Y_cont_train  (199364,)       Y_cont_2   (85443,)
#   Y_class_train (199364,)       Y_class_2  (85443,)

# Step 2: cut the temporary pool in half to obtain the hold-out and validation sets.
(X_hold_out, X_val,
 Y_cont_hold_out, Y_cont_val,
 Y_class_hold_out, Y_class_val) = train_test_split(
    X_2, Y_cont_2, Y_class_2, test_size=0.5, random_state=7)

# Shapes recorded from a previous run:
#   X_hold_out       (42721, 28)    X_val        (42722, 28)
#   Y_cont_hold_out  (42721,)       Y_cont_val   (42722,)
#   Y_class_hold_out (42721,)       Y_class_val  (42722,)

In [6]:
# Gini definition
def gini(actual, pred):
    """Return the unnormalised Gini coefficient of `pred` as a ranking of `actual`.

    Rows are ordered by descending prediction (ties keep their original
    order). The cumulative share of `actual` captured along that ordering is
    summed, and the value expected from a random ordering, (n + 1) / 2, is
    subtracted before dividing by the number of rows.

    Parameters
    ----------
    actual : sequence of numbers
        Observed values (e.g. 0/1 class labels).
    pred : sequence of numbers
        Scores used to rank the rows; must have the same length as `actual`.

    Returns
    -------
    float
        The Gini coefficient. It is NaN when `actual` sums to zero, because
        the cumulative share is then undefined.

    Raises
    ------
    AssertionError
        If `actual` and `pred` differ in length.
    """
    assert (len(actual) == len(pred))
    n_rows = len(actual)
    # Columns: actual value, prediction, original row position.
    # Builtin float replaces np.float, which was removed in NumPy 1.24.
    table = np.asarray(np.c_[actual, pred, np.arange(n_rows)], dtype=float)
    # lexsort treats its last key as primary: descending prediction first,
    # then original position to break ties deterministically.
    order = np.lexsort((table[:, 2], -1 * table[:, 1]))
    ranked_actual = table[order, 0]
    total_losses = ranked_actual.sum()
    gini_sum = ranked_actual.cumsum().sum() / total_losses

    gini_sum -= (n_rows + 1) / 2.
    return gini_sum / n_rows


def gini_normalized(actual, pred):
    """Return the Gini of `pred` divided by the Gini of `actual` ranked by itself.

    Parameters
    ----------
    actual : sequence of numbers
        Observed values.
    pred : sequence of numbers
        Scores ranking the rows; same length as `actual`.

    Returns
    -------
    float
        gini(actual, pred) / gini(actual, actual).
    """
    model_gini = gini(actual, pred)
    # Reference value: the labels used as their own ranking.
    reference_gini = gini(actual, actual)
    return model_gini / reference_gini

In [None]:
# Parameter fine tuning: exhaustive grid search over tree depth, learning rate
# and number of boosting rounds (9 * 4 * 6 = 216 combinations, 3-fold CV each).
starttime = datetime.now()
model = XGBClassifier()

param_dist = {"max_depth": [2,3,4,5,6,7,8,9,10],
              #"min_child_weight" : [1,3,6],
              "learning_rate": [0.05,0.1,0.2,0.3],
              "n_estimators": [100,200,300,400,500,600]
              #"reg_lambda": [1,5,10],
              }
# NOTE(review): make_scorer is used without a probability option, so the scorer
# receives the hard labels from predict(); Gini is a ranking metric, so consider
# scoring predicted probabilities instead -- confirm the intent.
gini_scorer = metrics.make_scorer(gini_normalized, greater_is_better=True)
grid_search = GridSearchCV(estimator=model, param_grid=param_dist, scoring=gini_scorer,
                           cv=3, verbose=10, n_jobs=-1, return_train_score=True)
# The search is fitted on the hold-out split, not on the in-sample split.
grid_search.fit(X_hold_out, Y_class_hold_out)
# Print explicitly: a bare expression is only displayed when it is the last
# statement of a cell, so the best estimator was previously never shown.
print(grid_search.best_estimator_)

print(datetime.now() - starttime) #1:48:25.617153

In [None]:
# The string literal below is a disabled variant that lists the training-fold scores.
"""
print("Grid scores on development set:")
means = grid_search.cv_results_['mean_train_score']
stds = grid_search.cv_results_['std_train_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
  print("%0.3f : %r"
  % (mean, params))
"""

# List the mean cross-validated test score for every parameter combination.
print("Grid scores on development set:")
cv_results = grid_search.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']  # fetched alongside the means; not printed
for idx, params in enumerate(cv_results['params']):
    print("%0.3f : %r" % (means[idx], params))


In [37]:
import csv

# Persist the grid-search outcome as two fully quoted CSV rows:
# the parameter combinations first, then their mean test scores.
with open('test_results', 'w', newline='') as results_file:
    writer = csv.writer(results_file, quoting=csv.QUOTE_ALL)
    writer.writerows([grid_search.cv_results_['params'], means])

In [38]:
# Fit the final classifier on the in-sample split with hard-coded settings.
# learning_rate, max_depth and n_estimators are the hyper-parameters covered by
# the grid search above; every other setting is spelled out explicitly as well.
final_model_params = dict(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=1,
    gamma=0,
    learning_rate=0.05,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=1,
    missing=None,
    n_estimators=100,
    n_jobs=1,
    nthread=None,
    objective='binary:logistic',
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=None,
    silent=None,
    subsample=1,
    verbosity=1,
)
model = XGBClassifier(**final_model_params)
# Left as the last expression so the fitted estimator is displayed by the cell.
model.fit(X_train, Y_class_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [39]:
# Predict on the validation split and report plain accuracy.
# To evaluate another split, swap X_val / Y_class_val for
# X_hold_out / Y_class_hold_out or X_train / Y_class_train.
y_pred = model.predict(X_val)
predictions = list(map(round, y_pred))

# Share of validation rows whose rounded prediction equals the true class.
accuracy = accuracy_score(Y_class_val, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 99.96%


In [40]:
# Gini of the validation predictions, the Gini obtained when the labels rank
# themselves, and the ratio of the two.
# To score another split, regenerate `predictions` for it and swap Y_class_val
# for Y_class_hold_out or Y_class_train.
val_scores = (
    gini(Y_class_val, predictions),
    gini(Y_class_val, Y_class_val),
    gini_normalized(Y_class_val, predictions),
)
gini_predictions, gini_max, ngini = val_scores

print('Gini: %.3f, Max. Gini: %.3f, Normalized Gini: %.3f' % val_scores)

# Normalized Gini recorded from previous runs
# Grid search:  in-sample 0.816 | hold-out 0.842 | validation 0.783
# Bayesian:     validation 0.770

Gini: 0.391, Max. Gini: 0.499, Normalized Gini: 0.783
