<a href="https://colab.research.google.com/github/mohitkhannanu/ml-01/blob/master/Credit_Card_Fraud_basic_xgboost_(Gini_Drop).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Importing Libraries
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from datetime import datetime

In [9]:
# Loading Dataset
data = pd.read_csv('https://storage.googleapis.com/download.tensorflow.org/data/creditcard.csv')

In [10]:
# split data into X and y
X = data.iloc[:,1:29]
Y_cont = data.iloc[:,29]
Y_class = data.iloc[:,30]

In [11]:
# create 3 datasets - In Sample, Hold out, Validation data
# 70%, 15%, 15%
# In Sample

X_train, X_2, Y_cont_train, Y_cont_2, Y_class_train, Y_class_2 = train_test_split(X, Y_cont, Y_class, test_size=0.3, random_state=7)

#X_train.shape #(199364, 28)
#X_2.shape #(85443, 28)
#Y_cont_train.shape #(199364,)
#Y_cont_2.shape #(85443,)
#Y_class_train.shape #(199364, 28)
#Y_class_2.shape #(85443,)

X_hold_out, X_val, Y_cont_hold_out, Y_cont_val, Y_class_hold_out, Y_class_val = train_test_split(X_2, Y_cont_2, Y_class_2, test_size=0.5, random_state=7)

#X_hold_out.shape #(42721, 28)
#X_val.shape #(42722, 28)
#Y_cont_hold_out.shape #(42721,)
#Y_cont_val.shape #(42722,)
#Y_class_hold_out.shape #(42721, 28)
#Y_class_val.shape #(42722,)

In [13]:
# Gini definition
def gini(actual, pred):
    assert (len(actual) == len(pred))
    all = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=np.float)
    all = all[np.lexsort((all[:, 2], -1 * all[:, 1]))]
    totalLosses = all[:, 0].sum()
    giniSum = all[:, 0].cumsum().sum() / totalLosses

    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)


def gini_normalized(actual, pred):
    return gini(actual, pred) / gini(actual, actual)

In [28]:
# Parameter fine Tuning
starttime = datetime.now()
model = XGBClassifier()

param_dist = {"max_depth": [2],
              #"min_child_weight" : [1,3,6],
              "learning_rate": [0.05, 0.1],
              "n_estimators": [100,200]
              #"reg_lambda": [1,5,10],
              }           
gini_scorer = metrics.make_scorer(gini_normalized, greater_is_better = True)
grid_search = GridSearchCV(scoring = gini_scorer, estimator = model, param_grid=param_dist, cv = 3, 
                                   verbose=10, n_jobs=-1, return_train_score=True)
grid_search.fit(X_hold_out, Y_class_hold_out)
grid_search.best_estimator_

print(datetime.now() - starttime) #0:08:33.140362

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   20.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   41.6s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  1.0min finished


0:01:12.552378


In [30]:
print("Grid scores on development set:")
means = grid_search.cv_results_['mean_train_score']
stds = grid_search.cv_results_['std_train_score']
for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
  print("%0.3f (+/-%0.03f) for %r"
  % (mean, std * 2, params))

Grid scores on development set:
0.861 (+/-0.027) for {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 100}
0.866 (+/-0.018) for {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 200}
0.866 (+/-0.018) for {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100}
0.971 (+/-0.004) for {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 200}


In [27]:
grid_search.cv_results_.items()

dict_items([('mean_fit_time', array([ 6.4286619 , 12.63631312,  6.37223172, 10.50976713])), ('std_fit_time', array([0.03707264, 0.10350583, 0.01521197, 2.323019  ])), ('mean_score_time', array([0.04589105, 0.07934014, 0.04344487, 0.08034452])), ('std_score_time', array([0.00407202, 0.01348259, 0.00303492, 0.01297646])), ('param_learning_rate', masked_array(data=[0.05, 0.05, 0.1, 0.1],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object)), ('param_max_depth', masked_array(data=[2, 2, 2, 2],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object)), ('param_n_estimators', masked_array(data=[100, 200, 100, 200],
             mask=[False, False, False, False],
       fill_value='?',
            dtype=object)), ('params', [{'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 100}, {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 200}, {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100}, {'l

In [None]:
#test accuracy
model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)
model.fit(X_train, Y_class_train)

In [None]:
# make predictions for test data
#y_pred  = model.predict(X_val)
#y_pred  = model.predict(X_hold_out)
y_pred  = model.predict(X_train)
predictions = [round(value) for value in y_pred]
# evaluate predictions
#accuracy = accuracy_score(Y_class_val, predictions)
#accuracy = accuracy_score(Y_class_hold_out, predictions)
accuracy = accuracy_score(Y_class_train, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 99.96%


In [None]:
# Gini calculations
#gini_predictions = gini(Y_class_val, predictions)
#gini_max = gini(Y_class_val, Y_class_val)
#ngini= gini_normalized(Y_class_val, predictions)

#gini_predictions = gini(Y_class_hold_out, predictions)
#gini_max = gini(Y_class_hold_out, Y_class_hold_out)
#ngini= gini_normalized(Y_class_hold_out, predictions)

gini_predictions = gini(Y_class_train, predictions)
gini_max = gini(Y_class_train, Y_class_train)
ngini= gini_normalized(Y_class_train, predictions)

print('Gini: %.3f, Max. Gini: %.3f, Normalized Gini: %.3f' % (gini_predictions, gini_max, ngini))

# In Sample------------------ 0.816
# Hold Out------------------- 0.842
# Val-------------------------0.783

Gini: 0.407, Max. Gini: 0.499, Normalized Gini: 0.816


In [None]:
# Gini Drop Hold Out to Val
(0.783/0.842-1)*100

-7.007125890736332