In [1]:
import pandas as pd
import numpy as np
import os
from bokeh.io import output_notebook, show
output_notebook()
import bokeh as bh
from sklearn.preprocessing import LabelEncoder
import gc
from glob import glob
import matplotlib.pyplot as plt
import re
from math import ceil
import xgboost as xgb
from sklearn.model_selection import GroupKFold,KFold,StratifiedKFold,train_test_split,TimeSeriesSplit
import random
import operator
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
import datetime 
import fastai.tabular as ft
# Let pandas display up to 1000 columns so wide frames are not elided when shown.
pd.set_option('display.max_columns', 1000)

In [2]:
# Read the training transactions CSV (path is relative to the working directory).
train_df = pd.read_csv(filepath_or_buffer='train_transaction.csv')

In [3]:
# Read the test transactions CSV (path is relative to the working directory).
test_df = pd.read_csv(filepath_or_buffer='test_transaction.csv')

In [4]:
# Feature columns: every column of train_df except TransactionID,
# TransactionDT and isFraud. list.remove raises ValueError if one is absent.
cols = train_df.columns.tolist()
for non_feature in ('TransactionID', 'TransactionDT', 'isFraud'):
    cols.remove(non_feature)

In [5]:
# Label-encode every object-dtype feature column. Each encoder is fitted on the
# train and test values together so both frames share one integer mapping.
for f in cols:
    if train_df[f].dtype != 'object':
        continue  # only object-dtype columns are encoded
    print("{}-{}".format(f, train_df[f].dtype))
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_df[f].values) + list(test_df[f].values))
    # Overwrite the original columns in place with their integer codes.
    train_df[f] = lbl.transform(list(train_df[f].values))
    test_df[f] = lbl.transform(list(test_df[f].values))

ProductCD-object
card4-object
card6-object
P_emaildomain-object
R_emaildomain-object
M1-object
M2-object
M3-object
M4-object
M5-object
M6-object
M7-object
M8-object
M9-object


In [6]:
class MetricsMeter():
    """Compute binary-classification metrics for scores cut at a threshold.

    Parameters
    ----------
    y_true : iterable
        Ground-truth labels.
    y_pred : iterable
        Predicted scores; a score strictly greater than ``threshold`` is
        counted as class 1, anything else as class 0.
    threshold : number
        Cut-off applied to ``y_pred`` for every metric except AUC.
    """

    def __init__(self, y_true, y_pred, threshold):
        self.y_true = y_true
        self.y_pred = y_pred
        self.thresh = threshold

    def fit(self):
        """Return a dict of metrics.

        Keys: 'auc', 'f1' (macro-averaged), 'sensitivity/recall',
        'precision', 'accuracy' and 'specificity'.

        AUC is computed on the raw scores; every other metric uses the
        thresholded labels. The raw scores in ``self.y_pred`` are left
        untouched so that calling ``fit`` again yields the same result;
        the thresholded labels are exposed as ``self.y_pred_label``.
        """
        metrics = {}
        metrics['auc'] = roc_auc_score(self.y_true, self.y_pred)
        # Keep the thresholded labels separate from the scores: overwriting
        # self.y_pred here would make a second fit() compute AUC on 0/1 labels.
        y_label = [1 if x > self.thresh else 0 for x in self.y_pred]
        self.y_pred_label = y_label
        metrics['f1'] = f1_score(self.y_true, y_label, average='macro')
        tn, fp, fn, tp = confusion_matrix(self.y_true, y_label).ravel()
        metrics['sensitivity/recall'] = tp / (tp + fn)
        metrics['precision'] = tp / (tp + fp)
        metrics["accuracy"] = (tp + tn) / (tp + fp + fn + tn)
        metrics['specificity'] = tn / (tn + fp)
        return metrics

In [7]:
def RunXGB(x_build,x_val,y_build,y_val,dtest,seed):
    """Train one XGBoost binary classifier with early stopping and predict.

    Parameters
    ----------
    x_build, y_build
        Features and labels the booster is trained on.
    x_val, y_val
        Features and labels of the validation set; it is the last entry of
        the watchlist and is the set that gets predicted as ``pred_val``.
    dtest
        Passed straight to ``Booster.predict``, so it must already be an
        ``xgb.DMatrix``.
    seed
        Value stored under the 'seed' training parameter.

    Returns
    -------
    tuple
        ``(booster, pred_val, pred_t)``: the trained booster and its
        predictions for the validation set and for ``dtest``.
    """
    dbuild = xgb.DMatrix(x_build,y_build)
    dval = xgb.DMatrix(x_val,y_val)
    watchlist = [(dbuild, 'train'), (dval, 'val')]
    # NOTE(review): 'learning_rate' and 'eta' are documented by XGBoost as
    # aliases, yet they are given different values here (0.01 vs 0.08) -
    # confirm which one is intended and drop the other.
    # NOTE(review): 'verbose_eval' is an argument of xgb.train (passed below),
    # not a booster parameter; the entry in this dict looks like a leftover.
    params = {'objective': 'binary:logistic',
              'booster': 'gbtree',
              'eval_metric': 'auc',
              'nthread': 96,
              'max_depth': 6,
              'learning_rate': 0.01,
              'subsample': 0.8,
              'min_child_weight': 1,
              "colsample_bytree": 0.9,
              'eta': 0.08,
              'verbose_eval': True,
              'silent':1,
              'seed': seed
              }
    clf_xgb = xgb.train(params, dbuild, num_boost_round=3000, verbose_eval=100, early_stopping_rounds=100, evals=watchlist)
    # best_iteration is a 0-based round index, while ntree_limit is a tree
    # COUNT: passing best_iteration drops the best round's tree, and a
    # best_iteration of 0 would mean "use all trees". best_ntree_limit is the
    # attribute meant to be fed to ntree_limit after early stopping.
    best_ntree = clf_xgb.best_ntree_limit
    pred_val = clf_xgb.predict(dval, ntree_limit=best_ntree)
    pred_t = clf_xgb.predict(dtest, ntree_limit=best_ntree)
    return(clf_xgb,pred_val,pred_t)

In [8]:
# Give the training frame a fresh default index, then split it into the
# feature matrix and the isFraud target.
train = train_df.reset_index(drop=True)
x_train, y_train = train[cols], train['isFraud']
# Test features are wrapped in a DMatrix once and reused for every prediction;
# the TransactionID column is kept aside (with a fresh index) in `td`.
dtest = xgb.DMatrix(test_df[cols].iloc[:])
td = test_df.TransactionID.reset_index(drop=True)

In [9]:
# Drop the names of the raw frames (train / x_train / y_train / dtest / td
# were derived from them above), then run a garbage-collection pass.
del train_df
del test_df
gc.collect()

97

In [None]:
# Stratified K-fold training: each fold trains on the "build" part, predicts
# its validation part (out-of-fold predictions go to pred_train) and predicts
# the test set; test predictions are averaged over folds into pred_test_f.
n_splits = 5
# Seeds of the models averaged inside each fold. Add more entries to bag
# several models per fold.
seeds = [10999973]
cv_scores = []  # validation AUC of each fold
pred_test_f = 0
pred_train = np.zeros(train.shape[0])
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=99)
for fold, (build_index, val_index) in enumerate(kf.split(x_train, y_train), start=1):
    x_build = x_train.iloc[build_index]
    y_build = y_train.iloc[build_index]
    x_val = x_train.iloc[val_index]
    y_val = y_train.iloc[val_index]

    # Average the predictions of one model per seed.
    pred_val = 0
    pred_test = 0
    for seed in seeds:
        model, pred_v, pred_t = RunXGB(x_build, x_val, y_build, y_val, dtest, seed=seed)
        pred_val += pred_v
        pred_test += pred_t
    n_models = float(len(seeds))
    pred_val /= n_models
    pred_test /= n_models

    # Record the fold's validation AUC so the run reports a CV score.
    fold_auc = roc_auc_score(y_val, pred_val)
    cv_scores.append(fold_auc)
    print("fold {} val auc: {:.6f}".format(fold, fold_auc))

    pred_train[val_index] = pred_val
    # Range checks of the accumulated predictions. pred_train still holds
    # zeros for folds that have not been predicted yet.
    print(pred_train.min())
    print(pred_train.max())
    pred_test_f += pred_test / n_splits
    print(pred_test_f.min())
    print(pred_test_f.max())

    # Gain-based importance of the 50 strongest features of the last model
    # trained in this fold.
    fig, ax = plt.subplots(figsize=(12,18))
    xgb.plot_importance(model, importance_type="gain", max_num_features=50, height=0.8, ax=ax)
    plt.show()
    plt.close(fig)  # release the figure so figures do not pile up across folds

print("mean fold auc: {:.6f}".format(np.mean(cv_scores)))
print("out-of-fold auc: {:.6f}".format(roc_auc_score(y_train, pred_train)))

  if getattr(data, 'base', None) is not None and \


[0]	train-auc:0.787374	val-auc:0.785268
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 100 rounds.
[100]	train-auc:0.857676	val-auc:0.854462
[200]	train-auc:0.869277	val-auc:0.865183
[300]	train-auc:0.886471	val-auc:0.883185
[400]	train-auc:0.897267	val-auc:0.893431
[500]	train-auc:0.906241	val-auc:0.90127
[600]	train-auc:0.913618	val-auc:0.907554
[700]	train-auc:0.920205	val-auc:0.913388
[800]	train-auc:0.924593	val-auc:0.91738
[900]	train-auc:0.92755	val-auc:0.919839
[1000]	train-auc:0.930318	val-auc:0.922092
[1100]	train-auc:0.933147	val-auc:0.924389
[1200]	train-auc:0.935312	val-auc:0.92608
[1300]	train-auc:0.937766	val-auc:0.928011
[1400]	train-auc:0.93992	val-auc:0.929759
[1500]	train-auc:0.941861	val-auc:0.931253
[1600]	train-auc:0.943603	val-auc:0.932667
[1700]	train-auc:0.945427	val-auc:0.934106
[1800]	train-auc:0.947069	val-auc:0.935407
[1900]	train-auc:0.948504	val-auc:0.936516
[2000]	train-auc:

In [None]:
# Gain-based feature importance (top 50 features) of `model`, the booster left
# over from the cross-validation cell.
fig, ax = plt.subplots(figsize=(12, 18))
xgb.plot_importance(
    model,
    ax=ax,
    height=0.8,
    max_num_features=50,
    importance_type="gain",
)
plt.show()