In [1]:
import pandas as pd
import numpy as np
import os
from bokeh.io import output_notebook, show
output_notebook()
import bokeh as bh
from sklearn.preprocessing import LabelEncoder
import gc
from glob import glob
import re
from math import ceil
import xgboost as xgb
from sklearn.model_selection import GroupKFold,KFold,StratifiedKFold,train_test_split
import random
import operator
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
import fastai.tabular as ft
pd.options.display.max_columns = 1000

In [2]:
# Read both training tables, then left-join the identity columns onto the
# transactions by TransactionID (how='left' keeps every transaction row).
train_trans = pd.read_csv('train_transaction.csv')
train_id = pd.read_csv('train_identity.csv')
train_df = pd.merge(train_trans, train_id, how='left', on='TransactionID')

In [3]:
# Same join for the test split: transactions on the left, identity columns
# attached by TransactionID.
test_trans = pd.read_csv('test_transaction.csv')
test_id = pd.read_csv('test_identity.csv')
test_df = pd.merge(test_trans, test_id, how='left', on='TransactionID')

In [4]:
# The merged frames are all that is needed from here on; drop the raw tables
# and ask the garbage collector to reclaim the memory.
del train_id, train_trans
del test_id, test_trans
gc.collect()

71

In [5]:
# Distinct values of the P_emaildomain column in the training data.
set(train_df['P_emaildomain'].values)

{'aim.com',
 'anonymous.com',
 'aol.com',
 'att.net',
 'bellsouth.net',
 'cableone.net',
 'centurylink.net',
 'cfl.rr.com',
 'charter.net',
 'comcast.net',
 'cox.net',
 'earthlink.net',
 'embarqmail.com',
 'frontier.com',
 'frontiernet.net',
 'gmail',
 'gmail.com',
 'gmx.de',
 'hotmail.co.uk',
 'hotmail.com',
 'hotmail.de',
 'hotmail.es',
 'hotmail.fr',
 'icloud.com',
 'juno.com',
 'live.com',
 'live.com.mx',
 'live.fr',
 'mac.com',
 'mail.com',
 'me.com',
 'msn.com',
 nan,
 'netzero.com',
 'netzero.net',
 'optonline.net',
 'outlook.com',
 'outlook.es',
 'prodigy.net.mx',
 'protonmail.com',
 'ptd.net',
 'q.com',
 'roadrunner.com',
 'rocketmail.com',
 'sbcglobal.net',
 'sc.rr.com',
 'servicios-ta.com',
 'suddenlink.net',
 'twc.com',
 'verizon.net',
 'web.de',
 'windstream.net',
 'yahoo.co.jp',
 'yahoo.co.uk',
 'yahoo.com',
 'yahoo.com.mx',
 'yahoo.de',
 'yahoo.es',
 'yahoo.fr',
 'ymail.com'}

In [6]:
# emails = {'yahoo':'Yahoo', 'ymail':'Yahoo', 'frontier':'Yahoo', 'rocketmail':'Yahoo',
# 'hotmail':'Microsoft', 'outlook':'Microsoft','live':'Microsoft', 'msn':'Microsoft',

# 'icloud':'Apple','mac':'Apple','me':'Apple',

# prodigy / att / sbcglobal-> AT&T

# centurylink / embarqmail / q -> Centurylink

# aim / aol -> AOL

# twc / charter -> Spectrum}

In [7]:
# #https://www.kaggle.com/c/ieee-fraud-detection/discussion/100499#latest_df-579654
# for c in ['P_emaildomain', 'R_emaildomain']:
#     train_df[c + '_bin'] = train_df[c].map(emails)
#     test_df[c + '_bin'] = test_df[c].map(emails)
    
#     train_df[c + '_suffix'] = train_df[c].map(lambda x: str(x).split('.')[-1])
#     test_df[c + '_suffix'] = test_df[c].map(lambda x: str(x).split('.')[-1])
    
#     train_df[c + '_suffix'] = train_df[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')
#     test_df[c + '_suffix'] = test_df[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')

In [8]:
# Candidate feature columns: everything except the target and the raw time delta.
cols = [name for name in train_df.columns if name not in ('isFraud', 'TransactionDT')]

In [9]:
def missing_data(data):
    """Summarise missing values per column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per column of `data` and three rows:
        'Total' (number of nulls), 'Percent' (nulls as a percentage of the
        row count) and 'Types' (the column dtype as a string).
    """
    null_mask = data.isnull()
    null_total = null_mask.sum()
    null_percent = null_total / null_mask.count() * 100
    summary = pd.concat([null_total, null_percent], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    # Transpose so each original column can be looked up as summary[column][row].
    return summary.transpose()

In [10]:
# Missing-value summaries for both splits, used to pick columns to drop.
train_mv, test_mv = map(missing_data, (train_df, test_df))

In [11]:
# Spot-check one column's missing-value summary in the test split.
test_mv.loc[:, 'dist2']

Total       470255
Percent     92.809
Types      float64
Name: dist2, dtype: object

In [12]:
# Columns that are more than 90% missing in BOTH train and test are excluded.
col_exp_lst = [
    name for name in cols
    if train_mv.loc['Percent', name] > 90 and test_mv.loc['Percent', name] > 90
]

In [13]:
# Rebuild `cols` without the mostly-missing columns. Filtering instead of
# calling list.remove keeps this cell safe to re-run: remove() raises
# ValueError once the names are already gone.
cols = [name for name in cols if name not in col_exp_lst]

In [14]:
# Fractional part of the amount, scaled by 1000 and truncated to an integer.
# NOTE(review): astype(int) truncates rather than rounds, so floating-point
# error may move a value down by one — confirm truncation is intended.
for frame in (train_df, test_df):
    amount = frame['TransactionAmt']
    frame['TransactionAmt_decimal'] = ((amount - amount.astype(int)) * 1000).astype(int)

# Count encoding for card1 feature.
# Explained in this kernel: https://www.kaggle.com/nroman/eda-for-cis-fraud-detection
# The combined train+test counts are computed once and reused for both frames.
card1_counts = pd.concat([train_df['card1'], test_df['card1']], ignore_index=True).value_counts(dropna=False)
train_df['card1_count_full'] = train_df['card1'].map(card1_counts)
test_df['card1_count_full'] = test_df['card1'].map(card1_counts)

# https://www.kaggle.com/fchmiel/day-and-time-powerful-predictive-feature
# TransactionDT is divided by 3600*24 (seconds per day) and 3600 (seconds per hour).
for frame in (train_df, test_df):
    frame['Transaction_day_of_week'] = np.floor((frame['TransactionDT'] / (3600 * 24) - 1) % 7)
    frame['Transaction_hour'] = np.floor(frame['TransactionDT'] / 3600) % 24

# Some arbitrary features interaction: each name is '<left>__<right>'; the two
# source columns are joined as strings and the result is label-encoded with an
# encoder fitted on train and test values together.
for feature in ['id_02__id_20', 'id_02__D8', 'D11__DeviceInfo', 'DeviceInfo__P_emaildomain', 'P_emaildomain__C2',
                'card2__dist1', 'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1']:

    f1, f2 = feature.split('__')
    train_df[feature] = train_df[f1].astype(str) + '_' + train_df[f2].astype(str)
    test_df[feature] = test_df[f1].astype(str) + '_' + test_df[f2].astype(str)

    train_values = list(train_df[feature].astype(str).values)
    test_values = list(test_df[feature].astype(str).values)
    le = LabelEncoder()
    le.fit(train_values + test_values)
    train_df[feature] = le.transform(train_values)
    test_df[feature] = le.transform(test_values)

for feature in ['id_34', 'id_36']:
    if feature in cols:
        # Count encoded for both train and test (one shared count table).
        full_counts = pd.concat([train_df[feature], test_df[feature]], ignore_index=True).value_counts(dropna=False)
        train_df[feature + '_count_full'] = train_df[feature].map(full_counts)
        test_df[feature + '_count_full'] = test_df[feature].map(full_counts)

for feature in ['id_01', 'id_31', 'id_33', 'id_35', 'id_36']:
    if feature in cols:
        # Count encoded separately for train and test
        train_df[feature + '_count_dist'] = train_df[feature].map(train_df[feature].value_counts(dropna=False))
        test_df[feature + '_count_dist'] = test_df[feature].map(test_df[feature].value_counts(dropna=False))


In [15]:
# The target and the raw time delta are never model inputs. Append them to the
# exclusion list only if absent, so re-running this cell neither duplicates
# entries nor fails.
for name in ('isFraud', 'TransactionDT'):
    if name not in col_exp_lst:
        col_exp_lst.append(name)

# Model features: every current column of train_df that is not excluded
# (this includes the engineered features added earlier).
cols_for_model = [name for name in train_df.columns if name not in col_exp_lst]

# Show what was left out.
for name in col_exp_lst:
    print(name)

dist2
id_07
id_08
id_21
id_22
id_23
id_24
id_25
id_26
id_27
isFraud
TransactionDT


In [16]:
# Label-encode every object-dtype model column, fitting each encoder on the
# train and test values together so both splits share one integer mapping.
object_cols = [name for name in cols_for_model if train_df[name].dtype == 'object']
for col in object_cols:
    print("-".join([col, str(train_df[col].dtype)]))
    train_values = list(train_df[col].values)
    test_values = list(test_df[col].values)
    # Keep the plain-list concatenation: that is exactly what the encoder is fed.
    encoder = preprocessing.LabelEncoder()
    encoder.fit(train_values + test_values)
    train_df[col] = encoder.transform(train_values)
    test_df[col] = encoder.transform(test_values)

ProductCD-object
card4-object
card6-object
P_emaildomain-object
R_emaildomain-object
M1-object
M2-object
M3-object
M4-object
M5-object
M6-object
M7-object
M8-object
M9-object
id_12-object
id_15-object
id_16-object
id_28-object
id_29-object
id_30-object
id_31-object
id_33-object
id_34-object
id_35-object
id_36-object
id_37-object
id_38-object
DeviceType-object
DeviceInfo-object


In [17]:
# build_df = train_df[train_df['TransactionDT'] < 13800000].reset_index()
# val_df = train_df[train_df['TransactionDT'] >= 13800000].reset_index()

In [18]:
class MetricsMeter():
    """Binary-classification metrics for scores thresholded at a cut-off.

    Parameters
    ----------
    y_true : sequence of 0/1 labels
    y_pred : sequence of predicted scores
    threshold : scores strictly greater than this are labelled 1, else 0
    """

    def __init__(self, y_true, y_pred, threshold):
        self.y_true = y_true
        self.y_pred = y_pred
        self.thresh = threshold

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator / denominator, or NaN when the denominator is zero."""
        return numerator / denominator if denominator else float('nan')

    def fit(self):
        """Compute the metrics and return them as a dict.

        Keys: 'auc' (on the raw scores), 'f1' (macro-averaged),
        'sensitivity/recall', 'precision', 'accuracy', 'specificity'
        (all on the thresholded labels).

        The stored scores are left untouched, so calling `fit` repeatedly
        gives the same result. Previously `self.y_pred` was overwritten with
        the thresholded labels and a second call computed AUC on 0/1 values.
        """
        metrics = {}
        metrics['auc'] = roc_auc_score(self.y_true, self.y_pred)
        y_label = [1 if score > self.thresh else 0 for score in self.y_pred]
        metrics['f1'] = f1_score(self.y_true, y_label, average='macro')
        # labels=[0, 1] pins the matrix to 2x2 so the four-way unpack is always valid.
        tn, fp, fn, tp = confusion_matrix(self.y_true, y_label, labels=[0, 1]).ravel()
        metrics['sensitivity/recall'] = self._ratio(tp, tp + fn)
        metrics['precision'] = self._ratio(tp, tp + fp)
        metrics["accuracy"] = self._ratio(tp + tn, tp + fp + fn + tn)
        metrics['specificity'] = self._ratio(tn, tn + fp)
        return metrics

In [19]:
def RunXGB(x_build,x_val,y_build,y_val,dtest,seed):
    """Train one early-stopped XGBoost classifier and score validation and test data.

    Parameters
    ----------
    x_build, y_build : features and labels used for fitting
    x_val, y_val     : features and labels used for early stopping and scoring
    dtest            : already-built xgb.DMatrix of the test features
    seed             : value passed as the booster's 'seed' parameter

    Returns
    -------
    (booster, validation predictions, test predictions)
    """
    dbuild = xgb.DMatrix(x_build,y_build)
    dval = xgb.DMatrix(x_val,y_val)
    # With several eval sets, the training log reports that the last one
    # ('val') is used for early stopping.
    watchlist = [(dbuild, 'train'), (dval, 'val')]
    # 'verbose_eval' is an argument of xgb.train (passed below), not a booster
    # parameter, so it is no longer included here.
    params = {'objective': 'binary:logistic',
              'booster': 'gbtree',
              'eval_metric': 'auc',
              'nthread': 64,
              'max_depth': 6,
              #'learning_rate': 0.009
              'subsample': 0.8,
              'min_child_weight': 1,
              "colsample_bytree": 0.9,
              'eta': 0.08,
              'silent':1,
              'seed': seed
              }
    clf_xgb = xgb.train(params, dbuild, num_boost_round=1000, verbose_eval=50, early_stopping_rounds=30, evals=watchlist)
    # best_iteration is 0-based, so using it as ntree_limit drops the best
    # round's tree (and a value of 0 means "use every tree"). best_ntree_limit
    # is the matching tree count.
    best_limit = clf_xgb.best_ntree_limit
    pred_val = clf_xgb.predict(dval, ntree_limit=best_limit)
    pred_t = clf_xgb.predict(dtest, ntree_limit=best_limit)
    return(clf_xgb,pred_val,pred_t)

In [20]:
# model1,pred_v1,pred_t1 = RunXGB(build_df[cols].iloc[:],val_df[cols].iloc[:],
# build_df['isFraud'].iloc[:],val_df['isFraud'].iloc[:]),seed = 9999)

In [21]:
# Extract everything the training loop needs, so the big merged frames can be
# released afterwards.
train = train_df.reset_index(drop=True)
x_train, y_train = train[cols_for_model], train['isFraud']
# Test features as a DMatrix, plus the ids that go into the submission.
dtest = xgb.DMatrix(test_df[cols_for_model])
td = test_df['TransactionID'].reset_index(drop=True)

In [22]:
# `train`, `dtest` and `td` were extracted in the previous cell; drop the
# merged frame names and trigger a collection.
del train_df
del test_df
gc.collect()

173

In [None]:
# K-fold training with seed bagging: inside every fold, one model is trained
# per seed and their predictions are averaged.
n_splits = 5
seeds = (10999973, 99, 90351)
cv_scores = []                           # per-fold validation AUC of the seed-averaged prediction
pred_test_f = 0                          # test prediction averaged over folds
pred_train = np.zeros(train.shape[0])    # out-of-fold predictions for the training rows
kf = KFold(n_splits=n_splits, shuffle=True, random_state=99)
for build_index, val_index in kf.split(x_train, y_train):
    x_build, y_build = x_train.iloc[build_index], y_train.iloc[build_index]
    x_val, y_val = x_train.iloc[val_index], y_train.iloc[val_index]

    pred_val = 0
    pred_test = 0
    n_models = float(len(seeds))
    for seed in seeds:
        model, pred_v, pred_t = RunXGB(x_build, x_val, y_build, y_val, dtest, seed=seed)
        pred_val += pred_v
        pred_test += pred_t
    pred_val /= n_models
    pred_test /= n_models

    pred_train[val_index] = pred_val
    cv_scores.append(roc_auc_score(y_val, pred_val))

    # Range checks on the running predictions (rows from folds not yet
    # processed are still 0 in pred_train).
    print(min(pred_train))
    print(max(pred_train))
    pred_test_f += pred_test / n_splits
    print(min(pred_test_f))
    print(max(pred_test_f))

  if getattr(data, 'base', None) is not None and \


[0]	train-auc:0.790447	val-auc:0.792069
Multiple eval metrics have been passed: 'val-auc' will be used for early stopping.

Will train until val-auc hasn't improved in 30 rounds.
[50]	train-auc:0.902252	val-auc:0.895724
[100]	train-auc:0.929282	val-auc:0.919691
[150]	train-auc:0.941395	val-auc:0.929623
[200]	train-auc:0.949652	val-auc:0.935695
[250]	train-auc:0.955857	val-auc:0.940212
[300]	train-auc:0.961473	val-auc:0.944142
[350]	train-auc:0.965689	val-auc:0.94728
[400]	train-auc:0.969462	val-auc:0.949776
[450]	train-auc:0.973024	val-auc:0.952158
[500]	train-auc:0.976025	val-auc:0.954421
[550]	train-auc:0.978245	val-auc:0.955998
[600]	train-auc:0.980407	val-auc:0.957334
[650]	train-auc:0.98212	val-auc:0.95852
[700]	train-auc:0.983903	val-auc:0.959865
[750]	train-auc:0.985517	val-auc:0.961025
[800]	train-auc:0.987012	val-auc:0.962098
[850]	train-auc:0.988224	val-auc:0.962862
[900]	train-auc:0.989501	val-auc:0.963831
[950]	train-auc:0.990582	val-auc:0.964865
[999]	train-auc:0.991461	va

In [None]:
# Use the fold-averaged test predictions. `pred_test` only holds the
# seed-average of the LAST fold, which would discard the other folds' models.
pred_df = pd.DataFrame(pred_test_f, columns=['isFraud'])

In [None]:
# Put the transaction ids and the predictions side by side (aligned on a
# fresh 0..n-1 index) to form the submission frame.
submission_parts = [td.reset_index(drop=True), pred_df]
fnl = pd.concat(submission_parts, axis=1)

In [None]:
# Preview the first ten submission rows.
fnl.iloc[:10]

In [None]:
# Write the submission file without the index column.
fnl.to_csv(path_or_buf="fraud_preds_4.csv", index=False)

In [None]:
# Write a second copy of the submission and, when running inside Google Colab,
# trigger a browser download of it. The import is guarded so the notebook
# still runs end-to-end in environments where google.colab is not installed.
fnl.to_csv('fnl_part4.csv',index = False)

try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('fnl_part4.csv')