# PREDICT FRAUDULENT TRANSACTIONS

In [21]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from IPython.display import FileLink
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
%matplotlib inline

In [22]:
# Locations of the train/test CSV files, relative to the notebook.
TrainDataPath = './BW2017_2/train.csv'
TestDataPath = './BW2017_2/test.csv'

# Read both files into DataFrames in one step.
TrainData, TestData = (
    pd.read_csv(csv_path) for csv_path in (TrainDataPath, TestDataPath)
)

In [23]:
# Overview of the training set: shape, per-column dtypes and the first 3 rows.
print("Training Dataset Shape:", TrainData.shape, "\n", sep="\n")
print("Training Dataset Columns/Features:", TrainData.dtypes, sep="\n")
TrainData.head(3)

Training Dataset Shape:
(348978, 51)


Training Dataset Columns/Features:
transaction_id     object
num_var_1         float64
num_var_2         float64
num_var_3         float64
num_var_4         float64
num_var_5         float64
num_var_6         float64
num_var_7         float64
cat_var_1          object
cat_var_2          object
cat_var_3          object
cat_var_4          object
cat_var_5          object
cat_var_6          object
cat_var_7          object
cat_var_8          object
cat_var_9          object
cat_var_10         object
cat_var_11         object
cat_var_12         object
cat_var_13         object
cat_var_14         object
cat_var_15         object
cat_var_16         object
cat_var_17         object
cat_var_18         object
cat_var_19          int64
cat_var_20          int64
cat_var_21          int64
cat_var_22          int64
cat_var_23          int64
cat_var_24          int64
cat_var_25          int64
cat_var_26          int64
cat_var_27          int64
cat_var_28      

Unnamed: 0,transaction_id,num_var_1,num_var_2,num_var_3,num_var_4,num_var_5,num_var_6,num_var_7,cat_var_1,cat_var_2,...,cat_var_34,cat_var_35,cat_var_36,cat_var_37,cat_var_38,cat_var_39,cat_var_40,cat_var_41,cat_var_42,target
0,id_11,2.302632e-08,0.040182,0.0,1.8e-07,2.302632e-08,2.368421e-08,1.115205e-08,,ce,...,0,0,0,0,0,0,0,0,0,0
1,id_33,7.965789e-06,0.157872,0.0,2.105e-06,2.769737e-07,7.965789e-06,2.433058e-06,da,tn,...,0,0,0,0,0,0,0,0,0,0
2,id_51,7.828947e-08,0.08914,0.0,3.55e-07,4.671053e-08,1.052632e-07,4.276014e-07,gf,ce,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Overview of the test set: shape, per-column dtypes and the first rows.
print("Test Dataset Shape:", TestData.shape, "\n", sep="\n")
print("Test Dataset Columns/Features:", TestData.dtypes, sep="\n")
TestData.head()

Test Dataset Shape:
(523466, 50)


Test Dataset Columns/Features:
transaction_id     object
num_var_1         float64
num_var_2         float64
num_var_3         float64
num_var_4         float64
num_var_5         float64
num_var_6         float64
num_var_7         float64
cat_var_1          object
cat_var_2          object
cat_var_3          object
cat_var_4          object
cat_var_5          object
cat_var_6          object
cat_var_7          object
cat_var_8          object
cat_var_9          object
cat_var_10         object
cat_var_11         object
cat_var_12         object
cat_var_13         object
cat_var_14         object
cat_var_15         object
cat_var_16         object
cat_var_17         object
cat_var_18         object
cat_var_19          int64
cat_var_20          int64
cat_var_21          int64
cat_var_22          int64
cat_var_23          int64
cat_var_24          int64
cat_var_25          int64
cat_var_26          int64
cat_var_27          int64
cat_var_28          int6

Unnamed: 0,transaction_id,num_var_1,num_var_2,num_var_3,num_var_4,num_var_5,num_var_6,num_var_7,cat_var_1,cat_var_2,...,cat_var_33,cat_var_34,cat_var_35,cat_var_36,cat_var_37,cat_var_38,cat_var_39,cat_var_40,cat_var_41,cat_var_42
0,id_1,4.736842e-07,0.162737,0.0,2.105e-06,2.769737e-07,4.828947e-07,1.30001e-07,da,tn,...,0,0,0,0,0,0,0,0,0,0
1,id_6,6.578947e-08,0.086391,0.0,7.5e-07,1.315789e-07,9.868421e-08,3.1863e-08,pu,ce,...,0,0,0,0,0,0,0,0,0,0
2,id_9,1.585526e-07,0.159987,0.0,2.105e-06,2.769737e-07,1.585526e-07,4.715724e-08,da,tn,...,0,0,0,0,0,0,0,0,0,0
3,id_14,8.552632e-08,0.088083,0.0,7.5e-07,8.552632e-08,9.868421e-08,4.14219e-08,pu,ce,...,0,0,0,0,0,0,0,0,0,0
4,id_15,1.144737e-07,0.089563,0.0,3.55e-07,4.671053e-08,4.407895e-08,1.452953e-07,gf,ce,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Count the missing values per column in both datasets.
train_missing = TrainData.isnull().sum()
test_missing = TestData.isnull().sum()

print("Training Dataset Missing Value Columns:", train_missing, "\n", sep="\n")
print("Test Dataset Missing Value Columns:", test_missing, sep="\n")

Training Dataset Missing Value Columns:
transaction_id         0
num_var_1              0
num_var_2              0
num_var_3              0
num_var_4              0
num_var_5              0
num_var_6              0
num_var_7              0
cat_var_1          15782
cat_var_2              0
cat_var_3          43853
cat_var_4              0
cat_var_5              0
cat_var_6              0
cat_var_7              0
cat_var_8         109738
cat_var_9              0
cat_var_10             0
cat_var_11             0
cat_var_12             0
cat_var_13             0
cat_var_14             0
cat_var_15             0
cat_var_16             0
cat_var_17             0
cat_var_18             0
cat_var_19             0
cat_var_20             0
cat_var_21             0
cat_var_22             0
cat_var_23             0
cat_var_24             0
cat_var_25             0
cat_var_26             0
cat_var_27             0
cat_var_28             0
cat_var_29             0
cat_var_30             0
cat_var_31

In [26]:
# Column names that are not model features.
id_column = 'transaction_id'
label_column = 'target'

# Keep the test ids (needed for the submission file) and the training labels
# before removing those columns from the feature frames.
Trans_ids = TestData[id_column]
Y = TrainData[label_column]

TrainData = TrainData.drop([id_column, label_column], axis=1)
TestData = TestData.drop([id_column], axis=1)

In [27]:
# Collect every column whose name contains 'cat_' (the categorical features).
cat_variables = list(filter(lambda name: 'cat_' in name, TrainData.columns))
len(cat_variables)

42

In [28]:
# Training set: find categorical columns that hold a single distinct value.
# Such columns carry no information for the model and are dropped later.
Drop_catVar_TrainData = []
for column in cat_variables:
    unique_count = TrainData[column].nunique()
    if unique_count != 1:
        continue
    print(column, unique_count)
    Drop_catVar_TrainData.append(column)

cat_var_31 1
cat_var_35 1
cat_var_36 1
cat_var_37 1
cat_var_38 1
cat_var_40 1
cat_var_42 1


In [29]:
# Test set: find categorical columns that hold a single distinct value.
Drop_catVar_TestData = []
for column in cat_variables:
    unique_count = TestData[column].nunique()
    if unique_count != 1:
        continue
    print(column, unique_count)
    Drop_catVar_TestData.append(column)

cat_var_38 1
cat_var_41 1
cat_var_42 1


In [30]:
# Remove, from both frames, every categorical feature that is constant in
# either the training set or the test set.
Drop_catVar = list(set(Drop_catVar_TrainData).union(Drop_catVar_TestData))
TrainData = TrainData.drop(Drop_catVar, axis=1)
TestData = TestData.drop(Drop_catVar, axis=1)

for frame in (TrainData, TestData):
    print(frame.shape)

(348978, 41)
(523466, 41)


# Dropping Columns With Missing Values (cat_var_1, cat_var_3, cat_var_8) and Other Excluded Features

In [31]:
# Columns removed from both frames. cat_var_1, cat_var_3 and cat_var_8 contain
# missing values in the training set; the reason for excluding the remaining
# columns is not recorded in this notebook.
columns_to_drop = [
    'cat_var_3', 'cat_var_8',
    'cat_var_1', 'num_var_3',
    'cat_var_7',
    'cat_var_16', 'cat_var_6',
]

TrainData = TrainData.drop(columns_to_drop, axis=1)
TestData = TestData.drop(columns_to_drop, axis=1)

In [32]:
# Rebuild the list of categorical columns now that several have been dropped.
cat_variables = list(filter(lambda name: 'cat_' in name, TrainData.columns))
len(cat_variables)

28

In [33]:
# Re-count the missing values per column after the column drops.
train_missing = TrainData.isnull().sum()
test_missing = TestData.isnull().sum()

print("Training Dataset Missing Value Columns:", train_missing, "\n", sep="\n")
print("Test Dataset Missing Value Columns:", test_missing, sep="\n")

Training Dataset Missing Value Columns:
num_var_1     0
num_var_2     0
num_var_4     0
num_var_5     0
num_var_6     0
num_var_7     0
cat_var_2     0
cat_var_4     0
cat_var_5     0
cat_var_9     0
cat_var_10    0
cat_var_11    0
cat_var_12    0
cat_var_13    0
cat_var_14    0
cat_var_15    0
cat_var_17    0
cat_var_18    0
cat_var_19    0
cat_var_20    0
cat_var_21    0
cat_var_22    0
cat_var_23    0
cat_var_24    0
cat_var_25    0
cat_var_26    0
cat_var_27    0
cat_var_28    0
cat_var_29    0
cat_var_30    0
cat_var_32    0
cat_var_33    0
cat_var_34    0
cat_var_39    0
dtype: int64


Test Dataset Missing Value Columns:
num_var_1     0
num_var_2     0
num_var_4     0
num_var_5     0
num_var_6     0
num_var_7     0
cat_var_2     0
cat_var_4     0
cat_var_5     0
cat_var_9     0
cat_var_10    0
cat_var_11    0
cat_var_12    0
cat_var_13    0
cat_var_14    0
cat_var_15    0
cat_var_17    0
cat_var_18    0
cat_var_19    0
cat_var_20    0
cat_var_21    0
cat_var_22    0
cat_var_23   

In [34]:
# Label-encode every remaining categorical column (in place).
# Each encoder is fitted on the values seen in BOTH frames so that a category
# present in only one of them can still be transformed.
for x in cat_variables:
    # Give missing entries an explicit placeholder category.
    TrainData[x] = TrainData[x].fillna('NaN')
    TestData[x] = TestData[x].fillna('NaN')

    encoder = LabelEncoder()
    # Collect the distinct values with pandas instead of materialising two
    # full Python lists and a set for every column.
    encoder.fit(pd.concat([TrainData[x], TestData[x]]).unique())

    TrainData[x] = encoder.transform(TrainData[x])
    TestData[x] = encoder.transform(TestData[x])

In [35]:
# Display the (rows, columns) of the training features after encoding.
TrainData.shape

(348978, 34)

In [36]:
# Convert the training features to a NumPy array for XGBoost.
# DataFrame.as_matrix() was deprecated and later removed from pandas;
# .values returns the same array and works on old and new versions.
TR_DATA = TrainData.values
TR_DATA.shape

(348978, 34)

# We are using XGBoost Classifier for our Model

In [55]:
# Hyper-parameters of the gradient-boosted tree classifier; the fixed seed
# keeps the fit repeatable.
xgb_params = dict(n_estimators=370, max_depth=5, seed=1729)

eclf = xgb.XGBClassifier(**xgb_params)
eclf.fit(TR_DATA, Y)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=370, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=1729, silent=True, subsample=1)

In [56]:
# ROC AUC on the same rows the model was fitted on, i.e. an in-sample score;
# it is not an estimate of performance on unseen data.
Y_pred = eclf.predict_proba(TR_DATA)
train_scores = Y_pred[:, 1]  # second predict_proba column is used as the score
roc_auc_score(Y, train_scores)

0.77611512354902457

In [57]:
# Convert the test features to a NumPy array and score them.
# DataFrame.as_matrix() was deprecated and later removed from pandas;
# .values returns the same array and works on old and new versions.
Test_DATA = TestData.values
Y_pred_test = eclf.predict_proba(Test_DATA)

In [58]:
# Build the submission: one row per test transaction, with the score taken
# from the second predict_proba column. `columns` fixes the column order.
subm = pd.DataFrame(
    {'transaction_id': Trans_ids, 'target': Y_pred_test[:, 1]},
    columns=['transaction_id', 'target'],
)

filename = 'subm-fraud.csv'
subm.to_csv(filename, index=False)
# FileLink (from IPython.display) renders a download link for the written file.
FileLink(filename)  # leaderboard score - 0.73465