**Loading the dataset using Kaggle API**

**This code of Kaggle API can be run only on colab, so if you dont want to do it on colab, download the dataset**

In [18]:
# !pip install kaggle
# ! mkdir ~/.kaggle
# ! cp kaggle.json ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json
# !kaggle datasets download -d rupakroy/online-payments-fraud-detection-dataset

**Unzipping the dataset**

In [19]:
# !unzip online-payments-fraud-detection-dataset

**Importing modules**

In [20]:
#Import libraries:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import  metrics   #Additional scklearn functions
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn import model_selection 
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import roc_curve
from imblearn.under_sampling import RandomUnderSampler
import plotly.express as px
import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline

**Reading the dataset**

In [21]:
df = pd.read_csv("PS_20174392719_1491204439457_log.csv")

**Displaying top 5 values**

In [22]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


**Total number of samples in dataframe**

In [23]:
print("Total number of samples : ",df.shape[0])

Total number of samples :  6362620


**Number of duplicates and Number of null values**

In [24]:
print("Total number of duplicate samples : ",df[df.duplicated()].shape[0])
print("Total number of null samples : ",df.isna().sum().sum())

Total number of duplicate samples :  0
Total number of null samples :  0


**Fraud vs Non Fraud cases**

In [25]:
value_counts = df.isFraud.value_counts()

In [26]:
fig = px.bar(df, y=[value_counts[0],value_counts[1]],x=['Non Fraud','Fraud'], text_auto='.2s', title="Non Fraud vs Fraud")
fig.show()

**coorelation table**

In [27]:
fig = px.imshow(df.corr(),text_auto=True)
fig.show()

**Distribution of type of transaction used**

In [28]:
type_transaction = df["type"].value_counts()
transaction = type_transaction.index
quantity = type_transaction.values

# plotting pie chart
fig = px.pie(df,
            values = quantity,
            names = transaction, 
            hole = 0.2,
            title = "Distribution of Transaction Type")
fig.show()

In [29]:
corr = df.corr()
corr["isFraud"].sort_values(ascending=False)

isFraud           1.000000
amount            0.076688
isFlaggedFraud    0.044109
step              0.031578
oldbalanceOrg     0.010154
newbalanceDest    0.000535
oldbalanceDest   -0.005885
newbalanceOrig   -0.008148
Name: isFraud, dtype: float64

**Label encoding the object data**

In [30]:
df["type"] = df["type"].map({"CASH_OUT": 1, "PAYMENT": 2, "CASH_IN": 3, "TRANSFER": 4,"DEBIT": 5})

**Undersampling**

Since the dataset we are using is highly imbalanced dataset, it is important to make the dataset balanced for obtaining good accuracy and avoid overfitting.

we have 2 methods, first is SMOTE oversampling which is used to make the minority samples of the dataset as equivalent to majority class, but is not a good choice as it might take lot of computation time to train a model.

Second, feasible and easy to train method is to undersample the majority data.

In [31]:
# define undersample strategy
undersample = RandomUnderSampler(sampling_strategy=0.7)
X = df.drop(['isFraud'],axis=1)
y = df.isFraud
X_under, y_under = undersample.fit_resample(X, y)

In [32]:
X_under['isFraud'] = y_under.values
df_under = X_under.copy()
df_under = df_under.sample(frac=1)

In [33]:
print("Total samples in the undersampled dataset ",df_under.shape[0])
print(df_under.isFraud.value_counts())

Total samples in the undersampled dataset  19945
0    11732
1     8213
Name: isFraud, dtype: int64


**Using stratified cross validation for model training to avoid chances of overfitting**

In [34]:
class config:
    NUM_FOLDS = 10
    SEED = 541
    TARGET = 'isFraud'
    JUNK = -1

def create_folds(data):    
    data["kfold"] = -1

    kf = model_selection.StratifiedKFold(n_splits=config.NUM_FOLDS, shuffle=True, random_state=config.SEED)
    for f, (t_, v_) in enumerate(kf.split(X=data, y=data[config.TARGET].values)):
        data.loc[v_, 'kfold'] = f    
    
    return data

In [35]:
def retrieve_data(train_folds,fold):


  train_data = train_folds[train_folds['kfold'] != fold].reset_index(drop=True)
  valid_data = train_folds[train_folds['kfold'] == fold].reset_index(drop=True)

  x_train = train_data.drop(['kfold', config.TARGET], axis=1)
  cols = x_train.columns
  x_train = x_train.values
  y_train = train_data[config.TARGET].values
      
  x_valid = valid_data.drop(['kfold', config.TARGET], axis=1).values
  y_valid = valid_data[config.TARGET].values


  return x_train,x_valid,y_train,y_valid

In [36]:
train1 = df_under[["type", "amount",	"oldbalanceOrg",	"newbalanceOrig",	"oldbalanceDest"	,"newbalanceDest",	"isFraud"]]
train_folds = create_folds(train1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [37]:
max = 0
xgb_dummy = None
for fold in range(config.NUM_FOLDS):
  
  x_train,x_valid,y_train,y_valid = retrieve_data(train_folds,fold)
  xgb = XGBClassifier(booster =  'gbtree',
                        n_jobs = 4,
                        objective = 'binary:logistic',
                        silent = 0,
                      subsample = 1,
                      learning_rate = 0.01,
                      max_depth = 4,
                      min_child_weight = 2,
                      colsample_bytree = 1,
                      n_estimators= 200,
                       reg_lambda= 1)

  xgb.fit(x_train, y_train)


  y_pred_train = xgb.predict(x_train)  #np.where(yhat_train[:,1] >= best_thresh,1,0)
  y_pred_test = xgb.predict(x_valid) #np.where(yhat_test[:,1] >= best_thresh,1,0)
  
  if(accuracy_score(y_valid,y_pred_test) > max):
    max = accuracy_score(y_valid,y_pred_test)
    xgb_dummy = xgb


  print("Training accuracy : ",accuracy_score(y_train,y_pred_train))
  print("Testing accuracy : ",accuracy_score(y_valid,y_pred_test))
  print("F1_Score : ",f1_score(y_valid, y_pred_test))
  print("Precision score : ",precision_score(y_valid, y_pred_test))
  print("Recall score : ",recall_score(y_valid, y_pred_test))
  print(classification_report(y_valid, y_pred_test))
  print(confusion_matrix(y_valid, y_pred_test))

  print("\n\n-----------------------------------------------------------------------",end="\n\n")

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Training accuracy :  0.9862952646239554
Testing accuracy :  0.9844611528822055
F1_Score :  0.9806128830519075
Precision score :  0.964329643296433
Recall score :  0.9974554707379135
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1209
           1       0.96      1.00      0.98       786

    accuracy                           0.98      1995
   macro avg       0.98      0.99      0.98      1995
weighted avg       0.98      0.98      0.98      1995

[[1180   29]
 [   2  784]]


-----------------------------------------------------------------------

Parameters: { "silent" } might not be used.

  This could be a false alarm, 

In [38]:
xgb_dummy.save_model("model.h5")