# Introduction

## Data structure

Below is one sample row, followed by an explanation of each column (in order):

1,PAYMENT,1060.31,C429214117,1089.0,28.69,M1591654462,0.0,0.0,0,0

* step - maps a unit of time in the real world. In this case 1 step is 1 hour of time, and there are 744 steps in total (744 hours, i.e. a 31-day simulation).

* type - the transaction type: CASH_IN, CASH_OUT, DEBIT, PAYMENT or TRANSFER.

* amount - amount of the transaction in local currency.

* nameOrig - customer who started the transaction

* oldbalanceOrg - initial balance before the transaction

* newbalanceOrig - new balance after the transaction

* nameDest - customer who is the recipient of the transaction

* oldbalanceDest - initial balance of the recipient before the transaction. Note that there is no information for customers whose names start with M (Merchants).

* newbalanceDest - new balance of the recipient after the transaction. Note that there is no information for customers whose names start with M (Merchants).

* isFraud - Marks the transactions made by the fraudulent agents inside the simulation. In this specific dataset the fraudulent behavior of the agents aims to profit by taking control of customers' accounts and trying to empty the funds by transferring them to another account and then cashing out of the system.

* isFlaggedFraud - The business model aims to control massive transfers from one account to another and flags illegal attempts. An illegal attempt in this dataset is an attempt to transfer more than 200,000 in a single transaction.

# Preparation


In [None]:
import numpy as np 
import sklearn
import pandas as pd
import os
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
import seaborn as sns
%matplotlib inline 
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [None]:
# Load the transaction log; the relative path is resolved against the current working directory.
transactions = pd.read_csv("AIML Dataset.csv")

In [None]:
# Report the (rows, columns) dimensions of the loaded frame.
print(f"Data shape: {transactions.shape}")

In [None]:
# Preview the first rows of the dataset.
transactions.head()

# Data exploration

In [None]:
# Print column dtypes, non-null counts and memory usage.
transactions.info()

In [None]:
# Summary statistics (count, mean, std, quantiles, ...) of the numeric columns.
transactions.describe()

In [None]:
def missing_data(data):
    """Summarise missing values and dtypes for every column of ``data``.

    Returns a transposed table with one column per input column and three
    rows: ``Total`` (number of nulls), ``Percent`` (nulls as a percentage
    of all rows) and ``Types`` (the column dtype as a string).
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    null_share = null_counts / null_mask.count() * 100
    summary = pd.concat(
        [null_counts, null_share],
        axis=1,
        keys=['Total', 'Percent'],
    )
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return summary.T

In [None]:
# Null counts, null percentages and dtypes for every column.
missing_data(transactions)

In [None]:
def unique_values(data):
    """Summarise how many values and how many distinct values each column has.

    Returns a transposed table with one column per input column and two
    rows: ``Total`` (non-null count) and ``Uniques`` (number of distinct
    non-null values).
    """
    summary = data.count().to_frame(name='Total')
    summary['Uniques'] = [data[name].nunique() for name in data.columns]
    return summary.T

In [None]:
# Non-null counts and number of distinct values for every column.
unique_values(transactions)

# Visualization

In [None]:
def plot_count(df, feature, title='', size=2.5):
    """Draw a count plot of ``df[feature]`` with a percentage label on each bar.

    Bars are ordered by descending frequency; each bar is annotated with
    its share of all rows in ``df``.

    Parameters
    ----------
    df : DataFrame containing the column to plot.
    feature : name of the column whose values are counted.
    title : text shown above the plot.
    size : scale factor; the figure is ``3*size`` x ``2*size`` inches, and
        the x tick labels are rotated and shrunk when ``size > 2``.
    """
    f, ax = plt.subplots(1, 1, figsize=(3*size, 2*size))
    total = float(len(df))
    # Pass the column as ``x=``: seaborn >= 0.12 only accepts ``data``
    # positionally, so a positional Series is no longer read as the x
    # variable. The axes is passed explicitly so the bars land on the
    # figure created above.
    sns.countplot(
        x=df[feature],
        order=df[feature].value_counts().index,
        palette='Set3',
        ax=ax,
    )
    plt.title(title)
    if size > 2:
        plt.xticks(rotation=90, size=8)
    for p in ax.patches:
        height = p.get_height()
        # Label sits just above the bar, centred horizontally.
        ax.text(p.get_x() + p.get_width() / 2.,
                height + 3,
                '{:1.4f}%'.format(100 * height / total),
                ha="center")
    plt.show()


In [None]:
# Count plot of the `type` column, each bar labelled with its percentage of all rows.
plot_count(transactions, 'type', 'Distribution of type (count & percent)', size=2.5)

In [None]:
# Count plot of the `isFraud` label, each bar labelled with its percentage of all rows.
plot_count(transactions, 'isFraud', 'Distribution of `isFraud` (count & percent)', size=2.5)

In [None]:
# Count plot of the `isFlaggedFraud` column, each bar labelled with its percentage of all rows.
plot_count(transactions, 'isFlaggedFraud', 'Distribution of `isFlaggedFraud` (count & percent)', size=2.5)

In [None]:
# Distribution of `step` for each `isFraud` class, with outliers hidden.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))
s = sns.boxplot(
    data=transactions,
    x="isFraud",
    y="step",
    hue="isFraud",
    palette="PRGn",
    showfliers=False,
)
plt.show()

In [None]:
# Distribution of `step` for each `isFlaggedFraud` class, with outliers hidden.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))
s = sns.boxplot(
    data=transactions,
    x="isFlaggedFraud",
    y="step",
    hue="isFlaggedFraud",
    palette="PRGn",
    showfliers=False,
)
plt.show()

In [None]:
# `step` per `isFraud` class, split by transaction `type`:
# top panel with outliers, bottom panel without.
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(16, 12))
for panel, with_outliers in ((ax1, True), (ax2, False)):
    s = sns.boxplot(
        ax=panel,
        data=transactions,
        x="isFraud",
        y="step",
        hue="type",
        palette="PRGn",
        showfliers=with_outliers,
    )
plt.show()

Hence we observe that fraud is committed only via 'CASH_OUT' and 'TRANSFER' transactions.

In [None]:
# `step` per `isFlaggedFraud` class, split by transaction `type`:
# top panel with outliers, bottom panel without.
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(16, 12))
for panel, with_outliers in ((ax1, True), (ax2, False)):
    s = sns.boxplot(
        ax=panel,
        data=transactions,
        x="isFlaggedFraud",
        y="step",
        hue="type",
        palette="PRGn",
        showfliers=with_outliers,
    )
plt.show()

In [None]:
# `amount` per `isFraud` class, split by transaction `type`:
# top panel with outliers, bottom panel without.
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(16, 12))
for panel, with_outliers in ((ax1, True), (ax2, False)):
    s = sns.boxplot(
        ax=panel,
        data=transactions,
        x="isFraud",
        y="amount",
        hue="type",
        palette="PRGn",
        showfliers=with_outliers,
    )
plt.show()

In [None]:
# `oldbalanceOrg` per `isFraud` class, split by transaction `type`:
# top panel with outliers, bottom panel without.
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(16, 12))
for panel, with_outliers in ((ax1, True), (ax2, False)):
    s = sns.boxplot(
        ax=panel,
        data=transactions,
        x="isFraud",
        y="oldbalanceOrg",
        hue="type",
        palette="PRGn",
        showfliers=with_outliers,
    )
plt.show()

In [None]:
# `newbalanceOrig` per `isFraud` class, split by transaction `type`:
# top panel with outliers, bottom panel without.
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(16, 12))
for panel, with_outliers in ((ax1, True), (ax2, False)):
    s = sns.boxplot(
        ax=panel,
        data=transactions,
        x="isFraud",
        y="newbalanceOrig",
        hue="type",
        palette="PRGn",
        showfliers=with_outliers,
    )
plt.show()

In [None]:
# `oldbalanceDest` per `isFraud` class, split by transaction `type`:
# top panel with outliers, bottom panel without.
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(16, 12))
for panel, with_outliers in ((ax1, True), (ax2, False)):
    s = sns.boxplot(
        ax=panel,
        data=transactions,
        x="isFraud",
        y="oldbalanceDest",
        hue="type",
        palette="PRGn",
        showfliers=with_outliers,
    )
plt.show()

In [None]:
# `newbalanceDest` per `isFraud` class, split by transaction `type`:
# top panel with outliers, bottom panel without.
fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(16, 12))
for panel, with_outliers in ((ax1, True), (ax2, False)):
    s = sns.boxplot(
        ax=panel,
        data=transactions,
        x="isFraud",
        y="newbalanceDest",
        hue="type",
        palette="PRGn",
        showfliers=with_outliers,
    )
plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# The string columns are excluded explicitly: pandas >= 2.0 no longer drops
# them silently in `DataFrame.corr()` and raises instead.
sns.heatmap(transactions.select_dtypes(include='number').corr(), cmap='jet')

# Model

From the data, we will use 70% for training and 30% for validation.

The CatBoostClassifier (CatBoost) algorithm is used.


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

Prepare the features (X) and labels (y).

In [None]:
# Features: every column except the two fraud indicator columns.
X = transactions.drop(columns=['isFraud', 'isFlaggedFraud'])
# Label: the `isFraud` column.
y = transactions['isFraud']

Partition data

In [None]:
# Keep 70% of the rows for training and hold out the rest for validation; the fixed seed makes the split reproducible.
# NOTE(review): the split is not stratified on `y` — consider `stratify=y` if fraud cases are rare; confirm.
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.7, random_state=1234)

Specify the categorical features. CatBoost handles categorical features natively - no special preprocessing (Label encoding, One-Hot encoding, Binary encoding) is needed.

In [None]:
# Positional indices of every column whose dtype is not float; these are
# handed to CatBoost as the categorical features.
# The builtin `float` replaces `np.float`, a deprecated alias of the same
# type that was removed in NumPy 1.24 (the selection is unchanged).
# NOTE(review): integer columns (presumably `step`) are selected by this
# rule as well — confirm that treating them as categorical is intended.
categorical_features_indices = np.where(X.dtypes != float)[0]

In [None]:
# Hyper-parameters for the CatBoost classifier, collected in one place.
catboost_params = {
    'iterations': 500,
    'learning_rate': 0.006,  # last: 0.002
    'depth': 12,
    'eval_metric': 'AUC',
    'random_seed': 42,
    'bagging_temperature': 0.2,
    'metric_period': 25,
    # Overfitting-detector settings.
    'od_type': 'Iter',
    'od_wait': 25,
}
clf = CatBoostClassifier(**catboost_params)


In [None]:
# Train on the training split while evaluating on the validation split;
# `plot=True` requests CatBoost's training plot.
clf.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
    plot=True,
)

In [None]:
# Score the fitted model on the full dataset.
# NOTE(review): X/y still contain the rows used for training, so this is not a held-out estimate — confirm this is intended.
score = clf.score(X, y)
print(score)

Prediction for validation set.

In [None]:
# Predicted class labels for the validation rows.
preds = clf.predict(X_validation)

Confusion matrix.

It shows the counts of true positives, false positives, true negatives and false negatives on the validation set.

In [None]:
# Cross-tabulate actual vs. predicted labels on the validation set.
cm = pd.crosstab(y_validation.values, preds, rownames=['Actual'], colnames=['Predicted'])
# Force the full 2x2 layout so the hard-coded tick labels below stay correct
# even if one class never appears among the actual or predicted labels.
cm = cm.reindex(index=[0, 1], columns=[0, 1], fill_value=0)
fig, (ax1) = plt.subplots(ncols=1, figsize=(5,5))
sns.heatmap(cm,
            xticklabels=['Not Fraud', 'Fraud'],
            yticklabels=['Not Fraud', 'Fraud'],
            annot=True,
            fmt='d',  # show exact integer counts instead of scientific notation
            ax=ax1,
            linewidths=.2,linecolor="Darkblue", cmap="Blues")
plt.title('Confusion Matrix', fontsize=14)
plt.show()

Validation ROC-AUC:

ROC Curves summarize the trade-off between the true positive rate and false positive rate for a predictive model using different probability thresholds.

AUC is the area under the ROC curve. The higher the AUC, the better the model is at distinguishing between the positive and negative classes.

In [None]:
# ROC-AUC is a ranking metric, so it is computed from the predicted
# probability of the positive class (column 1 of `predict_proba`) rather
# than from hard 0/1 labels, which would reduce the curve to a single
# threshold.
fraud_probability = clf.predict_proba(X_validation)[:, 1]
print(f"ROC-AUC score: {roc_auc_score(y_validation.values, fraud_probability)}")

In [None]:
# Accuracy of the hard predictions on the validation set, printed as a percentage.
print(sklearn.metrics.accuracy_score(y_validation.values, preds)*100,"%")

In [None]:
# Balanced accuracy (average of the per-class recall) on the validation set, printed as a percentage.
print(sklearn.metrics.balanced_accuracy_score(y_validation.values, preds)*100, "%")

In [None]:
import joblib
 
# Persist the fitted classifier to disk with joblib (pickle-based serialization).
# NOTE(review): only load this file from a trusted location — unpickling can execute arbitrary code.
joblib.dump(clf, 'fraudModelv1.pkl')