In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plot
import tensorflow as tf

In [2]:
## data downloaded from https://www.kaggle.com/datasets/jainilcoder/online-payment-fraud-detection

fraud_data = pd.read_csv('data/onlinefraud.csv')

In [3]:
total_columns = len(fraud_data.columns)
column_names = fraud_data.columns


In [4]:
column_names

Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [5]:
total_values = fraud_data.count()

In [6]:
total_values

step              6362620
type              6362620
amount            6362620
nameOrig          6362620
oldbalanceOrg     6362620
newbalanceOrig    6362620
nameDest          6362620
oldbalanceDest    6362620
newbalanceDest    6362620
isFraud           6362620
isFlaggedFraud    6362620
dtype: int64

In [7]:
fraud_data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [8]:
fraud_data.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,CASH_OUT,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [9]:
## checking data types
column_data_types = fraud_data.dtypes
column_data_types

step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object

In [10]:
fraud_data.replace('CASH_OUT', 1, inplace=True)
fraud_data.replace('PAYMENT', 2, inplace=True)
fraud_data.replace('CASH_IN', 3, inplace=True)
fraud_data.replace('TRANSFER', 4, inplace=True)
fraud_data.replace('DEBIT', 5, inplace=True)

In [11]:
## changing object types to categorical

## type: cash_out and 
# fraud_data,type = fraud_data.type.map({"CASH_OUT": 1, "PAYMENT": 2, "CASH_IN": 3, "TRANSFER": 4, "DEBIT": 5})
fraud_data.type = pd.Categorical(fraud_data.type)
fraud_data.type.describe

<bound method NDFrame.describe of 0          2
1          2
2          4
3          1
4          2
          ..
6362615    1
6362616    4
6362617    1
6362618    4
6362619    1
Name: type, Length: 6362620, dtype: category
Categories (5, int64): [1, 2, 3, 4, 5]>

In [12]:
np.array(fraud_data)

array([[1, 2, 9839.64, ..., 0.0, 0, 0],
       [1, 2, 1864.28, ..., 0.0, 0, 0],
       [1, 4, 181.0, ..., 0.0, 1, 0],
       ...,
       [743, 1, 6311409.28, ..., 6379898.11, 1, 0],
       [743, 4, 850002.52, ..., 0.0, 1, 0],
       [743, 1, 850002.52, ..., 7360101.63, 1, 0]], dtype=object)

In [13]:
len(fraud_data.nameOrig.unique())

6353307

In [14]:
fraud_data[fraud_data.nameOrig == fraud_data.nameDest]

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud


In [15]:
## deleting nameorin and namedest

fraud_data.drop(columns=['nameOrig', 'nameDest'], inplace=True)

In [16]:
## deleting isFlaggedFraud
fraud_data.drop(columns=['isFlaggedFraud'], inplace=True)

In [17]:
## X and y
y = fraud_data['isFraud']
X = fraud_data.drop(columns=['isFraud'])

In [18]:
X

Unnamed: 0,step,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest
0,1,2,9839.64,170136.00,160296.36,0.00,0.00
1,1,2,1864.28,21249.00,19384.72,0.00,0.00
2,1,4,181.00,181.00,0.00,0.00,0.00
3,1,1,181.00,181.00,0.00,21182.00,0.00
4,1,2,11668.14,41554.00,29885.86,0.00,0.00
...,...,...,...,...,...,...,...
6362615,743,1,339682.13,339682.13,0.00,0.00,339682.13
6362616,743,4,6311409.28,6311409.28,0.00,0.00,0.00
6362617,743,1,6311409.28,6311409.28,0.00,68488.84,6379898.11
6362618,743,4,850002.52,850002.52,0.00,0.00,0.00


In [19]:
y

0          0
1          0
2          1
3          1
4          0
          ..
6362615    1
6362616    1
6362617    1
6362618    1
6362619    1
Name: isFraud, Length: 6362620, dtype: int64

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [21]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

In [22]:
predict = dt_model.predict(X_test)

In [23]:
dt_model.score(X_test, y_test)

0.9997280994307377

In [25]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)


In [26]:
rf_predict = rf_model.predict(X_test)

In [27]:
rf_model.score(X_test, y_test)

0.9997139543144177

In [28]:
import pickle

In [30]:
with open('rf_model.pkl', 'wb') as file:
    pickle.dump(rf_model, file)

with open('dt_model.pkl', 'wb') as file:
    pickle.dump(dt_model, file)

In [31]:
from sklearn.metrics import confusion_matrix

In [32]:
confusion_matrix(y_test, rf_predict)

array([[1906281,      26],
       [    520,    1959]])

In [34]:
confusion_matrix(y_test, predict)

array([[1906084,     223],
       [    296,    2183]])