In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch as tc
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score

In [2]:
X_train = pd.read_csv("Data/Sample_Data/X_train.csv")
y_train = pd.read_csv("Data/Sample_Data/y_train.csv")

X_test = pd.read_csv("Data/Sample_Data/X_test.csv")
y_test = pd.read_csv("Data/Sample_Data/y_test.csv")

In [4]:
y_train = y_train['is_fraud']
y_test = y_test['is_fraud']

In [6]:
X_train = X_train.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis = 1)
X_test = X_test.drop(['Unnamed: 0.1', 'Unnamed: 0'], axis = 1)

In [7]:
X_train.head()

Unnamed: 0,cc_num,merchant,category,amt,gender,state,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,age,trans_year,trans_month,trans_hour
0,4642894980163,187,6,-0.271845,1,12,51632,0.43569,-0.349709,-0.269998,0.589711,0.452309,-0.297518,33,2019,12,21
1,4809701904914,283,1,-0.195807,0,1,36869,-1.205172,0.374061,-0.096307,-0.751233,-1.0918,0.370697,39,2019,6,14
2,6011542681743618,482,7,0.56494,0,45,22810,0.053445,0.832086,-0.291305,1.19559,-0.135063,0.769216,51,2020,3,22
3,3546674063249004,464,3,-0.062695,1,43,76008,-1.150009,-0.536196,-0.249117,-1.223921,-1.109899,-0.572251,39,2019,4,8
4,346243940647414,41,6,-0.252196,1,35,43076,0.267894,0.568297,-0.264189,-1.132401,0.416799,0.522161,40,2019,4,22


In [8]:
len(X_train.columns)

17

In [9]:
X_train.select_dtypes(include=['number'])

Unnamed: 0,cc_num,merchant,category,amt,gender,state,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,age,trans_year,trans_month,trans_hour
0,4642894980163,187,6,-0.271845,1,12,51632,0.435690,-0.349709,-0.269998,0.589711,0.452309,-0.297518,33,2019,12,21
1,4809701904914,283,1,-0.195807,0,1,36869,-1.205172,0.374061,-0.096307,-0.751233,-1.091800,0.370697,39,2019,6,14
2,6011542681743618,482,7,0.564940,0,45,22810,0.053445,0.832086,-0.291305,1.195590,-0.135063,0.769216,51,2020,3,22
3,3546674063249004,464,3,-0.062695,1,43,76008,-1.150009,-0.536196,-0.249117,-1.223921,-1.109899,-0.572251,39,2019,4,8
4,346243940647414,41,6,-0.252196,1,35,43076,0.267894,0.568297,-0.264189,-1.132401,0.416799,0.522161,40,2019,4,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155596,3577578023716568,510,11,-0.341520,0,35,44412,0.503541,0.667475,-0.285407,-1.456277,0.413942,0.653028,40,2019,2,19
155597,30357372465631,643,7,0.642662,1,48,53926,1.011500,0.075189,-0.288772,1.114976,0.964387,0.118050,56,2020,3,23
155598,2610529083834453,106,0,0.027689,0,42,37138,-0.452346,0.262709,-0.220673,-0.830949,-0.364034,0.333486,29,2019,6,17
155599,3524574586339330,464,3,0.073349,0,9,32960,-2.148353,0.713946,0.055682,0.485134,-2.307775,0.754113,37,2019,12,8


In [10]:
class TransactionDataset(Dataset):
    def __init__(self, data, labels):
        self.data = tc.tensor(data.values, dtype=tc.float32)
        self.labels = tc.tensor(labels.values, dtype=tc.float32)
        self.length = len(data)
        
    def __getitem__(self, index):
        return self.data[index], self.labels[index]

    def __len__(self):
        return self.length

In [11]:
train_dataset = TransactionDataset(X_train, y_train)
test_dataset = TransactionDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [21]:
class MLPBinaryClassifier(nn.Module):
    def __init__(self, in_dim=4, hide_dim=64):
        super(MLPBinaryClassifier, self).__init__()
        self._features = in_dim
        self.weights_hide = tc.nn.Parameter(tc.randn((in_dim, hide_dim)))
        self.bias_hide = tc.nn.Parameter(tc.zeros((hide_dim, ), dtype=tc.float32) + 0.1)
        self.weights_clf = tc.nn.Parameter(tc.randn((hide_dim, 1)))
        self.bias_clf = tc.nn.Parameter(tc.zeros((1, ), dtype=tc.float32) + 0.1)

    def forward(self, x):
        # Check input dimensions
        assert len(x.shape) == 2 and x.shape[1] == self._features
        
        # Linear transformation on input to produce hidden states
        hidden_states = tc.mm(x, self.weights_hide) + self.bias_hide
        
        # Apply non-linear activation (ReLU) to hidden states
        act_hidden_states = tc.relu(hidden_states)
        
        # Linear transformation on activated hidden states to produce logits
        logits = tc.mm(act_hidden_states, self.weights_clf) + self.bias_clf
        
        # Apply sigmoid activation to logits for binary classification output
        return tc.sigmoid(logits)

in_dim = X_train.shape[-1]
model = MLPBinaryClassifier(in_dim = in_dim)

model_loss = nn.BCELoss() #torch.nn.functional.nll_loss(output[idx_train], labels[idx_train])
optimizer = optim.Adam(model.parameters(), lr = 0.005)

epochs = 100
for epoch in range(epochs):
    if epoch % 5 == 0:
        print(epoch)
    model.train()
    for batch, (X_batch, y_batch) in enumerate(train_loader):
        optimizer.zero_grad()
        y_pred = model(X_batch).squeeze()
        loss = model_loss(y_pred, y_batch)
        loss.backward()
        optimizer.step()

0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95


In [30]:
model.eval()
y_pred = []
y_true = []

with tc.no_grad():
    for X_batch, y_batch in test_loader:
        y_pred_batch = model(X_batch)
        y_pred.extend(y_pred_batch.round().detach().numpy())
        y_true.extend(y_batch.detach().numpy())
        
accuracy = accuracy_score(y_true, y_pred)
print(f"The accuracy of the model is: {round(accuracy, 4) * 100}")

The accuracy of the model is: 99.62


In [31]:
from sklearn.metrics import classification_report, confusion_matrix

print("Classification Report:")
print(classification_report(y_true, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))

Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     66433
         1.0       0.00      0.00      0.00       253

    accuracy                           1.00     66686
   macro avg       0.50      0.50      0.50     66686
weighted avg       0.99      1.00      0.99     66686

Confusion Matrix:
[[66433     0]
 [  253     0]]


In [37]:
len(y_test[y_test == 1])

253

In [38]:
len(y_test[y_test == 0])

66433