# TODOs

- [ ] split even the transactions to train/test/validation with the customer ID in mind
- [ ] add Latitude/Longitude
- [ ] remove the 'equals' merchant and customer state/city variable?
- [ ] train the model on AIC and calculate results on the full dataset

# Intro

- package loading
- data loading
- data preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from collections import Counter
from tqdm import tqdm

import seaborn as sns

def plotsize(w, h):
    plt.figure(figsize=(w, h))

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np

np.random.seed(42)
torch.manual_seed(42)


<torch._C.Generator at 0x1450fb8b0>

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
from src.process_data import IBMDataset
df = IBMDataset().ibm_credit_card()

In [5]:
df.head()

Unnamed: 0,customer.id,card.id,amount_signed,timestamp,date,merchant.name,merchant.city,merchant.state,is_fraud,age,chip,gender,customer.city,customer.state,score,num_cards,total_debt,credit_limit,card.brand,latitude,longitude,direction,amount_usd,log_amount
0,0,4344676511950444,134.09,1030861260,2002-09-01 06:21:00,3527213246127876953,La Verne,CA,No,53,Swipe Transaction,Female,La Verne,CA,787,5,127613.0,24295.0,Visa,34.15,-117.76,outbound,134.09,4.905941
1,0,4344676511950444,38.48,1030862520,2002-09-01 06:42:00,-727612092139916043,Monterey Park,CA,No,53,Swipe Transaction,Female,La Verne,CA,787,5,127613.0,24295.0,Visa,34.15,-117.76,outbound,38.48,3.675794
2,0,4344676511950444,120.34,1030947720,2002-09-02 06:22:00,-727612092139916043,Monterey Park,CA,No,53,Swipe Transaction,Female,La Verne,CA,787,5,127613.0,24295.0,Visa,34.15,-117.76,outbound,120.34,4.798597
3,0,4344676511950444,128.95,1030988700,2002-09-02 17:45:00,3414527459579106770,Monterey Park,CA,No,53,Swipe Transaction,Female,La Verne,CA,787,5,127613.0,24295.0,Visa,34.15,-117.76,outbound,128.95,4.86715
4,0,4344676511950444,104.71,1031034180,2002-09-03 06:23:00,5817218446178736267,La Verne,CA,No,53,Swipe Transaction,Female,La Verne,CA,787,5,127613.0,24295.0,Visa,34.15,-117.76,outbound,104.71,4.660699


In [6]:
df.is_fraud.value_counts()

is_fraud
No     24357143
Yes       29757
Name: count, dtype: int64

In [7]:
df['merchant.state'].nunique()

223

In [8]:
df['customer.state'].nunique()

51

In [9]:
df[['merchant.state', 'is_fraud']].value_counts()

merchant.state  is_fraud
CA              No          2591079
TX              No          1792993
FL              No          1458385
NY              No          1446624
OH              No           895092
                             ...   
Tonga           No                2
Togo            No                2
Paraguay        No                1
Botswana        No                1
Kiribati        No                1
Name: count, Length: 282, dtype: int64

## Feature engineering

Fixed numerical features:
- amount features: amount, yearly income, total debt, credit limit (in log scale)
- num credit cards
- FICO score

Time features:
- time from last transaction
- time of transaction
- mean time between previous transactions

Categorical features:
- gender
- card brand
- card type
- chip

Binary features:
- previously seen merchant ID
- has chip (card)
- state == merchant state

In [10]:
df.sort_values('date', inplace=True)

In [11]:
gdf = df.groupby('customer.id')

In [49]:
new_groups = []

for name, group in tqdm(gdf):
    # figure out known merchants
    known_merchants = group['merchant.name'].duplicated().astype(int)
    
    # create a copy of the group
    g_new = group.copy()
    g_new['is_known_merchant'] = known_merchants
    
    # time difference for the customer
    # g_new['log_timediff'] = np.log(1 + g_new['date'].diff().dt.seconds.fillna(0, inplace=True))
    g_new['log_timediff'] = np.log(1 + g_new['date'].diff().dt.seconds).fillna(0)
    
    # customer address and merchant address
    g_new['same_city'] = (group['merchant.city'] == group['customer.city'])
    g_new['same_state'] = (group['merchant.state'] == group['customer.state'])
    
    new_groups.append(g_new)


100%|██████████| 2000/2000 [00:34<00:00, 58.82it/s] 


In [50]:
g = new_groups[0]
g

Unnamed: 0,customer.id,card.id,amount_signed,timestamp,date,merchant.name,merchant.city,merchant.state,is_fraud,age,chip,gender,customer.city,customer.state,score,num_cards,total_debt,credit_limit,card.brand,latitude,longitude,direction,amount_usd,log_amount,is_known_merchant,log_timediff,same_city,same_state
0,0,4344676511950444,134.09,1030861260,2002-09-01 06:21:00,3527213246127876953,La Verne,CA,No,53,Swipe Transaction,Female,La Verne,CA,787,5,127613.0,24295.0,Visa,34.15,-117.76,outbound,134.09,4.905941,0,0.000000,True,True
1,0,4344676511950444,38.48,1030862520,2002-09-01 06:42:00,-727612092139916043,Monterey Park,CA,No,53,Swipe Transaction,Female,La Verne,CA,787,5,127613.0,24295.0,Visa,34.15,-117.76,outbound,38.48,3.675794,0,7.139660,False,True
2,0,4344676511950444,120.34,1030947720,2002-09-02 06:22:00,-727612092139916043,Monterey Park,CA,No,53,Swipe Transaction,Female,La Verne,CA,787,5,127613.0,24295.0,Visa,34.15,-117.76,outbound,120.34,4.798597,1,11.352768,False,True
3,0,4344676511950444,128.95,1030988700,2002-09-02 17:45:00,3414527459579106770,Monterey Park,CA,No,53,Swipe Transaction,Female,La Verne,CA,787,5,127613.0,24295.0,Visa,34.15,-117.76,outbound,128.95,4.867150,0,10.620864,False,True
4,0,4344676511950444,104.71,1031034180,2002-09-03 06:23:00,5817218446178736267,La Verne,CA,No,53,Swipe Transaction,Female,La Verne,CA,787,5,127613.0,24295.0,Visa,34.15,-117.76,outbound,104.71,4.660699,0,10.725050,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19934,0,4879494103069057,-295.00,1582816920,2020-02-27 15:22:00,7834055923142137930,New York,NY,No,53,Swipe Transaction,Female,La Verne,CA,787,5,127613.0,12400.0,Visa,34.15,-117.76,inbound,295.00,5.690359,1,5.707110,False,False
10544,0,4582313478255491,46.77,1582870980,2020-02-28 06:23:00,-727612092139916043,Monterey Park,CA,No,53,Chip Transaction,Female,La Verne,CA,787,5,127613.0,46414.0,Visa,34.15,-117.76,outbound,46.77,3.866398,1,10.897868,False,True
19935,0,4879494103069057,114.51,1582871340,2020-02-28 06:29:00,-34551508091458520,La Verne,CA,No,53,Swipe Transaction,Female,La Verne,CA,787,5,127613.0,12400.0,Visa,34.15,-117.76,outbound,114.51,4.749357,1,5.888878,True,True
19936,0,4879494103069057,34.11,1582872780,2020-02-28 06:53:00,-34551508091458520,La Verne,CA,No,53,Swipe Transaction,Female,La Verne,CA,787,5,127613.0,12400.0,Visa,34.15,-117.76,outbound,34.11,3.558486,1,7.273093,True,True


In [51]:
g['card.brand'].value_counts()

card.brand
Visa          19937
Mastercard       26
Name: count, dtype: int64

In [52]:
data = pd.concat(objs=new_groups)

In [53]:
data['merchant.state'].fillna('None', inplace=True)

In [54]:
data['merchant.state'].isna().mean()

0.0

In [55]:
len(data[data['is_fraud'] == 'Yes'])

29757

In [56]:
data['merchant.state'].value_counts()

merchant.state
None        2720821
CA          2591830
TX          1793298
FL          1458699
NY          1446864
             ...   
Tonga             2
Togo              2
Kiribati          1
Paraguay          1
Botswana          1
Name: count, Length: 224, dtype: int64

In [57]:
from sklearn.preprocessing import OneHotEncoder

oh_customer_state = OneHotEncoder(min_frequency=1, sparse_output=False)
oh_merchant_state = OneHotEncoder(min_frequency=5, sparse_output=False)

oh_customer_state.fit(np.array(data['customer.state']).reshape(-1,1))
oh_merchant_state.fit(np.array(data['merchant.state']).reshape(-1,1))

In [58]:
class TransactionDataset(Dataset):
    def __init__(self, df, oh_c, oh_m):
        self.dataframe = df
        self.oh_customer_state = oh_c
        self.oh_merchant_state = oh_m

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        item = self.process_row(idx)
        return item[:, :-1], item[:, -1]
    
    def process_row(self, idx):
        g = self.dataframe.iloc[idx]
        
        direction = np.array(g['direction'] == 'inbound', dtype=float)
        brand = np.array(g['card.brand'] == 'Visa', dtype=float)
        gender = np.array(g['gender'] == 'female', dtype=float)
        same_city = np.array(g['same_city'], dtype=float)
        same_state = np.array(g['same_state'], dtype=float)
        known_merchant = np.array(g['is_known_merchant'], dtype=float)
        label = np.array(g['is_fraud'] == 'Yes', dtype=float)

        debt = np.array(np.log(1 + g['total_debt']), dtype=float)
        limit = np.array(np.log(1 + g['credit_limit']), dtype=float)
        
        customer_state = self.oh_customer_state.transform(np.array(g['customer.state']).reshape(-1,1))
        merchant_state = self.oh_merchant_state.transform(np.array(g['merchant.state']).reshape(-1,1))

        Fnum = np.array(g[['age', 'num_cards', 'log_amount', 'log_timediff']])
        if len(Fnum.shape) == 1:
            Fnum = Fnum.reshape(-1,1).transpose()
            
        Fcat = np.vstack((debt, limit, direction, brand, gender, same_city, same_state, known_merchant, label)).transpose()

        # print(customer_state.shape)
        # print(merchant_state.shape)
        # print(Fnum.shape)
        # print(Fcat.shape)
        F = np.hstack((customer_state, merchant_state, Fnum, Fcat))
        # print('Feature matrix created with shape', F.shape)
        
        return np.array(F, dtype=float)

In [59]:
class CustomDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        x, y = self.dataset[idx]
        x = torch.tensor(x, dtype=torch.float32).squeeze(0)
        y = torch.tensor(y[0], dtype=torch.float32)
        return x, y

In [60]:
import random

class CustomUpsampleDataset(Dataset):
    def __init__(self, dataset, pidx, nidx):
        self.dataset = dataset
        self.positive_idx = pidx
        self.negative_idx = nidx

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # index is randomized, therefore indexing does not return the index itself!
        if random.random() > 0.5:
            idx = random.sample(self.positive_idx, 1)[0]
        else:
            idx = random.sample(self.negative_idx, 1)[0]
        
        x, y = self.dataset[idx]
        x = torch.tensor(x, dtype=torch.float32).squeeze(0)
        y = torch.tensor(y[0], dtype=torch.float32)
        return x, y

## PyTorch data preparation

In [25]:
# Individual split indexes (iid rows)

from sklearn.model_selection import train_test_split
index_list = list(range(len(g)))
tmp, test_ix = train_test_split(index_list, test_size=0.2, random_state=31)
train_ix, val_ix = train_test_split(tmp, test_size=0.3, random_state=31)

print(len(train_ix))
print(len(val_ix))
print(len(test_ix))

13656664
5852856
4877380


In [61]:
# Split indexes by customers

from sklearn.model_selection import train_test_split

grouped_df = data.groupby('customer.id')
groups = np.array(list(grouped_df.groups))

index_list = list(range(len(groups)))
tmp, _test_ix = train_test_split(index_list, test_size=0.2, random_state=31)
_train_ix, _val_ix = train_test_split(tmp, test_size=0.3, random_state=31)

print(len(_train_ix))
print(len(_val_ix))
print(len(_test_ix))

train_ix = []
val_ix = []
test_ix = []

for gix in groups[_train_ix]:
    g = grouped_df.get_group(gix)
    indexes = g.index
    train_ix.extend(indexes)

for gix in groups[_val_ix]:
    g = grouped_df.get_group(gix)
    indexes = g.index
    val_ix.extend(indexes)

for gix in groups[_test_ix]:
    g = grouped_df.get_group(gix)
    indexes = g.index
    test_ix.extend(indexes)

print(len(train_ix))
print(len(val_ix))
print(len(test_ix))

1120
480
400
13426335
5771380
5189185


In [64]:
g_train = data.iloc[train_ix].reset_index()
pos_idx = list(np.array(range(len(g_train)))[g_train['is_fraud'] == 'Yes'])
neg_idx = list(np.array(range(len(g_train)))[g_train['is_fraud'] == 'No'])

In [65]:
train_dataset = CustomUpsampleDataset(
    TransactionDataset(g_train, oh_customer_state, oh_merchant_state),
    pos_idx, neg_idx
)
val_dataset = CustomDataset(TransactionDataset(data.iloc[val_ix], oh_customer_state, oh_merchant_state))
test_dataset = CustomDataset(TransactionDataset(data.iloc[test_ix], oh_customer_state, oh_merchant_state))

In [66]:
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size)
        )
        
    def forward(self, x):
        return self.layers(x)


In [67]:
train_dataset[1][0].numpy().shape[0]

277

In [71]:
input_size = train_dataset[1][0].numpy().shape[0]
hidden_size = 256
output_size = 1
learning_rate = 0.001
batchsize = 128

train_loader = DataLoader(train_dataset, batch_size=batchsize, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batchsize, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batchsize, shuffle=False)


In [72]:
model = MLP(input_size, hidden_size, output_size)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [73]:
from sklearn.metrics import f1_score

def binary_accuracy(y_pred, y_true):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))
    correct_results_sum = (y_pred_tag == y_true).float().sum().item()
    acc = correct_results_sum/y_true.shape[0]
    acc = round(acc * 100)
    
    return acc, y_pred_tag

def evaluate(model, dataloader, criterion, break_at=1_000):
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    all_preds = []
    all_labels = []
    
    imax = 0
    with torch.no_grad():
        for inputs, labels in dataloader:
            predictions = model(inputs).squeeze()
            loss = criterion(predictions, labels)
            
            acc, preds = binary_accuracy(predictions, labels)
            
            epoch_loss += loss.item()
            epoch_acc += acc

            all_preds.extend(preds.detach().numpy())
            all_labels.extend(labels.detach().numpy())
            
            imax += 1
            if imax > break_at:
                break
            
    f1 = f1_score(all_labels, all_preds, average='binary')
    return epoch_loss / imax, epoch_acc / imax, f1

In [74]:
from sklearn.metrics import confusion_matrix

def get_predictions(model, dataloader, *, break_at = float('inf'), show_progress = False):
    model.eval()
    all_preds = []
    all_labels = []
    imax = 0
    with torch.no_grad():
        if show_progress:
            for inputs, labels in tqdm(dataloader):
                predictions = model(inputs).squeeze()
                preds = torch.round(torch.sigmoid(predictions))
                all_preds.extend(preds.detach().numpy())
                all_labels.extend(labels.detach().numpy())
                imax += 1
                if imax > break_at:
                    break
            
        else:
            for inputs, labels in dataloader:
                predictions = model(inputs).squeeze()
                preds = torch.round(torch.sigmoid(predictions))
                all_preds.extend(preds.detach().numpy())
                all_labels.extend(labels.detach().numpy())
                imax += 1
                if imax > break_at:
                    break
            
    return all_labels, all_preds

def color_confusion_matrix(val):
    if 'TN' in val or 'TP' in val:
        color = 'green'
    else:
        color = 'red'
    return 'color: %s' % color

In [85]:
num_epochs = 10
num_batches = 1000

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    counter = 0

    for inputs, labels in train_loader:
        optimizer.zero_grad()
        predictions = model(inputs).squeeze()
        loss = criterion(predictions, labels)

        acc, _ = binary_accuracy(predictions, labels)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc
        
        counter += 1
        if counter > num_batches:
            break

    # Calculate average loss and accuracy
    # epoch_loss /= len(train_loader)
    # epoch_acc /= len(train_loader)
    epoch_loss /= num_batches
    epoch_acc /= num_batches

    # Evaluate on validation set
    # val_loss, val_acc, val_f1 = evaluate(model, val_loader, criterion)

    print(f'Epoch: {epoch+1}')
    print(f'Train Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc:.2f}%')
    # print(f'Val. Loss: {val_loss:.3f} | Val. Acc: {val_acc:.2f}% | Val. F1: {val_f1:.3f}')
    
    # Get the training confusion matrix
    val_labels, val_preds = get_predictions(model, val_loader, break_at=2_000)
    cm = confusion_matrix(val_labels, val_preds)
    
    pr = cm[1,1] / (cm[1,1] + cm[0,1])
    rc = cm[1,1] / (cm[1,0] + cm[1,1])
    f1 = 2 * pr * rc / (pr + rc)
    acc = (cm[0,0] + cm[1,1]) / np.sum(cm)
    print(f"F1 score: {round(f1, 4)}")
    print(f"Accuracy: {round(acc, 4)}")
    
    # Assuming you already have the confusion matrix cm
    cm_labeled = pd.DataFrame({
        'Predicted Negative': {'Actual Negative': f'TN: {cm[0,0]}', 'Actual Positive': f'FN: {cm[1,0]}'},
        'Predicted Positive': {'Actual Negative': f'FP: {cm[0,1]}', 'Actual Positive': f'TP: {cm[1,1]}'},
    })

    cm_labeled = cm_labeled.style.applymap(color_confusion_matrix)
    display(cm_labeled)

# Evaluate on test set
# test_loss, test_acc, test_f1 = evaluate(model, test_loader, criterion)
# print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc:.2f}% | Test F1: {test_f1:.3f}')


Epoch: 1
Train Loss: 0.125 | Train Acc: 95.53%
F1 score: 0.016
Accuracy: 0.8469


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,TN: 216598,FP: 39173
Actual Positive,FN: 39,TP: 318


Epoch: 2
Train Loss: 0.123 | Train Acc: 95.64%
F1 score: 0.0184
Accuracy: 0.868


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,TN: 221992,FP: 33779
Actual Positive,FN: 40,TP: 317


Epoch: 3
Train Loss: 0.122 | Train Acc: 95.68%
F1 score: 0.016
Accuracy: 0.8438


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,TN: 215798,FP: 39973
Actual Positive,FN: 31,TP: 326


Epoch: 4
Train Loss: 0.120 | Train Acc: 95.73%
F1 score: 0.0184
Accuracy: 0.8672


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,TN: 221793,FP: 33978
Actual Positive,FN: 39,TP: 318


Epoch: 5
Train Loss: 0.119 | Train Acc: 95.81%
F1 score: 0.0209
Accuracy: 0.8852


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,TN: 226411,FP: 29360
Actual Positive,FN: 43,TP: 314


Epoch: 6
Train Loss: 0.116 | Train Acc: 95.91%
F1 score: 0.0155
Accuracy: 0.8394


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,TN: 214674,FP: 41097
Actual Positive,FN: 33,TP: 324


Epoch: 7
Train Loss: 0.117 | Train Acc: 95.83%
F1 score: 0.0184
Accuracy: 0.8681


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,TN: 222020,FP: 33751
Actual Positive,FN: 40,TP: 317


Epoch: 8
Train Loss: 0.116 | Train Acc: 95.92%
F1 score: 0.0124
Accuracy: 0.7991


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,TN: 204338,FP: 51433
Actual Positive,FN: 33,TP: 324


Epoch: 9
Train Loss: 0.113 | Train Acc: 96.09%
F1 score: 0.0187
Accuracy: 0.8688


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,TN: 222198,FP: 33573
Actual Positive,FN: 37,TP: 320


Epoch: 10
Train Loss: 0.115 | Train Acc: 95.92%
F1 score: 0.0144
Accuracy: 0.8273


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,TN: 211580,FP: 44191
Actual Positive,FN: 34,TP: 323


In [76]:
val_loss, val_acc, val_f1 = evaluate(model, val_loader, criterion, break_at=1_000)

In [81]:
# Get the training confusion matrix
val_labels, val_preds = get_predictions(model, val_loader, break_at = 10_000, show_progress=True)
cm = confusion_matrix(val_labels, val_preds)

# Assuming you already have the confusion matrix cm
cm_labeled = pd.DataFrame({
    'Predicted Negative': {'Actual Negative': f'TN: {cm[0,0]}', 'Actual Positive': f'FN: {cm[1,0]}'},
    'Predicted Positive': {'Actual Negative': f'FP: {cm[0,1]}', 'Actual Positive': f'TP: {cm[1,1]}'},
})

cm_labeled = cm_labeled.style.applymap(color_confusion_matrix)
display(cm_labeled)

 22%|██▏       | 10000/45089 [10:26<36:37, 15.97it/s]


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,TN: 1176172,FP: 102319
Actual Positive,FN: 197,TP: 1440


In [82]:
cm

array([[1176172,  102319],
       [    197,    1440]])

In [83]:
pr = cm[1,1] / (cm[1,1] + cm[0,1])
rc = cm[1,1] / (cm[1,0] + cm[1,1])
f1 = 2 * pr * rc / (pr + rc)
acc = (cm[0,0] + cm[1,1]) / np.sum(cm)
print(f"F1 score: {round(f1, 4)}")
print(f"Accuracy: {round(acc, 4)}")

F1 score: 0.0273
Accuracy: 0.9199
