In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve

import copy
import torch
from torch import tensor, nn, optim
from torch.utils.data import Dataset, DataLoader
from torchmetrics import Accuracy, Precision, Recall

---
## Set seeding for reproducibility

In [None]:
def set_seed(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed()

---
## Load data

In [3]:
data = pd.read_csv('../data/creditcard.csv')
data.drop('Time', axis=1, inplace=True)

In [4]:
train_df, temp_df = train_test_split(data, test_size=0.8, stratify=data['Class'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.2, stratify=temp_df['Class'], random_state=42)

---
## Create custom dataset class `FraudDataset`

In [5]:
class FraudDataset(Dataset):
    def __init__(self, DataFrame):
        super().__init__()
        df = DataFrame
        self.data = df.to_numpy().astype(np.float32)

    def __len__(self):
        return self.data.shape[0]
    
    def __input_size__(self):
        return self.data.shape[1] -1
    
    def __getitem__(self, idx):
        X = tensor(self.data[idx, :-1], dtype=torch.float32)
        y = tensor(self.data[idx, -1], dtype=torch.float32)
        return X, y

# Instatiate FraudDataset objects for each dataset 
train_set = FraudDataset(train_df)
val_set = FraudDataset(val_df)
test_set = FraudDataset(test_df)

---
## Create DataLoaders using `DataLoader` class

In [6]:
train_loader = DataLoader(train_set,batch_size=32,shuffle=True)
val_loader = DataLoader(val_set,batch_size=32,shuffle=True)
test_loader = DataLoader(test_set,batch_size=32,shuffle=True)

---
## Create NN architectue class `FraudNN`

#### Notes on NN architecture

- Larger layers help to capture complex patterns in the dataset.
- Consider using leaky_relu instead of ReLU activation on larger layers as allows a small gradient to flow even for negative values, which can lead to better convergence and prevent dying ReLU where neurons get 'stuck' during training.
    - Trialed this and leaky_relu was actually worse for this dataset.
    - ReLU gave reduced overfitting more than leaky_relu
- Incrementally reducing dropout after largers layer to smaller layers helps to reduce overfitting and maintain learning capactiy without excessive regularization.

In [25]:
class FraudNN(nn.Module):
    def __init__(self, input_size):
        super(FraudNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)

    def forward(self, x):
        x = nn.functional.relu(self.fc1(x))
        x = nn.functional.dropout(x, p=0.3)
        x = nn.functional.elu(self.fc2(x))
        x = nn.functional.dropout(x, p=0.2)
        x = nn.functional.relu(self.fc3(x))
        x = nn.functional.sigmoid(self.fc4(x))
        return x

input_size = train_set.__input_size__()
nnet = FraudNN(input_size=input_size)

---
## Create Early Stopping class `EarlyStopping`

In [8]:
class EarlyStopping:
    def __init__(self, patience=5, delta=0.0):
        self.patience = patience
        self.delta = delta
        self.best_loss = float('inf')
        self.counter = 0
        self.early_stop = False
        self.best_model_state = None

    def __call__(self, val_loss, model):
        if val_loss < self.best_loss - self.delta:
            self.best_loss = val_loss
            self.best_model_state = copy.deepcopy(model.state_dict())
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

---
## Define training and validation loop

In [26]:
# 1. Define Loss Function
criterion = nn.BCELoss()

# 2. Define Optimizer
optimizer = optim.Adam(nnet.parameters(), lr=0.001)

# 3. Define Early Stopping
early_stopper = EarlyStopping(patience=10, delta = 0.0001)

# 4. Define number of epochs
epochs = 1000

# 4. Define training and validation loops
for epoch in range(epochs):
    # Set model in train mode
    nnet.train()
    train_loss = 0
    
    for X, y in train_loader:
        optimizer.zero_grad()
        outputs = nnet(X)
        loss = criterion(outputs, y.view(-1, 1))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)

    # 5. Set model in eval mode 
    nnet.eval()
    val_loss = 0

    with torch.no_grad():
        for X, y in val_loader:
            val_outputs = nnet(X)
            loss = criterion(val_outputs, y.view(-1, 1))
            val_loss += loss.item()
    val_loss /= len(val_loader)

    # Apply Early Stopping
    early_stopper(val_loss, nnet)

    # Progress Log
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

    # Stop if early stopping condition met
    if early_stopper.early_stop:
        print(f'\nEarly stopping triggered at epoch {epoch+1}\nBest Validation Loss: {early_stopper.best_loss:.4f}')
        break

# Reload best model weights from early stopping
nnet.load_state_dict(early_stopper.best_model_state)
nnet.eval()

# 6. Optimize threshold for best F1 Score on validation set
probs = []
labels = []

with torch.no_grad():
    for X, y in val_loader:
        outputs = nnet(X)
        probs.extend(outputs.squeeze().cpu().numpy())
        labels.extend(y.cpu().numpy())

# Calculate precision-recall curve
precision_vals, recall_vals, thresholds = precision_recall_curve(labels, probs)
f1_scores = 2 * (precision_vals * recall_vals) / (precision_vals + recall_vals + 1e-8)
best_threshold = thresholds[f1_scores.argmax()]
print(f'Optimal threshold based on validation F1-Score: {best_threshold:.2f}')

Epoch [1/1000], Loss: 0.0420, Validation Loss: 0.0311
Epoch [2/1000], Loss: 0.0176, Validation Loss: 0.0166
Epoch [3/1000], Loss: 0.0108, Validation Loss: 0.0081
Epoch [4/1000], Loss: 0.0108, Validation Loss: 0.0111
Epoch [5/1000], Loss: 0.0092, Validation Loss: 0.0079
Epoch [6/1000], Loss: 0.0059, Validation Loss: 0.0083
Epoch [7/1000], Loss: 0.0060, Validation Loss: 0.0067
Epoch [8/1000], Loss: 0.0048, Validation Loss: 0.0060
Epoch [9/1000], Loss: 0.0041, Validation Loss: 0.0095
Epoch [10/1000], Loss: 0.0058, Validation Loss: 0.0129
Epoch [11/1000], Loss: 0.0044, Validation Loss: 0.0123
Epoch [12/1000], Loss: 0.0069, Validation Loss: 0.0119
Epoch [13/1000], Loss: 0.0035, Validation Loss: 0.0098
Epoch [14/1000], Loss: 0.0048, Validation Loss: 0.0108
Epoch [15/1000], Loss: 0.0038, Validation Loss: 0.0111
Epoch [16/1000], Loss: 0.0052, Validation Loss: 0.0123
Epoch [17/1000], Loss: 0.0037, Validation Loss: 0.0099
Epoch [18/1000], Loss: 0.0078, Validation Loss: 0.0163

Early stopping tri

---
## Define test loop

In [28]:
# 1. Initialize metrics
acc = Accuracy(task='binary')
precision = Precision(task='binary')
recall = Recall(task='binary')

# Model already in evalute mode but a failsafe
nnet.eval()

# 2. Evaluate model on test set
with torch.no_grad():
    for X, y in test_loader:
        outputs = nnet(X).detach() # Using .detach() should be more memory efficient
        preds = (outputs >= best_threshold).float() # Apply the tuned threshold
        acc.update(preds, y.view(-1,1))
        precision.update(preds, y.view(-1,1))
        recall.update(preds, y.view(-1,1))
  
# 3. Calculate and show metrics for test set
test_accuracy = acc.compute()
test_precision = precision.compute()
test_recall = recall.compute()
print(f'Test Accuracy: {test_accuracy:.4f}, Test Precision: {test_precision:.4f}, Test Recall: {test_recall:.4f}')

Test Accuracy: 0.9993, Test Precision: 0.8194, Test Recall: 0.7468


---


---
## Overfitting Notes

## fighting overfitting
- dropout=0.2
- learning_rate=1e-3
- weight_decay=1e-4
- weight decay takes values between 0 and 1.
- typically small values like 1e-3.
- adds penalty to loss function to discourage large weights and biases.
- proportional to the current value of the weight and subtracted from the gradient.
- higher the value of the parameter, the less likely the model is to overfit.

## Data Augmentation

## Maximizing performance
- overfit the training set
- reduce overfitting
- fine-tune hyperparameters

### 1. Overfitting Training set
- modify training loop to overfit a single data point (batch size = 1)
```Python
features, labels = next(iter(trainloader))
for i in range(1e-3):
    outputs = model(features)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
```
- Should give an accuracy of 1.0 and a loss of 0
- Helps finding bus in code
- ***goal***: minimize training loss
- create large enough model
- hyperparameters kept to defaults for now

### 2. Reduce Overfitting
- ***Goal***: Maximize the validation accuracy
- experiment with:
    - Dropout
    - Data augmentation
    - weight decay
    - Reducing model capacity
- keep track of each hyperparameter set and corresponding accuracy / metric.
- plot each experiment against the default setting train/val curves

### 3. Fine-Tuning Hyperparameters
- Grid Search:
    - Usually done on the optimizer hyperparameters
    - Uses values of the parameters at a constant inverval
    - Eg. Every momentum value between 0.85 and 0.99 with a constant interval
    - 
```Python
for factor in range(2,6):
    lr = 10**-factor
for val in np.arange(0.85, 1.00, 0.01):
    momentum = val
```
- Random Search:
    - Randomly samples parameters between intervals.
    - Quicker, and possibly better results as searches a less restricted sapce
```Python
factor = np.random.uniform(2,6)
lr = 10**-factor
val = np.random.uniform(0.85, 1.00)
momentum = val
```