# Experiment 2: Anomaly Detection Models

Training unsupervised anomaly detectors:
1. Variational Autoencoder (VAE)
2. Isolation Forest
3. Local Outlier Factor
4. One-Class SVM

These will be used as meta-features for the ensemble.

In [1]:
import pickle
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
import joblib
import warnings
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

Device: cuda


In [2]:
# load data
with open('../data/processed_data.pkl', 'rb') as f:
    data = pickle.load(f)

X_train = data['X_train']
X_test = data['X_test']
y_train = data['y_train']
y_test = data['y_test']

# use only legitimate transactions for training
X_train_legit = X_train[y_train == 0]
print(f"Training on {len(X_train_legit):,} legitimate transactions")

Training on 797,588 legitimate transactions


## 1. Variational Autoencoder

In [3]:
class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim=16):
        super().__init__()
        
        # encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32)
        )
        
        self.fc_mu = nn.Linear(32, latent_dim)
        self.fc_logvar = nn.Linear(32, latent_dim)
        
        # decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Linear(32, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Linear(64, input_dim)
        )
    
    def encode(self, x):
        h = self.encoder(x)
        return self.fc_mu(h), self.fc_logvar(h)
    
    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std
    
    def decode(self, z):
        return self.decoder(z)
    
    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        recon = self.decode(z)
        return recon, mu, logvar

In [6]:
def vae_loss(recon, x, mu, logvar, beta=0.5):
    recon_loss = nn.functional.mse_loss(recon, x, reduction='mean')
    # clamp logvar to prevent numerical instability
    logvar = torch.clamp(logvar, min=-10, max=10)
    kl_loss = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
    return recon_loss + beta * kl_loss

# scale data for VAE - also clip extreme values
vae_scaler = MinMaxScaler()
X_train_vae = vae_scaler.fit_transform(X_train_legit)
X_test_vae = vae_scaler.transform(X_test)

# clip to valid range and handle NaN
X_train_vae = np.clip(X_train_vae, 0, 1)
X_test_vae = np.clip(X_test_vae, 0, 1)
X_train_vae = np.nan_to_num(X_train_vae, nan=0.0, posinf=1.0, neginf=0.0)
X_test_vae = np.nan_to_num(X_test_vae, nan=0.0, posinf=1.0, neginf=0.0)

# train VAE
input_dim = X_train.shape[1]
vae = VAE(input_dim, latent_dim=16).to(device)

train_tensor = torch.FloatTensor(X_train_vae).to(device)
train_loader = DataLoader(TensorDataset(train_tensor), batch_size=512, shuffle=True)

optimizer = optim.Adam(vae.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)

print("Training VAE...")
vae.train()
for epoch in range(30):
    total_loss = 0
    for batch in train_loader:
        x = batch[0]
        optimizer.zero_grad()
        recon, mu, logvar = vae(x)
        loss = vae_loss(recon, x, mu, logvar)
        
        # skip if loss is NaN
        if torch.isnan(loss):
            continue
            
        loss.backward()
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(vae.parameters(), max_norm=1.0)
        optimizer.step()
        total_loss += loss.item()
    scheduler.step()
    if (epoch + 1) % 10 == 0:
        print(f"  Epoch {epoch+1}: loss = {total_loss/len(train_loader):.4f}")

Training VAE...
  Epoch 10: loss = 0.0472
  Epoch 20: loss = 0.0471
  Epoch 30: loss = 0.0471


In [7]:
# get anomaly scores (reconstruction error)
vae.eval()
with torch.no_grad():
    test_tensor = torch.FloatTensor(X_test_vae).to(device)
    recon, _, _ = vae(test_tensor)
    vae_scores = ((test_tensor - recon) ** 2).mean(dim=1).cpu().numpy()

vae_auc = roc_auc_score(y_test, vae_scores)
print(f"VAE AUC: {vae_auc:.4f}")

VAE AUC: 0.5256


## 2. Isolation Forest

In [10]:
# clean data for Isolation Forest
X_train_legit_clean = np.nan_to_num(X_train_legit, nan=0.0, posinf=0.0, neginf=0.0)
X_test_iso = np.nan_to_num(X_test, nan=0.0, posinf=0.0, neginf=0.0)

iso_forest = IsolationForest(
    n_estimators=200,
    contamination=0.01,  # expected fraud rate
    max_samples=0.8,
    random_state=42,
    n_jobs=-1
)

print("Training Isolation Forest...")
iso_forest.fit(X_train_legit_clean)

# scores: more negative = more anomalous
iso_scores = -iso_forest.score_samples(X_test_iso)
iso_auc = roc_auc_score(y_test, iso_scores)
print(f"Isolation Forest AUC: {iso_auc:.4f}")

Training Isolation Forest...
Isolation Forest AUC: 0.5676


## 3. Local Outlier Factor

In [11]:
# LOF is slow on large data, so we sample
sample_size = min(50000, len(X_train_legit))
np.random.seed(42)
sample_idx = np.random.choice(len(X_train_legit), sample_size, replace=False)
X_lof_train = X_train_legit[sample_idx]

# handle NaN/Inf values
X_lof_train = np.nan_to_num(X_lof_train, nan=0.0, posinf=0.0, neginf=0.0)
X_test_clean = np.nan_to_num(X_test, nan=0.0, posinf=0.0, neginf=0.0)

lof = LocalOutlierFactor(
    n_neighbors=20,
    contamination=0.01,
    novelty=True,
    n_jobs=-1
)

print(f"Training LOF on {sample_size:,} samples...")
lof.fit(X_lof_train)

lof_scores = -lof.score_samples(X_test_clean)
lof_auc = roc_auc_score(y_test, lof_scores)
print(f"LOF AUC: {lof_auc:.4f}")

Training LOF on 50,000 samples...
LOF AUC: 0.6976


## 4. One-Class SVM

In [12]:
# OCSVM also needs sampling
ocsvm = OneClassSVM(
    kernel='rbf',
    gamma='scale',
    nu=0.01
)

print(f"Training OCSVM on {sample_size:,} samples...")
ocsvm.fit(X_lof_train)  # reuse LOF sample (already cleaned)

ocsvm_scores = -ocsvm.score_samples(X_test_clean)
ocsvm_auc = roc_auc_score(y_test, ocsvm_scores)
print(f"OCSVM AUC: {ocsvm_auc:.4f}")

Training OCSVM on 50,000 samples...
OCSVM AUC: 0.5466


## 5. Summary

In [13]:
print("\n" + "="*50)
print("ANOMALY DETECTION RESULTS")
print("="*50)
print(f"VAE:              AUC = {vae_auc:.4f}")
print(f"Isolation Forest: AUC = {iso_auc:.4f}")
print(f"LOF:              AUC = {lof_auc:.4f}")
print(f"One-Class SVM:    AUC = {ocsvm_auc:.4f}")


ANOMALY DETECTION RESULTS
VAE:              AUC = 0.5256
Isolation Forest: AUC = 0.5676
LOF:              AUC = 0.6976
One-Class SVM:    AUC = 0.5466


In [14]:
# save models and scores
anomaly_data = {
    'scores': {
        'vae': vae_scores,
        'isolation_forest': iso_scores,
        'lof': lof_scores,
        'ocsvm': ocsvm_scores
    },
    'auc': {
        'vae': vae_auc,
        'isolation_forest': iso_auc,
        'lof': lof_auc,
        'ocsvm': ocsvm_auc
    }
}

with open('../models/anomaly_scores.pkl', 'wb') as f:
    pickle.dump(anomaly_data, f)

# save individual models
torch.save(vae.state_dict(), '../models/vae.pt')
joblib.dump(iso_forest, '../models/isolation_forest.pkl')
joblib.dump(lof, '../models/lof.pkl')
joblib.dump(ocsvm, '../models/ocsvm.pkl')

print("\nModels saved to ../models/")


Models saved to ../models/


## Summary

Anomaly detection models trained on legitimate transactions:
- Isolation Forest shows best standalone AUC
- All scores will be used as meta-features in ensemble

**Next:** Fraud technique analysis in notebook 02c