In [1]:
import torch
from torch import nn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score
from torch.utils.data import DataLoader, TensorDataset, random_split
from torch.amp import autocast, GradScaler
from openml import datasets
import numpy as np
import time, math

In [2]:
def load_higgs(openml_id=44129, scaler=None):
    """
    Download a dataset from OpenML and wrap it in a ``TensorDataset``.

    Args:
        openml_id (int): OpenML dataset id to fetch.
        scaler: Optional transformer exposing ``fit_transform``. A new
            ``StandardScaler`` is created when this is ``None``. The scaler
            is always (re)fitted on the complete table loaded here, even if
            it was fitted before.

    Returns:
        tuple: ``(tensor_ds, scaler)``. ``tensor_ds`` pairs the scaled
        features with the ``"target"`` column cast to float32; ``scaler`` is
        the transformer that was fitted on those features.
    """
    raw_dataset = datasets.get_dataset(openml_id, download_data=True)
    # get_data() returns a tuple; element 0 is the table that still contains
    # the "target" column.
    frame = raw_dataset.get_data()[0]

    # Explicit None check instead of relying on the truthiness of the object.
    if scaler is None:
        scaler = StandardScaler()

    features = frame.drop(columns=["target"]).astype(np.float32)
    X = scaler.fit_transform(features)

    # Hand torch a plain ndarray instead of the pandas Series itself, so the
    # conversion does not depend on how torch iterates/indexes a Series.
    y = frame["target"].astype(np.float32).values

    tensor_ds = TensorDataset(torch.tensor(X), torch.tensor(y))
    return tensor_ds, scaler

In [3]:
class ResidualBlock(nn.Module):
    """
    Two-stage feed-forward block wrapped in an identity skip connection.

    The inner stack is ``BatchNorm1d -> Linear -> GELU -> Dropout`` applied
    twice, followed by one closing ``BatchNorm1d``. Width is preserved, so the
    block output can be added directly to its input.

    Args:
        hidden_dim (int): Feature width of the input and of every inner layer.
        dropout (float): Dropout probability used in both stages.
    """

    def __init__(self, hidden_dim, dropout=0.2):
        super().__init__()
        stages = []
        for _ in range(2):
            stages.extend(
                (
                    nn.BatchNorm1d(hidden_dim),
                    nn.Linear(hidden_dim, hidden_dim),
                    nn.GELU(),
                    nn.Dropout(dropout),
                )
            )
        # Closing normalisation applied to the branch before it is added back.
        stages.append(nn.BatchNorm1d(hidden_dim))
        self.block = nn.Sequential(*stages)

    def forward(self, x):
        """Return ``x`` plus the output of the inner stack applied to ``x``."""
        residual = self.block(x)
        return x + residual


class HiggsNet(nn.Module):
    """
    Residual MLP that emits one logit per sample for Higgs classification.

    The network is an input projection (``Linear -> GELU -> Dropout``), a
    stack of ``ResidualBlock`` modules and a final ``Linear`` layer with a
    single output unit.

    Args:
        input_dim (int): Number of input features.
        hidden_dim (int): Width of the projection and of every residual block.
        num_layers (int): How many residual blocks to stack.
        dropout (float): Dropout probability used throughout the network.
    """

    def __init__(self, input_dim, hidden_dim=128, num_layers=4, dropout=0.2):
        super().__init__()
        stem = [nn.Linear(input_dim, hidden_dim), nn.GELU(), nn.Dropout(dropout)]
        trunk = [ResidualBlock(hidden_dim, dropout=dropout) for _ in range(num_layers)]
        head = nn.Linear(hidden_dim, 1)  # single logit for binary classification
        self.model = nn.Sequential(*stem, *trunk, head)

    def forward(self, x):
        """Return the logits with the trailing singleton dimension removed."""
        logits = self.model(x)
        return logits.squeeze(-1)

In [23]:
def train_epoch(
    model,
    train_loader,
    optimizer,
    criterion,
    scaler,
    device,
    accumulation_steps=1,
    scheduler=None,
    use_amp=False,
):
    """
    Run one training pass over ``train_loader`` and return the mean batch loss.

    Args:
        model: Module to train; it is switched to training mode.
        train_loader: Iterable yielding ``(inputs, targets)`` batches.
        optimizer: Optimizer updating ``model``'s parameters.
        criterion: Callable ``criterion(logits, targets)`` returning a scalar loss.
        scaler: Gradient scaler; only used when ``use_amp`` is true.
        device: Device (``torch.device`` or string) the batches are moved to.
            Its type is also used as the autocast backend.
        accumulation_steps (int): Number of batches whose gradients are
            accumulated before each optimizer step.
        scheduler: Optional LR scheduler, stepped once per optimizer step.
        use_amp (bool): Enable autocast and the gradient scaler.

    Returns:
        float: Mean of the unscaled per-batch losses, or ``nan`` when the
        loader yields no batches.

    Raises:
        ValueError: If ``accumulation_steps`` is smaller than 1.

    Note:
        Batches left over when the number of batches is not a multiple of
        ``accumulation_steps`` are back-propagated but never applied; their
        gradients are discarded by the ``zero_grad`` at the start of the next
        call.
    """
    if accumulation_steps < 1:
        raise ValueError("accumulation_steps must be a positive integer")

    # Autocast has to target the backend the tensors actually live on instead
    # of a hard-coded one; accepts both torch.device objects and strings.
    device_type = torch.device(device).type

    model.train()
    total_loss = 0.0
    num_batches = 0
    optimizer.zero_grad(set_to_none=True)

    for i, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)

        with autocast(device_type=device_type, enabled=use_amp):
            logits = model(inputs)
            batch_loss = criterion(logits, targets)

        # Scale only the value that is back-propagated, so that the summed
        # gradients of one accumulation group match a single large batch.
        loss = batch_loss / accumulation_steps

        (scaler.scale(loss) if use_amp else loss).backward()
        if (i + 1) % accumulation_steps == 0:
            if use_amp:
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.step()
            optimizer.zero_grad(set_to_none=True)

            if scheduler is not None:
                scheduler.step()

        # Report the unscaled loss; the scaled one would shrink the reported
        # value by a factor of accumulation_steps.
        total_loss += batch_loss.item()
        num_batches += 1

    if num_batches == 0:
        return float("nan")
    return total_loss / num_batches


@torch.no_grad()
def evaluate(model, loader, device, use_amp=False):
    """
    Score ``model`` on ``loader`` and return ranking metrics.

    Args:
        model: Module producing one logit per sample; switched to eval mode.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        device: Device (``torch.device`` or string) the inputs are moved to.
            Its type is also used as the autocast backend.
        use_amp (bool): Run the forward pass under autocast.

    Returns:
        tuple: ``(roc_auc, average_precision)`` computed from the targets and
        the sigmoid of the logits over all batches.
    """
    # Use the backend of the requested device rather than a hard-coded one.
    device_type = torch.device(device).type

    model.eval()
    y_true, y_prob = [], []

    for xb, yb in loader:
        xb = xb.to(device, non_blocking=True)
        with autocast(device_type=device_type, enabled=use_amp):
            logits = model(xb)

        # Targets are only needed for the metrics, which run on CPU.
        y_true.append(yb.cpu())
        # Autocast can return half-precision logits: bfloat16 tensors cannot be
        # converted with .numpy(), and float16 probabilities coarsen the
        # ranking. Upcast before the sigmoid.
        y_prob.append(torch.sigmoid(logits.float()).cpu())

    y_true = torch.cat(y_true).numpy()
    y_prob = torch.cat(y_prob).numpy()
    return (roc_auc_score(y_true, y_prob), average_precision_score(y_true, y_prob))

In [None]:
def main(num_epochs=10, batch_size=8192, patience=3):
    """
    Train ``HiggsNet`` on the dataset returned by ``load_higgs`` and print metrics.

    The data is split 80/20 with a fixed seed; the held-out part is used for
    the per-epoch "Val" metrics and for early stopping.

    Args:
        num_epochs (int): Maximum number of epochs. The same value sizes the
            one-cycle learning-rate schedule, so a full run completes it.
        batch_size (int): Batch size for both data loaders.
        patience (int): Training stops once more than this many epochs have
            passed since the best validation AUC.
    """
    # NOTE(review): load_higgs fits its scaler on the whole table, i.e. before
    # the split below, so held-out statistics leak into the scaling.
    tensor_ds, _ = load_higgs()  # the fitted feature scaler is not used here
    split_generator = torch.Generator().manual_seed(42)
    train, test = random_split(tensor_ds, [0.8, 0.2], generator=split_generator)
    train_loader = DataLoader(
        train, batch_size=batch_size, shuffle=True, num_workers=4
    )
    test_loader = DataLoader(
        test, batch_size=batch_size, shuffle=False, num_workers=4
    )
    data_batch, labels_batch = next(iter(train_loader))
    print(
        f"Data batch shape: {data_batch.shape}, Labels batch shape: {labels_batch.shape}"
    )
    print(
        f"Number of training samples: {len(train)}, Number of test samples: {len(test)}"
    )

    # Class balance must come from the training subset only. `train.dataset`
    # is the full dataset, so combining its label sum with len(train) would
    # produce a wrong negative/positive ratio.
    all_features, all_labels = tensor_ds.tensors
    train_labels = all_labels[train.indices]
    num_pos = train_labels.sum()
    if num_pos == 0:
        raise ValueError("training split contains no positive samples")
    pos_weight = (len(train_labels) - num_pos) / num_pos

    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    # Move the criterion as well so its pos_weight buffer lives with the logits.
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight).to(device)
    model = HiggsNet(
        input_dim=all_features.shape[1],
        hidden_dim=512,
        num_layers=6,
        dropout=0.2,
    ).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-2)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=1e-3,
        epochs=num_epochs,
        steps_per_epoch=len(train_loader),
        pct_start=0.1,
        anneal_strategy="linear",
    )
    # Separate name: `scaler` previously referred to the feature scaler too.
    grad_scaler = GradScaler()
    best_auc, best_epoch = 0.0, -1
    # Inclusive upper bound so the number of epochs matches the schedule.
    for epoch in range(1, num_epochs + 1):
        t0 = time.time()
        train_loss = train_epoch(
            model,
            train_loader,
            optimizer,
            criterion,
            grad_scaler,
            device,
            accumulation_steps=1,
            scheduler=scheduler,
            use_amp=True,
        )
        val_auc, val_ap = evaluate(model, test_loader, device, use_amp=True)
        t1 = time.time()
        dt = t1 - t0
        print(
            f"Epoch {epoch:2d} | "
            f"Train Loss: {train_loss:.4f} | "
            f"Val AUC: {val_auc:.4f} | "
            f"Val AP: {val_ap:.4f} | "
            f"Time: {dt:.2f}s"
        )
        if val_auc > best_auc:
            best_auc = val_auc
            best_epoch = epoch
        if epoch - best_epoch > patience:
            print("Early stopping...")
            break

    print(f"\nBest val AUC: {best_auc:.4f}  (epoch {best_epoch})")

In [None]:
# Entry point: load the data, train HiggsNet and print per-epoch metrics.
main()