# Tabular XGBoost Baseline
Tune the `CONFIG` dictionary in the imports-and-configuration cell (the second code cell), then run the remaining cells in order to train and evaluate the XGBoost model.

In [2]:
# Install XGBoost; the import cell below raises ImportError if the package is missing.
# NOTE(review): the version is unpinned — consider pinning it for reproducible runs.
!uv pip install xgboost

[2K[37m⠸[0m [2m                                                                              [0m[37m⠋[0m [2mResolving dependencies...                                                     [0m[2mResolved [1m4 packages[0m [2min 598ms[0m[0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/1)                                                   
[2mxgboost             [0m [32m[30m[2m------------------------------[0m[0m     0 B/110.52 MiB          [2mResolved [1m4 packages[0m [2min 598ms[0m[0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/1)                                                   
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)-------------------[0m[0m     0 B/110.52 MiB          [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)-------------------[0m[0m 16.00 KiB/110.52 MiB        [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/1)-------------------[0m[0m 32.00 KiB/110.52 MiB        [1A
[2K[1A[37m⠙[0m [2mPreparing pack

In [3]:
from __future__ import annotations

import json
from pathlib import Path
from typing import Any, Dict, List, Tuple

import numpy as np
import pandas as pd
from sklearn.metrics import auc, precision_recall_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

try:
    import xgboost as xgb
except ImportError as exc:
    raise ImportError("XGBoost is required. Install via `pip install xgboost`.") from exc

# Notebook-wide settings; the training cell works on a shallow copy of this dict.
CONFIG: Dict[str, Any] = {
    # CSV path handed to load_transactions.
    'dataset': 'dataset/HI-Small_Trans.csv',
    # Fraction of all rows held out as the test split.
    'test_size': 0.2,
    # Fraction of all rows used for validation; split_data rescales it relative to
    # what remains after the test split is removed.
    'val_size': 0.1,
    # Seed passed to split_data and to the XGBoost classifier.
    'random_state': 42,
    # Row cap applied by load_transactions; None keeps every row.
    'max_samples': None,
    # False-positive-rate ceiling for the 'target_fpr' threshold strategy; None skips it.
    'target_fpr': 0.05,
    # Optional path for a JSON metrics report; a falsy value skips writing it.
    'report': None,
    # The remaining keys are forwarded to xgb.XGBClassifier under the same names.
    'n_estimators': 400,
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.0,
    'reg_lambda': 1.0,
}

In [4]:
def load_transactions(path: Path, max_samples: int | None) -> pd.DataFrame:
    df = pd.read_csv(path)
    if max_samples is not None and len(df) > max_samples:
        df = df.sample(max_samples, random_state=42).reset_index(drop=True)
    df = df.rename(
        columns={
            'From Bank': 'from_bank',
            'To Bank': 'to_bank',
            'Amount Received': 'amount_received',
            'Receiving Currency': 'receiving_currency',
            'Amount Paid': 'amount_paid',
            'Payment Currency': 'payment_currency',
            'Payment Format': 'payment_format',
            'Is Laundering': 'is_laundering',
        }
    )
    if 'Account' in df.columns:
        df = df.rename(columns={'Account': 'from_account'})
    if 'Account.1' in df.columns:
        df = df.rename(columns={'Account.1': 'to_account'})
    return df


def engineer_features(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """Derive the model's feature table and the integer label from renamed transactions.

    The input frame is copied, so the caller's frame is left untouched.

    Returns:
        ``(features, labels)`` where ``features`` holds the engineered numeric
        columns followed by the raw categorical columns, and ``labels`` is
        ``is_laundering`` cast to int.
    """
    frame = df.copy()

    # Calendar parts; timestamps that fail to parse become NaT and are encoded as -1.
    parsed = pd.to_datetime(frame['Timestamp'], errors='coerce')
    frame['timestamp'] = parsed
    calendar_parts = (
        ('hour', parsed.dt.hour),
        ('dayofweek', parsed.dt.dayofweek),
        ('month', parsed.dt.month),
    )
    for column, part in calendar_parts:
        frame[column] = part.fillna(-1).astype(int)
    frame['is_weekend'] = frame['dayofweek'].ge(5).astype(int)

    # Counterparty equality flags; the account flag is 0 when either account column is missing.
    frame['same_bank'] = frame['from_bank'].eq(frame['to_bank']).astype(int)
    if {'from_account', 'to_account'} <= set(frame.columns):
        frame['same_account'] = frame['from_account'].eq(frame['to_account']).astype(int)
    else:
        frame['same_account'] = 0

    # Amount relationships; the ratio stays NaN wherever the paid amount is zero or missing.
    received = frame['amount_received']
    paid = frame['amount_paid']
    frame['amount_diff'] = received - paid
    ratio = np.divide(
        received,
        paid,
        out=np.full(len(frame), np.nan, dtype=float),
        where=paid.abs() > 0,
    )
    frame['amount_ratio'] = np.where(np.isfinite(ratio), ratio, np.nan)
    frame['is_round_amount'] = paid.mod(100).eq(0).astype(int)

    numeric_features = [
        'amount_received',
        'amount_paid',
        'amount_diff',
        'amount_ratio',
        'hour',
        'dayofweek',
        'month',
        'is_weekend',
        'same_bank',
        'same_account',
        'is_round_amount',
    ]
    categorical_features = [
        'from_bank',
        'to_bank',
        'receiving_currency',
        'payment_currency',
        'payment_format',
    ]
    labels = frame['is_laundering'].astype(int)
    return frame[numeric_features + categorical_features], labels


def split_data(
    X: pd.DataFrame,
    y: pd.Series,
    test_size: float,
    val_size: float,
    random_state: int,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.Series]:
    """Produce stratified train / validation / test partitions.

    ``test_size`` and ``val_size`` are both given as fractions of the full dataset.
    The test split is carved off first; the validation split is then taken from
    the remainder using a fraction rescaled to that remainder.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    # Express the validation share relative to what is left once the test rows are gone.
    val_share_of_remainder = val_size / (1.0 - test_size)

    X_remainder, X_test, y_remainder, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    # The second split uses a different seed than the first.
    X_train, X_val, y_train, y_val = train_test_split(
        X_remainder,
        y_remainder,
        test_size=val_share_of_remainder,
        stratify=y_remainder,
        random_state=random_state + 1,
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

In [5]:
def prepare_numeric(
    X_train: pd.DataFrame,
    X_val: pd.DataFrame,
    X_test: pd.DataFrame,
    numeric_cols: List[str],
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, StandardScaler]:
    """Impute and standardise the numeric columns using training-split statistics only.

    Missing values in every split are replaced by the training medians, and the
    scaler is fitted on the imputed training data before being applied to all
    three splits.

    Returns:
        ``(train_array, val_array, test_array, scaler)`` with float32 arrays.
    """
    medians = X_train[numeric_cols].median()
    scaler = StandardScaler()
    scaler.fit(X_train[numeric_cols].fillna(medians))

    train_array, val_array, test_array = (
        scaler.transform(split[numeric_cols].fillna(medians)).astype(np.float32)
        for split in (X_train, X_val, X_test)
    )
    return train_array, val_array, test_array, scaler


def encode_categorical(
    X_train: pd.DataFrame,
    X_val: pd.DataFrame,
    X_test: pd.DataFrame,
    categorical_cols: List[str],
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, Dict[str, Dict[Any, int]]]:
    """Integer-encode categorical columns with codes learned from the training split.

    Each distinct non-null training value gets a code starting at 1, in order of
    first appearance. Missing values and values not seen in training map to 0.

    Returns:
        ``(train_array, val_array, test_array, mappings)`` where each array is
        float32 with one column per categorical feature (zero columns when
        ``categorical_cols`` is empty) and ``mappings`` holds the per-column
        value-to-code tables.
    """
    splits = (X_train, X_val, X_test)
    encoded_by_split: Tuple[List[np.ndarray], ...] = ([], [], [])
    mappings: Dict[str, Dict[Any, int]] = {}

    for col in categorical_cols:
        seen_values = X_train[col].dropna().unique().tolist()
        codes = dict(zip(seen_values, range(1, len(seen_values) + 1)))
        mappings[col] = codes
        for split, bucket in zip(splits, encoded_by_split):
            bucket.append(split[col].map(codes).fillna(0).astype(np.int32).to_numpy())

    def to_matrix(columns: List[np.ndarray], n_rows: int) -> np.ndarray:
        # With no categorical columns, keep the row count so later concatenation still works.
        if not columns:
            return np.zeros((n_rows, 0), dtype=np.float32)
        return np.stack(columns, axis=1).astype(np.float32)

    train_array, val_array, test_array = (
        to_matrix(bucket, len(split)) for bucket, split in zip(encoded_by_split, splits)
    )
    return train_array, val_array, test_array, mappings


def assemble_arrays(
    numeric_train: np.ndarray,
    numeric_val: np.ndarray,
    numeric_test: np.ndarray,
    categorical_train: np.ndarray,
    categorical_val: np.ndarray,
    categorical_test: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Join numeric and categorical blocks column-wise for each split.

    Numeric columns come first, categorical columns second.

    Returns:
        ``(train_features, val_features, test_features)``.
    """
    paired_blocks = (
        (numeric_train, categorical_train),
        (numeric_val, categorical_val),
        (numeric_test, categorical_test),
    )
    train_features, val_features, test_features = (
        np.concatenate(blocks, axis=1) for blocks in paired_blocks
    )
    return train_features, val_features, test_features

In [6]:
def threshold_metrics(y_true: np.ndarray, y_prob: np.ndarray, threshold: float) -> Dict[str, Any]:
    """Summarise binary-classification quality at one decision threshold.

    A sample is predicted positive when its score is at or above ``threshold``.
    Every rate falls back to 0.0 when its denominator is zero.

    Returns:
        Dict with the threshold, precision, recall, F1, false-positive rate,
        true-negative rate, and the four confusion-matrix counts.
    """
    flagged = (y_prob >= threshold).astype(int)
    is_pos = y_true == 1
    is_neg = y_true == 0

    tp = int(np.sum((flagged == 1) & is_pos))
    fp = int(np.sum((flagged == 1) & is_neg))
    fn = int(np.sum((flagged == 0) & is_pos))
    tn = int(np.sum((flagged == 0) & is_neg))

    def rate(numerator: int, denominator: int) -> float:
        # Guard against empty denominators (e.g. nothing flagged at this threshold).
        return numerator / denominator if denominator else 0.0

    precision = rate(tp, tp + fp)
    recall = rate(tp, tp + fn)
    denom = precision + recall
    f1 = (2 * precision * recall / denom) if denom else 0.0

    summary: Dict[str, Any] = {}
    summary['threshold'] = threshold
    summary['precision'] = precision
    summary['recall'] = recall
    summary['f1'] = f1
    summary['fpr'] = rate(fp, fp + tn)
    summary['tnr'] = rate(tn, tn + fp)
    summary['tp'] = tp
    summary['fp'] = fp
    summary['fn'] = fn
    summary['tn'] = tn
    return summary


def select_thresholds(
    y_true: np.ndarray,
    y_prob: np.ndarray,
    target_fpr: float | None,
) -> Dict[str, Dict[str, Any]]:
    """Evaluate the scores under several threshold-selection strategies.

    Strategies:
        * ``'default'`` — fixed threshold of 0.5.
        * ``'best_f1'`` — the precision-recall-curve threshold with the highest F1.
        * ``'target_fpr'`` — only when ``target_fpr`` is given: among 501 evenly
          spaced thresholds in [0, 1], the one with the highest recall whose
          false-positive rate does not exceed the target. Omitted when no
          grid point qualifies.

    Returns:
        Mapping from strategy name to the ``threshold_metrics`` dict at that threshold.
    """
    results: Dict[str, Dict[str, Any]] = {'default': threshold_metrics(y_true, y_prob, 0.5)}

    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    # The curve has one more point than it has thresholds; pad with 1.0 so indices line up.
    candidate_thresholds = np.append(thresholds, 1.0)
    pr_sum = precision + recall
    f1_curve = np.divide(
        2 * precision * recall,
        pr_sum,
        out=np.zeros_like(pr_sum),
        where=pr_sum > 0,
    )
    f1_threshold = float(candidate_thresholds[int(f1_curve.argmax())])
    results['best_f1'] = threshold_metrics(y_true, y_prob, f1_threshold)

    if target_fpr is None:
        return results

    grid_metrics = (
        threshold_metrics(y_true, y_prob, float(candidate))
        for candidate in np.linspace(0.0, 1.0, num=501)
    )
    viable: List[Dict[str, Any]] = [m for m in grid_metrics if m['fpr'] <= target_fpr]
    if viable:
        # max() keeps the first (lowest-threshold) entry among ties on recall.
        results['target_fpr'] = max(viable, key=lambda item: item['recall'])
    return results


def evaluate_split(
    model: xgb.XGBClassifier,
    features: np.ndarray,
    labels: np.ndarray,
    target_fpr: float | None,
) -> Dict[str, Any]:
    """Score one data split with the fitted model and collect summary metrics.

    Returns:
        Dict with ROC AUC, the area under the precision-recall curve, the
        per-strategy threshold metrics from ``select_thresholds``, and the
        fraction of positive labels in the split.
    """
    # Column 1 of predict_proba is the positive-class score.
    scores = model.predict_proba(features)[:, 1]

    summary: Dict[str, Any] = {}
    summary['roc_auc'] = roc_auc_score(labels, scores)

    pr_precision, pr_recall, _ = precision_recall_curve(labels, scores)
    summary['pr_auc'] = auc(pr_recall, pr_precision)

    summary['thresholds'] = select_thresholds(labels, scores, target_fpr)
    summary['positive_rate'] = float(labels.mean())
    return summary

In [7]:
def print_split_report(header: str, metrics: Dict[str, Any]) -> None:
    """Print the headline scores and per-threshold details for one evaluated split.

    Args:
        header: Banner line printed first (may start with a newline for spacing).
        metrics: Result dict as returned by ``evaluate_split``.
    """
    print(header)
    print(f"ROC AUC: {metrics['roc_auc']:.4f}")
    print(f"PR AUC : {metrics['pr_auc']:.4f}")
    print(f"Positive prevalence: {metrics['positive_rate']:.4%}")
    for name, detail in metrics['thresholds'].items():
        print(f"\nThreshold strategy: {name}")
        for key, value in detail.items():
            # Confusion-matrix counts are integers; everything else is a rate or threshold.
            if key in {'tp', 'fp', 'fn', 'tn'}:
                print(f"  {key.upper():<3}: {value}")
            else:
                print(f"  {key:<10}: {value:.4f}")


# --- Load data and build features -------------------------------------------
config = CONFIG.copy()
df = load_transactions(Path(config['dataset']), config['max_samples'])
X, y = engineer_features(df)
numeric_cols = [
    'amount_received',
    'amount_paid',
    'amount_diff',
    'amount_ratio',
    'hour',
    'dayofweek',
    'month',
    'is_weekend',
    'same_bank',
    'same_account',
    'is_round_amount',
]
categorical_cols = [
    'from_bank',
    'to_bank',
    'receiving_currency',
    'payment_currency',
    'payment_format',
]

# --- Split, then fit preprocessing on the training split only ----------------
X_train, X_val, X_test, y_train, y_val, y_test = split_data(
    X, y,
    test_size=config['test_size'],
    val_size=config['val_size'],
    random_state=config['random_state'],
)
num_train, num_val, num_test, scaler = prepare_numeric(X_train, X_val, X_test, numeric_cols)
cat_train, cat_val, cat_test, mappings = encode_categorical(X_train, X_val, X_test, categorical_cols)
train_features, val_features, test_features = assemble_arrays(num_train, num_val, num_test, cat_train, cat_val, cat_test)
y_train_np = y_train.to_numpy()
y_val_np = y_val.to_numpy()
y_test_np = y_test.to_numpy()

# --- Train --------------------------------------------------------------------
# Weight the positive class by the negative/positive ratio of the training labels;
# the max() guard avoids dividing by zero when there are no positives.
negatives = float((y_train_np == 0).sum())
positives = float((y_train_np == 1).sum())
scale_pos_weight = negatives / max(positives, 1.0)
model = xgb.XGBClassifier(
    n_estimators=config['n_estimators'],
    learning_rate=config['learning_rate'],
    max_depth=config['max_depth'],
    subsample=config['subsample'],
    colsample_bytree=config['colsample_bytree'],
    gamma=config['gamma'],
    reg_lambda=config['reg_lambda'],
    objective='binary:logistic',
    tree_method='hist',
    eval_metric='aucpr',
    scale_pos_weight=scale_pos_weight,
    random_state=config['random_state'],
)
# NOTE(review): no early-stopping option is passed, so the validation set appears
# to be used for metric tracking only — confirm this is intended.
model.fit(train_features, y_train_np, eval_set=[(val_features, y_val_np)], verbose=False)

# --- Evaluate -----------------------------------------------------------------
# NOTE(review): evaluate_split re-selects the 'best_f1' and 'target_fpr' thresholds
# on whichever split it is given, so the test-split threshold metrics are tuned on
# test labels and are likely optimistic. Consider applying validation thresholds.
val_metrics = evaluate_split(model, val_features, y_val_np, config['target_fpr'])
test_metrics = evaluate_split(model, test_features, y_test_np, config['target_fpr'])
print_split_report('=== Validation Metrics ===', val_metrics)
print_split_report('\n=== Test Metrics ===', test_metrics)

# --- Optional JSON report -------------------------------------------------------
if config['report']:
    report_path = Path(config['report'])
    report_path.parent.mkdir(parents=True, exist_ok=True)
    payload = {
        'validation': val_metrics,
        'test': test_metrics,
    }
    with report_path.open('w', encoding='utf-8') as handle:
        json.dump(payload, handle, indent=2)
    print(f"\nMetrics saved to {report_path}")

=== Validation Metrics ===
ROC AUC: 0.9644
PR AUC : 0.1080
Positive prevalence: 0.1020%

Threshold strategy: default
  threshold : 0.5000
  precision : 0.0107
  recall    : 0.8205
  f1        : 0.0211
  fpr       : 0.0774
  tnr       : 0.9226
  TP : 425
  FP : 39250
  FN : 93
  TN : 468067

Threshold strategy: best_f1
  threshold : 0.9813
  precision : 0.1840
  recall    : 0.1776
  f1        : 0.1807
  fpr       : 0.0008
  tnr       : 0.9992
  TP : 92
  FP : 408
  FN : 426
  TN : 506909

Threshold strategy: target_fpr
  threshold : 0.6460
  precision : 0.0155
  recall    : 0.7683
  f1        : 0.0304
  fpr       : 0.0499
  tnr       : 0.9501
  TP : 398
  FP : 25292
  FN : 120
  TN : 482025

=== Test Metrics ===
ROC AUC: 0.9689
PR AUC : 0.0943
Positive prevalence: 0.1019%

Threshold strategy: default
  threshold : 0.5000
  precision : 0.0112
  recall    : 0.8551
  f1        : 0.0220
  fpr       : 0.0773
  tnr       : 0.9227
  TP : 885
  FP : 78466
  FN : 150
  TN : 936168

Threshold str