In [None]:
# Cell 1
import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import torch
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore", category=UserWarning)

# Load the engineered training set.
train = pd.read_parquet('../data/processed/train.parquet')

# Define features and target.
# For the baseline, we use isFraud as the proxy for high-risk credit behavior.
# The target and the two transaction identifier/time columns are excluded from the features.
X = train.drop(['isFraud', 'TransactionID', 'TransactionDT'], axis=1)
y = train['isFraud'].values

# Categorical columns (list carried over from 02.ipynb).
# NOTE(review): the code below calls .max() + 1 on these columns, so it assumes
# they are already encoded as non-negative integer codes -- confirm in 02.ipynb.
cat_cols = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain',
            'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
            'id_12', 'id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29',
            'id_30', 'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
            'DeviceType', 'DeviceInfo']

# 1. Fill NaNs in categorical columns with a dedicated "missing" code.
# The code is (largest observed code + 1), so it is positive and distinct from
# every real category. A column that is entirely NaN has no max; it gets code 0
# instead of silently falling through to the numeric -999 fill below, which
# would produce a negative (invalid) embedding index.
for col in cat_cols:
    col_max = X[col].max()
    missing_code = 0 if pd.isna(col_max) else col_max + 1
    X[col] = X[col].fillna(missing_code)

# Embedding lookups index from 0, so fail early (with a readable message) if any
# categorical column still carries a negative code.
negative_cat_cols = [c for c in cat_cols if X[c].min() < 0]
assert not negative_cat_cols, (
    f"Categorical columns with negative codes: {negative_cat_cols}"
)

# Numerical fill: every remaining NaN becomes the sentinel -999.
X = X.fillna(-999)

# 2. Categorical positions and embedding sizes, computed once on the filled X.
# TabNet needs the position (index) of each categorical column; the dimension is
# max() + 1 because embedding layers index from 0 to max_value.
cat_idxs = [i for i, f in enumerate(X.columns) if f in cat_cols]
cat_dims = [int(X[f].max() + 1) for f in X.columns if f in cat_cols]

# 3. Ensure all types are float32 (required for the PyTorch backend).
X = X.astype(np.float32)

# 4. Hold out 20% of the rows for validation (fixed seed for repeatability).
# NOTE(review): the split is random and unstratified; for a rare positive class
# consider stratify=y or a time-ordered split -- left unchanged here so the
# recorded results below stay comparable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Features: {X.shape[1]}, Categories: {len(cat_idxs)}")

Features: 443, Categories: 31


In [None]:
# Cell 2
# Model configuration: categorical embedding layout plus optimizer / LR schedule.
# NOTE(review): if the scheduler steps once per epoch, a StepLR step_size of 50
# never triggers within max_epochs=20 -- confirm this is intended.
tabnet_params = dict(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=1,
    optimizer_fn=torch.optim.Adam,
    optimizer_params={"lr": 2e-2},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params={"step_size": 50, "gamma": 0.9},
    mask_type='entmax',  # chosen by the author as better for sparse financial data
)
clf = TabNetClassifier(**tabnet_params)

# Training configuration: AUC is reported on both the training and validation splits.
eval_sets = [(X_train.values, y_train), (X_valid.values, y_valid)]
fit_params = dict(
    X_train=X_train.values,
    y_train=y_train,
    eval_set=eval_sets,
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    max_epochs=20,
    patience=5,
    batch_size=1024,  # smaller batch size for 16GB RAM
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
)

# Train the model
clf.fit(**fit_params)

epoch 0  | loss: 0.13542 | train_auc: 0.80235 | valid_auc: 0.80823 |  0:02:57s
epoch 1  | loss: 0.11874 | train_auc: 0.81678 | valid_auc: 0.82029 |  0:05:33s
epoch 2  | loss: 0.11347 | train_auc: 0.8498  | valid_auc: 0.85138 |  0:08:24s
epoch 3  | loss: 0.11166 | train_auc: 0.83448 | valid_auc: 0.83358 |  0:10:55s
epoch 4  | loss: 0.10878 | train_auc: 0.84228 | valid_auc: 0.83982 |  0:13:29s
epoch 5  | loss: 0.10545 | train_auc: 0.85274 | valid_auc: 0.85159 |  0:15:59s
epoch 6  | loss: 0.10299 | train_auc: 0.85219 | valid_auc: 0.84968 |  0:18:27s
epoch 7  | loss: 0.10115 | train_auc: 0.85055 | valid_auc: 0.84216 |  0:20:57s
epoch 8  | loss: 0.09923 | train_auc: 0.86631 | valid_auc: 0.8591  |  0:23:26s
epoch 9  | loss: 0.09834 | train_auc: 0.86728 | valid_auc: 0.86184 |  0:25:56s
epoch 10 | loss: 0.09681 | train_auc: 0.86288 | valid_auc: 0.85486 |  0:28:26s
epoch 11 | loss: 0.0975  | train_auc: 0.87604 | valid_auc: 0.86933 |  0:30:56s
epoch 12 | loss: 0.096   | train_auc: 0.87497 | vali

In [None]:
# Cell 3
import os

# Score the held-out split; column 1 of predict_proba is used as the fraud score.
valid_proba = clf.predict_proba(X_valid.values)
preds = valid_proba[:, 1]
valid_auc = roc_auc_score(y_valid, preds)
print(f"Final Validation AUC: {valid_auc:.4f}")

# Save the baseline model for Month 2 comparison
clf.save_model('../outputs/models/tabnet_credit_baseline')

Final Validation AUC: 0.8845
Successfully saved model at ../outputs/models/tabnet_credit_baseline.zip


'../outputs/models/tabnet_credit_baseline.zip'