In [None]:
# Cell 1
import pandas as pd
import numpy as np
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import torch
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter("ignore", category=UserWarning)

# Load the engineered parquet files
train = pd.read_parquet('../data/processed/train.parquet')

# Define features and target
# For the baseline, we use isFraud as the proxy for high-risk credit behavior
X = train.drop(['isFraud', 'TransactionID', 'TransactionDT'], axis=1)
y = train['isFraud'].values

# Identify categorical indexes for TabNet
# TabNet needs the position (index) of categorical columns
cat_cols = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain',
            'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
            'id_12', 'id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29',
            'id_30', 'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
            'DeviceType', 'DeviceInfo']

cat_idxs = [i for i, f in enumerate(X.columns) if f in cat_cols] # cat_cols from 02.ipynb
cat_dims = [train[f].nunique() for f in X.columns if f in cat_cols]

# 1. Fill NaNs first (using a positive integer for categories to avoid IndexErrors)
# In TabNet, it's safer to use a large positive number like 9999 for missing categories
for col in cat_cols:
    X[col] = X[col].fillna(X[col].max() + 1)

X = X.fillna(-999) # Numerical fill

# 2. Re-calculate Dimensions based on current X
# We use max() + 1 because embedding layers index from 0 to max_value
cat_idxs = [i for i, f in enumerate(X.columns) if f in cat_cols]
cat_dims = [int(X[f].max() + 1) for f in X.columns if f in cat_cols]

# 3. Ensure all types are float32 (Required for PyTorch backend)
X = X.astype(np.float32)

# 4. Re-split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Features: {X.shape[1]}, Categories: {len(cat_idxs)}")

Features: 443, Categories: 31


In [2]:
# Cell 2
# Initialize TabNet Classifier
clf = TabNetClassifier(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=1,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":50, "gamma":0.9},
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    mask_type='entmax' # Better for sparse financial data
)

# Train the model
clf.fit(
    X_train=X_train.values, y_train=y_train,
    eval_set=[(X_train.values, y_train), (X_valid.values, y_valid)],
    eval_name=['train', 'valid'],
    eval_metric=['auc'],
    max_epochs=20,
    patience=5,
    batch_size=1024, # Smaller batch size for 16GB RAM
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

epoch 0  | loss: 0.13781 | train_auc: 0.80556 | valid_auc: 0.80306 |  0:04:42s
epoch 1  | loss: 0.1187  | train_auc: 0.83284 | valid_auc: 0.83055 |  0:09:05s
epoch 2  | loss: 0.11079 | train_auc: 0.85295 | valid_auc: 0.8474  |  0:13:39s
epoch 3  | loss: 0.10868 | train_auc: 0.86121 | valid_auc: 0.85752 |  0:19:05s
epoch 4  | loss: 0.10455 | train_auc: 0.80841 | valid_auc: 0.80538 |  0:22:27s
epoch 5  | loss: 0.10273 | train_auc: 0.86542 | valid_auc: 0.86109 |  0:25:14s
epoch 6  | loss: 0.10115 | train_auc: 0.8458  | valid_auc: 0.84062 |  0:28:10s
epoch 7  | loss: 0.09875 | train_auc: 0.86709 | valid_auc: 0.86368 |  0:30:56s
epoch 8  | loss: 0.09688 | train_auc: 0.88545 | valid_auc: 0.87874 |  0:33:37s
epoch 9  | loss: 0.09558 | train_auc: 0.87921 | valid_auc: 0.87211 |  0:36:20s
epoch 10 | loss: 0.09375 | train_auc: 0.8749  | valid_auc: 0.86889 |  0:39:03s
epoch 11 | loss: 0.0928  | train_auc: 0.85451 | valid_auc: 0.84908 |  0:41:46s
epoch 12 | loss: 0.0918  | train_auc: 0.86528 | vali

In [3]:
# Cell 3
preds = clf.predict_proba(X_valid.values)[:, 1]
valid_auc = roc_auc_score(y_valid, preds)
print(f"Final Validation AUC: {valid_auc:.4f}")

# Save the baseline model for Month 2 comparison
import os
clf.save_model('../outputs/models/tabnet_credit_baseline')

Final Validation AUC: 0.8787
Successfully saved model at ../outputs/models/tabnet_credit_baseline.zip


'../outputs/models/tabnet_credit_baseline.zip'