In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, recall_score
import gc
import os
import matplotlib.pyplot as plt

In [2]:
df = pd.read_parquet('../data/processed/train.parquet')
X = df.drop(['isFraud', 'TransactionID', 'TransactionDT'], axis=1)
y = df['isFraud'].values
X = X.fillna(-999)

In [3]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Train Shape: {X_train.shape}, Val Shape: {X_val.shape}")

del df
gc.collect()

Train Shape: (472432, 443), Val Shape: (118108, 443)


10

In [4]:
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,       # Standard baseline
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'n_jobs': -1,           # Use all CPU cores
    'seed': 42,
    'verbose': -1
}

In [5]:
lgbm_model = lgb.train(
    params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, val_data],
    callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(100)]
)

Training until validation scores don't improve for 50 rounds
[100]	training's auc: 0.927523	valid_1's auc: 0.918718
[200]	training's auc: 0.947398	valid_1's auc: 0.93593
[300]	training's auc: 0.957646	valid_1's auc: 0.943067
[400]	training's auc: 0.964976	valid_1's auc: 0.948231
[500]	training's auc: 0.971893	valid_1's auc: 0.952332
[600]	training's auc: 0.976695	valid_1's auc: 0.954879
[700]	training's auc: 0.980715	valid_1's auc: 0.957459
[800]	training's auc: 0.98359	valid_1's auc: 0.959557
[900]	training's auc: 0.985882	valid_1's auc: 0.961171
[1000]	training's auc: 0.988051	valid_1's auc: 0.96259
Did not meet early stopping. Best iteration is:
[1000]	training's auc: 0.988051	valid_1's auc: 0.96259


In [6]:
os.makedirs('../outputs/models/', exist_ok=True)
lgbm_model.save_model('../outputs/models/lgbm_baseline.txt')
print("✅ Model saved to ../outputs/models/lgbm_baseline.txt")

✅ Model saved to ../outputs/models/lgbm_baseline.txt
