In [32]:
import numpy as np
import lightgbm as lgb
from scipy import special
import pandas as pd
from sklearn import model_selection
from sklearn import metrics


from watermark import watermark
print(watermark())
print(watermark(packages="numpy,scipy,lightgbm,pandas,sklearn"))


Last updated: 2025-03-17T13:55:07.087982+01:00

Python implementation: CPython
Python version       : 3.12.5
IPython version      : 9.0.2

Compiler    : Clang 18.1.8 
OS          : Darwin
Release     : 24.3.0
Machine     : arm64
Processor   : arm
CPU cores   : 11
Architecture: 64bit

numpy   : 2.2.3
scipy   : 1.15.2
lightgbm: 4.6.0
pandas  : 2.2.3
sklearn : 1.6.1



In [9]:
df = pd.read_csv('https://raw.githubusercontent.com/shalakasaraogi/credit-card-fraud-detection/refs/heads/main/csv%20files/creditcard.csv')
X = df.drop(columns='Class')
y = df['Class']

In [10]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,random_state=42)
X_fit, X_val, y_fit, y_val = model_selection.train_test_split(X_train, y_train, random_state=42)

In [11]:
X_train.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
5514,36243,-1.043258,0.423002,1.597521,1.255246,0.814673,2.583459,0.740716,0.839617,-0.887058,...,-0.107516,0.236455,0.625517,0.14999,-1.393902,-0.177389,0.005289,0.108046,0.102612,139.54
1266,1041,0.892379,-0.345991,0.05128,0.741072,-0.452616,-0.724437,0.415932,-0.215842,-0.235292,...,0.271976,0.108916,0.080448,-0.254846,0.638831,0.568469,0.503576,-0.083778,0.024184,173.0
5864,36398,-0.317209,0.613389,2.665831,1.595787,-0.959235,-0.174069,-0.307393,0.158075,-0.090407,...,0.201814,0.405945,1.228985,-0.069935,0.744616,-0.549266,-0.09568,0.232602,0.180096,21.07
15865,40721,-0.96816,-0.623144,1.213146,-0.168551,0.320251,0.879488,-0.012024,0.130865,-0.976997,...,-0.53048,-0.269016,0.131881,0.403562,-1.324961,0.417803,-0.13095,0.126224,-0.078357,79.0
12892,39432,0.990027,-0.176854,0.284344,1.542821,-0.532993,-0.630936,0.226393,-0.124951,0.435417,...,0.005939,-0.073231,-0.314321,-0.17918,0.378742,0.661057,-0.350789,-0.004524,0.036824,120.99


In [13]:
fit = lgb.Dataset(X_fit, y_fit)
val = lgb.Dataset(X_val, y_val, reference=fit)

In [36]:
def logloss_metric(preds, train_data):
    y = train_data.get_label()
    p = special.expit(preds)

    ll = np.empty_like(p)
    pos = y == 1
    ll[pos] = np.log(p[pos])
    ll[~pos] = np.log(1 - p[~pos])

    is_higher_better = False
    return 'logloss', -ll.mean(), is_higher_better


model = lgb.train(
    params={
        'learning_rate': 0.01,
        'objective': 'binary'
    },
    train_set=fit,
    num_boost_round=10000,
    valid_sets=(fit, val),
    valid_names=('fit', 'val'),
    callbacks=[
            lgb.early_stopping(stopping_rounds=20),  # Early stopping callback
            lgb.log_evaluation(period=20)           # Verbosity callback
    ],
    feval=logloss_metric
)

y_pred = model.predict(X_test)

print()
print("=" * 22)
print(f"Test's ROC AUC: {metrics.roc_auc_score(y_test, y_pred):.3f}")
print(f"Test's logloss: {metrics.log_loss(y_test, y_pred):.3f}")
print("=" * 22)


# Training + Prediction on `Test`

[LightGBM] [Info] Number of positive: 43, number of negative: 11207
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000958 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 11250, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.003822 -> initscore=-5.563094
[LightGBM] [Info] Start training from score -5.563094
Training until validation scores don't improve for 20 rounds
[20]	fit's binary_logloss: 0.0131052	fit's logloss: 0.69471	val's binary_logloss: 0.0180189	val's logloss: 0.694851
[40]	fit's binary_logloss: 0.00918859	fit's logloss: 0.694265	val's binary_logloss: 0.017663	val's logloss: 0.694591
[60]	fit's binary_logloss: 0.00698558	fit's logloss: 0.693865	val's binary_logloss: 0.0174784	val's logloss: 0.694352
[80]	fit's binary_logloss: 0.00554444	fit's logloss: 0.693522	val's binary_logloss: 0.0174456	val

In [41]:
def logloss_objective(preds, train_data):
    y = train_data.get_label()  # True labels
    p = special.expit(preds)    # Sigmoid to convert raw predictions to probabilities
    grad = p - y                # Gradient of log loss
    hess = p * (1 - p)          # Hessian of log loss
    return grad, hess

def logloss_init_score(y):
    p = y.mean()
    p = np.clip(p, 1e-15, 1 - 1e-15)  # never hurts
    log_odds = np.log(p / (1 - p))
    return log_odds

fit = lgb.Dataset(
    X_fit, y_fit, init_score=np.full_like(y_fit, logloss_init_score(y_fit), dtype=float)
)

val = lgb.Dataset(
    X_val, y_val, init_score=np.full_like(y_val, logloss_init_score(y_fit), dtype=float), reference=fit
)


model = lgb.train(
    params={
        'learning_rate': 0.01,
        'objective': logloss_objective
    },
    train_set=fit,
    num_boost_round=10000,
    valid_sets=(fit, val),
    valid_names=('fit', 'val'),
    callbacks=[
            lgb.early_stopping(stopping_rounds=20),  # Early stopping callback
            lgb.log_evaluation(period=20)           # Verbosity callback
    ],
    feval=logloss_metric
)

y_pred = special.expit(logloss_init_score(y_fit) + model.predict(X_test))

print()
print("=" * 22)
print(f"Test's ROC AUC: {metrics.roc_auc_score(y_test, y_pred):.3f}")
print(f"Test's logloss: {metrics.log_loss(y_test, y_pred):.3f}")
print("=" * 22)

[LightGBM] [Info] Using self-defined objective function
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000633 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 11250, number of used features: 30
[LightGBM] [Info] Using self-defined objective function
Training until validation scores don't improve for 20 rounds
[20]	fit's logloss: 0.0131052	val's logloss: 0.0180189
[40]	fit's logloss: 0.00918859	val's logloss: 0.017663
[60]	fit's logloss: 0.00698558	val's logloss: 0.0174784
[80]	fit's logloss: 0.00554444	val's logloss: 0.0174456
Early stopping, best iteration is:
[75]	fit's logloss: 0.00586475	val's logloss: 0.0174426

Test's ROC AUC: 0.479
Test's logloss: 0.013
