In [1]:
!pip install pandas numpy scikit-learn xgboost optuna kagglehub matplotlib seaborn joblib plotly imbalanced-learn

Collecting optuna
  Downloading optuna-4.7.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.7.0-py3-none-any.whl (413 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.9/413.9 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.7.0


In [4]:
import pandas as pd
import numpy as np
import os
import kagglehub
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import joblib
import matplotlib.pyplot as plt

# Set the working directory for colab
project_dir = '/content/credit-card-fraud-detection'
os.chdir(project_dir)
print(f"Current working directory: {os.getcwd()}")

# Now you can import directly from the local file
from models import XGBoostModel

Current working directory: /content/credit-card-fraud-detection


In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
df = pd.read_csv(os.path.join(path, "creditcard.csv"))

print(f"Dataset loaded: {df.shape}")
print(f"Fraud count: {df['Class'].sum()} ({df['Class'].sum()/len(df)*100:.4f}%)")

Using Colab cache for faster access to the 'creditcardfraud' dataset.
Dataset loaded: (284807, 31)
Fraud count: 492 (0.1727%)


In [7]:
X = df.drop(['Class', 'Time'], axis=1)
y = df['Class']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

Features shape: (284807, 29)
Target shape: (284807,)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2,random_state=34666,stratify=y)

print(f"\nTrain samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Train frauds: {sum(y_train)} ({sum(y_train)/len(y_train)*100:.4f}%)")
print(f"Test frauds: {sum(y_test)} ({sum(y_test)/len(y_test)*100:.4f}%)")


Train samples: 227845
Test samples: 56962
Train frauds: 394 (0.1729%)
Test frauds: 98 (0.1720%)


In [9]:
n_fraud_original = sum(y_train == 1)
target_frauds = int(n_fraud_original * 2)

print(f"Original frauds: {n_fraud_original}")

smote = SMOTE(sampling_strategy={1: target_frauds},k_neighbors=5,random_state=3366)

X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"After SMOTE (2x): {sum(y_train_resampled == 1)} frauds")
print(f"Created {sum(y_train_resampled == 1) - n_fraud_original} synthetic frauds")
print(f"Total training samples: {len(X_train_resampled)}")

Original frauds: 394
After SMOTE (2x): 788 frauds
Created 394 synthetic frauds
Total training samples: 228239


In [10]:
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_resampled,
    y_train_resampled,
    test_size=0.2,
    random_state=34666,
    shuffle=True
)

print(f"Training samples: {len(X_train_final)}")
print(f"Validation samples: {len(X_val)}")
print(f"Test samples: {len(X_test)}")

Training samples: 182591
Validation samples: 45648
Test samples: 56962


In [11]:
feature_names = [f'V{i}' for i in range(1, 29)] + ['Amount']

X_train_df = pd.DataFrame(X_train_final, columns=feature_names)
X_val_df = pd.DataFrame(X_val, columns=feature_names)
X_test_df = pd.DataFrame(X_test, columns=feature_names)

y_train_series = pd.Series(y_train_final.values if hasattr(y_train_final, 'values') else y_train_final)
y_val_series = pd.Series(y_val.values if hasattr(y_val, 'values') else y_val)
y_test_series = pd.Series(y_test.values)

print("Data converted to DataFrames!")
print(f"X_train_df shape: {X_train_df.shape}")
print(f"X_val_df shape: {X_val_df.shape}")
print(f"X_test_df shape: {X_test_df.shape}")

Data converted to DataFrames!
X_train_df shape: (182591, 29)
X_val_df shape: (45648, 29)
X_test_df shape: (56962, 29)


In [12]:
param_distributions = {
    'n_estimators': (100, 500),
    'learning_rate': (0.01, 0.3),
    'max_depth': (3, 10),
    'min_child_weight': (1, 10),
    'subsample': (0.6, 1.0),
    'colsample_bytree': (0.6, 1.0),
    'gamma': (0.0, 5.0),
    'reg_alpha': (0.0, 1.0),
    'reg_lambda': (0.0, 1.0),
    'scale_pos_weight': 577.0,
    'random_state': 355533,
    'use_gpu': True
}

In [14]:
import torch
gpu_available = torch.cuda.is_available()
print(f"\nPyTorch GPU available: {gpu_available}")
if gpu_available:
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")


PyTorch GPU available: True
GPU Name: Tesla T4


In [15]:
xgb_model = XGBoostModel(use_gpu=True, random_state=355533)
xgb_model

XGBoostModel(fitted=False, n_estimators=100, device=gpu)

In [16]:
tuning_results = xgb_model.tune_hyperparameters(
    X_train=X_train_df,
    y_train=y_train_series,
    X_val=X_val_df,
    y_val=y_val_series,
    param_distributions=param_distributions,
    n_trials=50,
    metric='pr_auc'
)

[I 2026-02-15 01:04:47,142] A new study created in memory with name: no-name-daed0cab-e1ed-4f97-8373-040f3e4b04e7


Starting hyperparameter tuning with 50 trials...
Optimizing for: pr_auc


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2026-02-15 01:04:49,717] Trial 0 finished with value: 0.9458493702327777 and parameters: {'n_estimators': 240, 'learning_rate': 0.12996248755230635, 'max_depth': 10, 'min_child_weight': 3, 'subsample': 0.7313791861551227, 'colsample_bytree': 0.9813078053986383, 'gamma': 2.665968726799695, 'reg_alpha': 0.18680252486400095, 'reg_lambda': 0.4859266303923111}. Best is trial 0 with value: 0.9458493702327777.
[I 2026-02-15 01:04:50,958] Trial 1 finished with value: 0.9482311615493856 and parameters: {'n_estimators': 195, 'learning_rate': 0.21822244049315156, 'max_depth': 6, 'min_child_weight': 7, 'subsample': 0.7926992533738794, 'colsample_bytree': 0.8413336558637108, 'gamma': 3.637872169595659, 'reg_alpha': 0.19374496902894012, 'reg_lambda': 0.8420885572193478}. Best is trial 1 with value: 0.9482311615493856.
[I 2026-02-15 01:04:52,221] Trial 2 finished with value: 0.9456935824501527 and parameters: {'n_estimators': 167, 'learning_rate': 0.18491165673191906, 'max_depth': 7, 'min_child_we

In [17]:
print("="*70)
print("BEST HYPERPARAMETERS")
print("="*70)

best_params = tuning_results['best_params']
best_score = tuning_results['best_score']

print(f"\nBest PR-AUC: {best_score:.4f}")
print("\nBest Parameters:")
for key, value in best_params.items():
    print(f"  {key}: {value}")

BEST HYPERPARAMETERS

Best PR-AUC: 0.9563

Best Parameters:
  n_estimators: 271
  learning_rate: 0.07289590286774396
  max_depth: 10
  min_child_weight: 1
  subsample: 0.6259989306417418
  colsample_bytree: 0.870047895607214
  gamma: 0.816411782606675
  reg_alpha: 0.567984473554783
  reg_lambda: 0.6847738823413013


In [18]:
import optuna.visualization as vis

study = tuning_results['study']

fig = vis.plot_optimization_history(study)
fig.update_layout(width=1200, height=600, title_font_size=20)
fig.show()

In [19]:
fig = vis.plot_param_importances(study)
fig.update_layout(width=1200, height=700, title_font_size=20)
fig.show()

In [20]:
import joblib
joblib.dump(study, 'xgboost_optuna_study.pkl')
print("Optuna study saved to: xgboost_optuna_study.pkl")

Optuna study saved to: xgboost_optuna_study.pkl
