In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
import joblib

# ---------------------------------------------------------------------------
# Fraud classifier training cell:
#   load CSV -> stratified split -> standardize -> SMOTE -> Tomek links
#   -> XGBoost -> evaluate on the untouched test split -> persist artifacts.
# All resampling is applied to the training split only; the test split keeps
# its original class ratio so the reported metrics reflect real-world balance.
# ---------------------------------------------------------------------------

# Configuration: every tunable literal used below lives here.
DATA_PATH = "creditcard.csv"
MODEL_PATH = "xgb_model_tomek_smote.pkl"
SCALER_PATH = "scaler_tomek_smote.pkl"
TARGET_COLUMN = "Class"
TEST_SIZE = 0.2
SEED = 42

# Load the dataset
credit_card_data = pd.read_csv(DATA_PATH)

# Check for missing values (per-column null counts)
print("Missing values in dataset:", credit_card_data.isnull().sum())

# Check class distribution
print("Class distribution:\n", credit_card_data[TARGET_COLUMN].value_counts())

# Split features and target variable
X = credit_card_data.drop(columns=TARGET_COLUMN)
Y = credit_card_data[TARGET_COLUMN]

# Train-test split; stratify on the target so both splits keep the same
# class proportions as the full dataset.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=TEST_SIZE, stratify=Y, random_state=SEED
)

# Scale the data. The scaler is fitted on the training split only and then
# reused on the test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE for oversampling to balance the (training) dataset
smote = SMOTE(random_state=SEED)
X_smote, Y_smote = smote.fit_resample(X_train_scaled, Y_train)

# Apply Tomek Links after oversampling to remove borderline examples.
# n_jobs=-1 only parallelizes the nearest-neighbour search; it does not change
# which samples are removed.
tomek = TomekLinks(n_jobs=-1)
X_resampled, Y_resampled = tomek.fit_resample(X_smote, Y_smote)

# Train XGBoost model
xgb = XGBClassifier(random_state=SEED, eval_metric="logloss", n_jobs=-1)
xgb.fit(X_resampled, Y_resampled)

# Evaluate the model on the scaled, non-resampled test split.
# NOTE: with a heavily skewed target, accuracy is dominated by the majority
# class; read the per-class precision/recall in the report for the real picture.
predictions = xgb.predict(X_test_scaled)
print("\nXGBoost Model with SMOTE and Tomek Links")
print(f"Accuracy: {accuracy_score(Y_test, predictions):.4f}")
print(classification_report(Y_test, predictions, target_names=['Class 0', 'Class 1']))

# Persist the fitted scaler next to the model. The model was trained on
# standardized features, so anything that loads MODEL_PATH must apply this
# exact scaler to raw inputs before calling predict().
joblib.dump(scaler, SCALER_PATH)

# Save the XGBoost model
joblib.dump(xgb, MODEL_PATH)
print(f"Model saved as '{MODEL_PATH}'.")


Missing values in dataset: Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
Class distribution:
 Class
0    284315
1       492
Name: count, dtype: int64

XGBoost Model with SMOTE and Tomek Links
Accuracy: 0.9992
              precision    recall  f1-score   support

     Class 0       1.00      1.00      1.00     56864
     Class 1       0.73      0.85      0.78        98

    accuracy                           1.00     56962
   macro avg       0.86      0.92      0.89     56962
weighted avg       1.00      1.00      1.00     56962

Model saved as 'xgb_model_tomek_smote.pkl'.
