In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report


In [3]:
# Load Dataset
df = pd.read_csv("creditcard.csv")
print("Dataset Loaded Successfully!")

Dataset Loaded Successfully!


In [8]:
df.shape

(284807, 31)

In [4]:

# Data Exploration
print(df.head())
print("Class Distribution:\n", df['Class'].value_counts())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [5]:
# Standardize 'Amount' and 'Time'
scaler = StandardScaler()
df[['Time', 'Amount']] = scaler.fit_transform(df[['Time', 'Amount']])

In [6]:
# Check for null values
print("Missing Values in Dataset:\n", df.isnull().sum())

Missing Values in Dataset:
 Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


In [7]:
# Drop rows with missing values (if any)
df = df.dropna()
print("Dataset Shape after removing missing values:", df.shape)

Dataset Shape after removing missing values: (284807, 31)


In [10]:
# Split features and target
X = df.drop(columns=['Class'])
y = df['Class']

In [11]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
print("Resampling Completed. New Class Distribution:\n", pd.Series(y_resampled).value_counts())



Resampling Completed. New Class Distribution:
 Class
0    284315
1    284315
Name: count, dtype: int64


In [12]:
# Handle Class Imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
print("Resampling Completed. New Class Distribution:\n", pd.Series(y_resampled).value_counts())



Resampling Completed. New Class Distribution:
 Class
0    284315
1    284315
Name: count, dtype: int64


In [13]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


In [14]:
# Train Model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)
model.fit(X_train, y_train)
print("Model Training Completed!")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Model Training Completed!


In [15]:

# Save Model
joblib.dump(model, "fraud_model.pkl")
print("Model Saved Successfully!")


Model Saved Successfully!


In [16]:

# Evaluate Model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9997362080790673
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56750
           1       1.00      1.00      1.00     56976

    accuracy                           1.00    113726
   macro avg       1.00      1.00      1.00    113726
weighted avg       1.00      1.00      1.00    113726



In [18]:
# Predict on Sample Transaction
sample_transaction = np.array([0, -1.36, -0.07, 2.53, 1.37, -0.34, 0.46, 0.24, 0.10, 0.36, -0.02, 0.28, -0.11, 0.07, 0.13, -0.19, 0.13, -0.02, 149.62]).reshape(1, -1)

# Select a random sample from X_test
sample_transaction = X_test.iloc[0].values.reshape(1, -1)

# Predict fraud or not
prediction = model.predict(sample_transaction)

# Display result
print("Fraud Detected!" if prediction[0] == 1 else "Transaction is Safe.")

prediction = model.predict(sample_transaction)
print("Fraud Detected!" if prediction[0] == 1 else "Transaction is Safe.")


Fraud Detected!
Fraud Detected!


In [19]:
# Select a random non-fraudulent transaction
sample_transaction = X_test[y_test == 0].iloc[0].values.reshape(1, -1)

# Predict fraud or not
prediction = model.predict(sample_transaction)

# Display result
print("Fraud Detected!" if prediction[0] == 1 else "Transaction is Safe.")

# Show the sample transaction values
print("Sample Transaction Features:\n", sample_transaction)

Transaction is Safe.
Sample Transaction Features:
 [[-1.90672886e+00 -8.51993654e-01  9.35552982e-01  2.18813650e+00
   7.09286015e-01  1.78929529e-01 -3.49334702e-01  8.57609496e-01
  -4.16251530e-01  1.06637507e+00 -4.74828093e-01  5.47065628e-01
  -1.83600921e+00  2.85384552e+00  7.66582287e-01 -7.97916804e-01
  -8.39402074e-01  8.73381048e-01 -3.71731882e-01  8.90543646e-01
   2.70932774e-01 -4.11022437e-01 -4.04761630e-01 -9.68926976e-02
   4.04475934e-01  3.77388443e-01  3.44173040e-01 -3.11257097e-04
  -1.25090224e-01 -2.31167821e-01]]
