In [None]:
# Import Necessary Libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score, roc_curve,
    precision_recall_curve, auc, average_precision_score
)
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the Data
# Remote CSV; later cells expect it to provide 'Class', 'Time' and 'Amount' columns.
DATA_URL = 'https://classfiles.blob.core.windows.net/files/creditcard.csv'
data = pd.read_csv(DATA_URL)

In [None]:
# Text overview, printed in order: leading rows, summary statistics,
# then the number of rows per 'Class' value.
for overview in (data.head(), data.describe(), data['Class'].value_counts()):
    print(overview)

# Bar chart of the row count for each 'Class' value
class_ax = sns.countplot(data=data, x='Class')
class_ax.set_title('Class Distribution')
plt.show()

In [None]:
# Separate Features and Target Variable
# y is the 'Class' column; X is every other column of `data` (a new frame,
# `data` itself is not modified).
y = data['Class']
X = data.drop(columns='Class')

In [None]:
# Initialize the scaler
scaler = StandardScaler()

# Fit on the 'Amount' and 'Time' features
X[['Time', 'Amount']] = scaler.fit_transform(X[['Time', 'Amount']])

In [None]:
# Split the Data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Handle Class Imbalance with SMOTE
# Only the training split is resampled; X_test / y_test are not touched here.
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

# Verify the new class distribution
resampled_counts = pd.Series(y_resampled).value_counts()
print(resampled_counts)

In [None]:
# Train the Model
# Random forest with 100 trees and a fixed seed, fit on the SMOTE-resampled
# training data.
rf_params = {'n_estimators': 100, 'random_state': 42}
clf = RandomForestClassifier(**rf_params)
clf.fit(X_resampled, y_resampled)

In [None]:
# Make Predictions
# Column 1 of predict_proba corresponds to the second entry of clf.classes_
# (presumably the positive label 1 -- confirm against clf.classes_).
class_probabilities = clf.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]
y_pred = clf.predict(X_test)

In [None]:
# Evaluate the Model
# Confusion matrix of test labels vs. hard predictions, drawn as an annotated
# heatmap with integer cell labels.
cm = confusion_matrix(y_test, y_pred)
cm_ax = sns.heatmap(cm, annot=True, fmt='d')
cm_ax.set_title('Confusion Matrix')
cm_ax.set_ylabel('Actual Class')
cm_ax.set_xlabel('Predicted Class')
plt.show()

In [None]:
# Classification Report
# Text report built from the test labels and the hard predictions.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# ROC AUC Score
# Computed from the predicted probabilities (not the hard labels).
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)
print(f'ROC AUC Score: {roc_auc:.4f}')

In [None]:
# Precision-Recall Curve and AUC
# Two summaries of the same curve: average precision, and the area obtained by
# passing the (recall, precision) points to auc().
avg_precision = average_precision_score(y_test, y_pred_prob)
pr_points = precision_recall_curve(y_test, y_pred_prob)
precision, recall, thresholds = pr_points
pr_auc = auc(recall, precision)
print(f'Precision-Recall AUC: {pr_auc:.4f}')
print(f'Average Precision Score: {avg_precision:.4f}')

In [None]:
# ROC Curve
# Note: this rebinds `thresholds` to the ROC thresholds.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.4f})')
# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference line
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_ax.set_title('Receiver Operating Characteristic')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend()
plt.show()

In [None]:
# Precision-Recall Curve
# Plots the precision/recall arrays computed earlier, labelled with pr_auc.
pr_fig, pr_ax = plt.subplots(figsize=(8, 6))
pr_ax.plot(recall, precision, label=f'PR Curve (AUC = {pr_auc:.4f})')
pr_ax.set_title('Precision-Recall Curve')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.legend()
plt.show()

In [None]:
# Random undersampling of the training split.
# NOTE(review): this cell was previously headed "Adjust Class Weights", but no
# class weights are set here -- it only resamples X_train / y_train with
# RandomUnderSampler. X_under / y_under are not referenced by any later cell
# visible in this notebook.
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_resample(X_train, y_train)

In [None]:
# Cross-Validation with Stratified K-Folds
# SMOTE must be applied INSIDE each training fold. Cross-validating on data
# that was resampled up front lets synthetic points interpolated from a
# validation fold's minority rows leak into the training folds (and puts
# synthetic rows into the validation folds), which inflates the scores.
# The imblearn Pipeline runs the sampler during fit only, so every validation
# fold consists of original, un-resampled rows.
from sklearn.base import clone
from imblearn.pipeline import Pipeline as ImbPipeline

cv_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    # Unfitted copy with the same hyper-parameters as the trained model
    ('clf', clone(clf)),
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    cv_pipeline, X_train, y_train, cv=skf, scoring='roc_auc'
)
print(f'Cross-validated ROC AUC scores: {cv_scores}')
print(f'Mean ROC AUC score: {cv_scores.mean():.4f}')

In [None]:
import joblib

# Output file names, relative to the current working directory
MODEL_PATH = 'fraud_detection_model.pkl'
SCALER_PATH = 'scaler.pkl'

# Persist the trained classifier, then the fitted scaler
joblib.dump(clf, MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)