In [None]:
import time

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_curve, auc, classification_report, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the transactions table; the path is relative to the notebook's working directory.
credit = pd.read_csv(filepath_or_buffer=r'creditcard.csv')

In [None]:
# Preview the first seven rows to sanity-check the load.
credit.head(n=7)

In [None]:
# Column overview: dtypes and non-null counts for every column.
credit.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
credit.describe()

In [None]:
# Number of rows per value of 'Class' (the column used as the target `y` below).
credit['Class'].value_counts()

In [None]:
# The dataset is highly imbalanced, so the count axis is log-scaled to keep
# the minority-class bar visible next to the majority-class bar.
fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(x='Class', data=credit, ax=ax)
ax.set_yscale('log')
ax.set_title('Class Distribution (Logarithmic Scale)')
ax.set_xlabel('Class')
ax.set_ylabel('Count (log scale)')
plt.show()

In [None]:
# Normalize the 'Amount' and 'Time' features
scaler = StandardScaler()
credit['Amount'] = scaler.fit_transform(credit[['Amount']])
credit['Time'] = scaler.fit_transform(credit[['Time']])

In [None]:
# Define features and target variable
X = credit.drop(columns=['Class'])
y = credit['Class']

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Balance the classes with SMOTE (Synthetic Minority Over-sampling Technique).
# Only the training split is resampled; the test split keeps its original
# class distribution for evaluation.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X=X_train, y=y_train)

In [None]:
# Build the Random Forest classifier: 100 trees, a fixed seed for
# reproducibility, and n_jobs=-1 to use all available CPU cores.
rf_clf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Fit the model on the SMOTE-resampled training data and report how long it took.
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() follows the wall clock and can jump if the
# system clock is adjusted mid-run.
# `time` is imported in the notebook's top import cell.
start_time = time.perf_counter()
rf_clf.fit(X_train_resampled, y_train_resampled)
end_time = time.perf_counter()

training_time = end_time - start_time

print(f"Time taken to train the Random Forest model: {training_time:.2f} seconds")

In [None]:
# Per-class probabilities for the test set; column 1 is kept as the score used
# for the precision-recall curve.
# NOTE(review): column 1 is the second entry of rf_clf.classes_, presumably
# the fraud label (Class == 1) — confirm.
class_probabilities = rf_clf.predict_proba(X_test)
y_probs = class_probabilities[:, 1]

In [None]:
# Hard class labels for the test set (used by the threshold-based metrics below).
y_pred = rf_clf.predict(X=X_test)

In [None]:
# Precision, recall and F1 on the held-out test set, computed from the hard
# predictions by applying each metric function to the same (y_test, y_pred) pair.
precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (precision_score, recall_score, f1_score)
)

In [None]:
# Report the three scores, one per line, rounded to four decimals.
print(
    f'Precision: {precision:.4f}',
    f'Recall: {recall:.4f}',
    f'F1-Score: {f1:.4f}',
    sep='\n',
)

In [None]:
# Calculate precision and recall values for the Precision-Recall curve.
# The curve is built from the predicted probabilities (y_probs), not the hard
# labels, so it covers every decision threshold. The third return value (the
# thresholds themselves) is not needed here and is discarded into `_`.
precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_probs)

In [None]:
# Area Under the Precision-Recall Curve (AUPRC): integrate precision over
# recall with sklearn's trapezoidal-rule `auc`.
auprc = auc(x=recall_curve, y=precision_curve)

# Report the AUPRC rounded to four decimals.
print(f'Area Under the Precision-Recall Curve (AUPRC): {auprc:.4f}')

In [None]:
# Draw the Precision-Recall curve on an explicit axes object; the legend entry
# carries the AUPRC value so the figure stands on its own.
pr_fig, pr_ax = plt.subplots()
pr_ax.plot(recall_curve, precision_curve, label=f'Random Forest (AUPRC = {auprc:.4f})')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve')
pr_ax.legend(loc='best')
plt.show()

In [None]:
# Per-class precision/recall/F1 table with readable class names.
# target_names are matched to the sorted labels, so 'Non-Fraud' labels the
# lower class value and 'Fraud' the higher one.
report = classification_report(y_test, y_pred, target_names=['Non-Fraud', 'Fraud'])
print(report)