In [None]:
import pandas as pd
import numpy as np
import joblib
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the transaction table from disk into a DataFrame.
df = pd.read_csv("Fraud.csv")  # Replace with actual dataset file

# Tally rows per isFraud label; .get(label, 0) reports 0 if a label never occurs.
fraud_counts = df["isFraud"].value_counts()
print(
    f"Non-Fraud (0) Transactions: {fraud_counts.get(0, 0)}",
    f"Fraud (1) Transactions: {fraud_counts.get(1, 0)}",
    sep="\n",
)


Non-Fraud (0) Transactions: 6354407
Fraud (1) Transactions: 8213


In [None]:
# Express each class count as a percentage of all rows in df.
total_transactions = len(df)
non_fraud_percentage = 100 * (fraud_counts.get(0, 0) / total_transactions)
fraud_percentage = 100 * (fraud_counts.get(1, 0) / total_transactions)
print(
    f"Non-Fraud Percentage: {non_fraud_percentage:.4f}%",
    f"Fraud Percentage: {fraud_percentage:.4f}%",
    sep="\n",
)

Non-Fraud Percentage: 99.8709%
Fraud Percentage: 0.1291%


In [None]:
# Step 1: Data Cleaning
# Drop exact duplicate rows first, then any row that still contains a missing value.
df = df.drop_duplicates().dropna()

# Step 2: Encode categorical variables
# Each listed column gets its own LabelEncoder, which replaces the column's
# values with integer codes. The fitted encoders are kept in a dict keyed by
# column name so they can be reused later.
label_encoders = {}
for col in ("type", "nameOrig", "nameDest"):
    le = label_encoders[col] = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [None]:
# Step 3: Define Features & Target
y = df["isFraud"]  # Target: the isFraud label column
X = df.drop("isFraud", axis="columns")  # Features: every remaining column

In [None]:
# Step 4: Apply SMOTE to balance the dataset
# sampling_strategy=1.0 requests as many Fraud rows as Non-Fraud rows after resampling.
# NOTE(review): fit_resample receives the complete X / y here, not a training subset.
smote = SMOTE(random_state=42, sampling_strategy=1.0)
X_resampled, y_resampled = smote.fit_resample(X=X, y=y)

In [None]:
# Step 4.1: Display the new class distribution after SMOTE
# np.bincount gives one count per integer label, indexed by the label value.
fraud_counts_resampled = np.bincount(y_resampled)

print(
    f"Non-Fraud (0) Transactions After SMOTE: {fraud_counts_resampled[0]}",
    f"Fraud (1) Transactions After SMOTE: {fraud_counts_resampled[1]}",
    sep="\n",
)

# Optional: Display percentages
# Convert both counts to percentages of the resampled total in one array operation.
total_transactions_resampled = len(y_resampled)
class_shares_resampled = fraud_counts_resampled / total_transactions_resampled * 100
non_fraud_percentage_resampled, fraud_percentage_resampled = class_shares_resampled[:2]

print(
    f"Non-Fraud Percentage After SMOTE: {non_fraud_percentage_resampled:.2f}%",
    f"Fraud Percentage After SMOTE: {fraud_percentage_resampled:.2f}%",
    sep="\n",
)

Non-Fraud (0) Transactions After SMOTE: 6354407
Fraud (1) Transactions After SMOTE: 6354407
Non-Fraud Percentage After SMOTE: 50.00%
Fraud Percentage After SMOTE: 50.00%


In [None]:
# Step 5: Split Data into Train and Test sets
# Split the original (imbalanced) X / y *before* any oversampling. Splitting
# SMOTE output instead would place synthetic rows -- each interpolated between
# neighbouring fraud rows -- on both sides of the split, so the test score
# would be measured partly on near-copies of training data (data leakage).
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the training portion only. X_test / y_test stay untouched, real
# transactions at the original class ratio, which is what the model will face.
X_train, y_train = SMOTE(sampling_strategy=1.0, random_state=42).fit_resample(
    X_train_raw, y_train_raw
)

In [None]:
# Step 6: Train the Random Forest Model
# n_jobs=-1 fits the 100 trees in parallel on all available cores; the
# hyper-parameters and seed are unchanged, so only wall-clock time differs.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight="balanced",
    n_jobs=-1,
)
model.fit(X_train, y_train)

In [None]:
# Save the trained model & encoders
# Each artifact is written to its own pickle file in the working directory.
for artifact, target_path in (
    (model, "fraud_detection_model.pkl"),
    (label_encoders, "label_encoders.pkl"),
):
    joblib.dump(artifact, target_path)

# Step 7: Evaluate the Model
# Predict on the held-out split, then report overall accuracy and the
# per-class precision / recall / F1 table.
y_pred = model.predict(X_test)
print("\n🔍 Model Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))