In [15]:
# Import necessary libraries
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [9]:
# Project-relative folders: processed data, serialized models, generated reports
data_path, models_path, reports_path = (
    "../data/processed/",
    "../models/",
    "../reports/",
)


In [10]:
# Make sure the processed-data folder is present before any file is written to it
import os

# Folder that receives the processed CSV files
data_path = "../data/processed/"

# exist_ok=True: an already existing folder is left as it is, so the cell can be re-run
os.makedirs(name=data_path, exist_ok=True)


In [40]:
# Load Processed Fraud Data
# The CSV location gets its own name: `data_path` holds the processed-data
# *directory* and is joined with file names when the train/test splits are
# saved, so it must not be rebound to a file path here.
processed_fraud_csv = "../data/processed_fraud_data.csv"
fraud_data = pd.read_csv(processed_fraud_csv)

# 📌 Drop Non-Numeric Columns
drop_columns = ["user_id", "device_id", "sex"]  # Columns not kept as model features
fraud_data = fraud_data.drop(columns=drop_columns)

# 📌 Convert Datetime Columns to Unix Timestamps (whole seconds since 1970-01-01)
# Epoch arithmetic is used instead of `.astype(int) // 10**9` because that form
# depends on the platform integer width and on nanosecond resolution, and it
# cannot represent NaT (depending on the pandas version it raises or produces a
# meaningless sentinel).
# NOTE(review): assumes the timestamps carry no UTC offset — TODO confirm.
datetime_columns = ["signup_time", "purchase_time"]
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta(seconds=1)
for col in datetime_columns:
    parsed = pd.to_datetime(fraud_data[col], errors="coerce")
    n_unparseable = int(parsed.isna().sum())
    if n_unparseable:
        # errors="coerce" turned these into NaT; they stay missing (NaN) in the result
        print(f"⚠️ {col}: {n_unparseable} value(s) could not be parsed as datetimes and are kept as NaN")
    # int64 when every value parsed; float64 with NaN when some did not
    fraud_data[col] = (parsed - unix_epoch) // one_second

# 📌 Verify That All Columns Are Numeric
print(fraud_data.dtypes)  # Ensure all columns are int64 or float64

# Save the cleaned dataset
fraud_data.to_csv("../data/processed/cleaned_fraud_data.csv", index=False)

print("✅ Data Preparation Complete! Processed file saved as 'cleaned_fraud_data.csv'.")


signup_time            int64
purchase_time          int64
purchase_value       float64
source                 int64
browser                int64
age                    int64
ip_address           float64
class                  int64
country                int64
transaction_delay    float64
hour_of_day            int64
day_of_week            int64
dtype: object
✅ Data Preparation Complete! Processed file saved as 'cleaned_fraud_data.csv'.


In [41]:
# Read the raw fraud and credit-card CSV files
# NOTE(review): the cell that follows rebinds fraud_data, creditcard_data and the
# X_*/y_* names, so on a top-to-bottom run the values created here are replaced —
# confirm which of the two loading cells is meant to be kept.
fraud_data_path = "../data/Fraud_Data.csv"
creditcard_data_path = "../data/creditcard.csv"

fraud_data, creditcard_data = (
    pd.read_csv(csv_path) for csv_path in (fraud_data_path, creditcard_data_path)
)

# 📌 Separate Features (X) and Target (y)
# Fraud data: the label column is "class" (lower case)
X_fraud, y_fraud = fraud_data.drop("class", axis=1), fraud_data["class"]

# Credit-card data: the label column is "Class" (capitalised)
X_creditcard, y_creditcard = creditcard_data.drop("Class", axis=1), creditcard_data["Class"]

In [22]:
# Load the cleaned fraud dataset and the credit-card dataset
# "cleaned_fraud_data.csv" is the file written by the data-preparation cell after
# it drops user_id/device_id/sex and converts the datetime columns to numbers;
# "processed_fraud_data.csv" is that cell's input and still contains them.
fraud_data_path = "../data/processed/cleaned_fraud_data.csv"
creditcard_data_path = "../data/creditcard.csv"

fraud_data = pd.read_csv(fraud_data_path)
creditcard_data = pd.read_csv(creditcard_data_path)

# 📌 Separate Features (X) and Target (y)
X_fraud = fraud_data.drop(columns=["class"])
y_fraud = fraud_data["class"]

# Fail early, with a readable message, if a non-numeric feature slipped through:
# the scaling and model-fitting cells further down need numeric input.
non_numeric_columns = X_fraud.select_dtypes(exclude=["number"]).columns.tolist()
assert not non_numeric_columns, f"Non-numeric fraud features remain: {non_numeric_columns}"

X_creditcard = creditcard_data.drop(columns=["Class"])
y_creditcard = creditcard_data["Class"]

In [42]:
# 📌 Perform Train-Test Split
# Same recipe for both datasets: 20% held out for testing, fixed seed, and
# stratification on the target so both parts keep the same class ratio.
_split_options = {"test_size": 0.2, "random_state": 42}

X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(
    X_fraud, y_fraud, stratify=y_fraud, **_split_options
)

X_credit_train, X_credit_test, y_credit_train, y_credit_test = train_test_split(
    X_creditcard, y_creditcard, stratify=y_creditcard, **_split_options
)

In [None]:
# 📌 Encode Categorical Variables Using OrdinalEncoder
categorical_columns = ["browser", "source", "country"]
# Categories that appear only in the test set are encoded as -1 instead of raising
ordinal_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

# Work on explicit copies: assigning columns directly onto the frames returned by
# train_test_split can trigger pandas' SettingWithCopyWarning.
X_fraud_train = X_fraud_train.copy()
X_fraud_test = X_fraud_test.copy()

# The encoder is fitted on the training part only and then re-used for the test part
X_fraud_train[categorical_columns] = ordinal_encoder.fit_transform(X_fraud_train[categorical_columns])
X_fraud_test[categorical_columns] = ordinal_encoder.transform(X_fraud_test[categorical_columns])


In [34]:
# 📌 Normalize Numerical Features
scaler = MinMaxScaler()
numeric_columns = X_fraud_train.select_dtypes(include=["number"]).columns

# Learn the min/max from the training part only, then apply the same
# transformation to both parts. The resulting frames get a fresh default index.
scaler.fit(X_fraud_train[numeric_columns])
X_fraud_train_scaled, X_fraud_test_scaled = (
    pd.DataFrame(scaler.transform(part[numeric_columns]), columns=numeric_columns)
    for part in (X_fraud_train, X_fraud_test)
)


In [35]:
# Put the (unscaled) categorical columns back into the scaled frames.
# Values are copied positionally via .values because the scaled frames do not
# share the index of the original split frames.
for scaled_part, source_part in (
    (X_fraud_train_scaled, X_fraud_train),
    (X_fraud_test_scaled, X_fraud_test),
):
    for col in categorical_columns:
        scaled_part[col] = source_part[col].values

In [None]:
# 📌 Save Processed Train-Test Datasets
# Output file name -> object to persist; every file is written without the index.
split_outputs = {
    "X_fraud_train.csv": X_fraud_train_scaled,
    "X_fraud_test.csv": X_fraud_test_scaled,
    "y_fraud_train.csv": y_fraud_train,
    "y_fraud_test.csv": y_fraud_test,
}
for file_name, split_part in split_outputs.items():
    split_part.to_csv(os.path.join(data_path, file_name), index=False)

print(" Data Preparation Completed: Processed files saved in '/data/processed/'")

In [37]:
# 📌 Define Models for Fraud Detection
# Every estimator gets the same fixed seed (the value also used for the
# train/test split) so that a re-run reproduces the same fitted models and metrics.
RANDOM_STATE = 42

# Display name -> unfitted estimator; the names are also used for the saved file names
models = {
    "Logistic Regression": LogisticRegression(max_iter=500, random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "MLP Classifier": MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500, random_state=RANDOM_STATE),
}


In [None]:
# 📌 Train and Evaluate Models
# One dict of metrics per model is collected in model_results.
model_results = []

# joblib.dump() does not create missing folders, so make sure the target exists
os.makedirs(models_path, exist_ok=True)

for name, model in models.items():
    print(f"Training {name}...")

    # Train on the scaled features produced by the normalisation cells; the
    # unscaled frames contain very large raw values (e.g. Unix timestamps).
    model.fit(X_fraud_train_scaled, y_fraud_train)
    y_pred = model.predict(X_fraud_test_scaled)
    # Probability of the second class in model.classes_ (the larger label);
    # ROC-AUC needs a continuous score, not the thresholded 0/1 predictions.
    y_score = model.predict_proba(X_fraud_test_scaled)[:, 1]

    # Compute evaluation metrics
    # zero_division=0: report 0 without a warning when a model predicts no positives
    accuracy = accuracy_score(y_fraud_test, y_pred)
    precision = precision_score(y_fraud_test, y_pred, zero_division=0)
    recall = recall_score(y_fraud_test, y_pred, zero_division=0)
    f1 = f1_score(y_fraud_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_fraud_test, y_score)

    # Store results
    model_results.append({
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC-AUC": roc_auc
    })

    # Save the trained model (spaces in the display name become underscores)
    model_filename = f"{name.replace(' ', '_')}.pkl"
    joblib.dump(model, os.path.join(models_path, model_filename))