In [2]:
# 📌 Step 2: Model Training - Fraud Detection

# Import necessary libraries
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [6]:

# Define paths based on project structure
data_path = "../data/processed/"
models_path = "../models/"
reports_path = "../reports/"

In [7]:

# Ensure directories exist
os.makedirs(models_path, exist_ok=True)
os.makedirs(reports_path, exist_ok=True)

# 📌 Load Processed Fraud Data
fraud_data_path = os.path.join(data_path, "processed_fraud_data.csv")
fraud_data = pd.read_csv(fraud_data_path)


In [9]:

# 📌 Drop Non-Numeric Columns
drop_columns = ["user_id", "device_id", "sex"]  # Remove unnecessary text columns
fraud_data = fraud_data.drop(columns=drop_columns)

# 📌 Convert Datetime Columns to Unix Timestamps
datetime_columns = ["signup_time", "purchase_time"]
for col in datetime_columns:
    fraud_data[col] = pd.to_datetime(fraud_data[col], errors="coerce").astype(int) // 10**9

# 📌 Define Features (X) and Target (y)
feature_columns = ["signup_time", "purchase_time", "purchase_value", "source", "browser", "age",
                   "ip_address", "country", "transaction_delay", "hour_of_day", "day_of_week"]
X_fraud = fraud_data[feature_columns]  # Features
y_fraud = fraud_data["class"]  # Target

In [11]:

# 📌 Train-Test Split
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(
    X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud
)

# 📌 Verify Data Types (Ensure all columns are numeric)
print("Feature Data Types:\n", X_fraud_train.dtypes)

# 📌 Define Models for Fraud Detection
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Gradient Boosting": GradientBoostingClassifier(),
    "MLP Classifier": MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500),
}

Feature Data Types:
 signup_time            int64
purchase_time          int64
purchase_value       float64
source                 int64
browser                int64
age                    int64
ip_address           float64
country                int64
transaction_delay    float64
hour_of_day            int64
day_of_week            int64
dtype: object


In [13]:
# 📌 Train and Evaluate Models
model_results = []

for name, model in models.items():
    print(f"Training {name}...")
    
    # Train model
    model.fit(X_fraud_train, y_fraud_train)
    y_pred = model.predict(X_fraud_test)
    
    # Compute evaluation metrics
    accuracy = accuracy_score(y_fraud_test, y_pred)
    precision = precision_score(y_fraud_test, y_pred)
    recall = recall_score(y_fraud_test, y_pred)
    f1 = f1_score(y_fraud_test, y_pred)
    roc_auc = roc_auc_score(y_fraud_test, y_pred)

    # Store results
    model_results.append({
        "Model": name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC-AUC": roc_auc
    })

    # Save the trained model
    model_filename = f"{name.replace(' ', '_')}.pkl"
    joblib.dump(model, os.path.join(models_path, model_filename))


Training Logistic Regression...


  _warn_prf(average, modifier, msg_start, len(result))


Training Decision Tree...
Training Random Forest...
Training Gradient Boosting...
Training MLP Classifier...


  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# 📌 Convert results to DataFrame
results_df = pd.DataFrame(model_results)

# Save evaluation results
results_file = os.path.join(reports_path, "model_performance.csv")
results_df.to_csv(results_file, index=False)

# Display results using Pandas
print("\n✅ Model Training Complete!")
print("📂 Trained models saved in:", models_path)
print("📂 Evaluation report saved in:", results_file)
print("📊 Model Performance Results:\n", results_df)




✅ Model Training Complete!
📂 Trained models saved in: ../models/
📂 Evaluation report saved in: ../reports/model_performance.csv
📊 Model Performance Results:
                  Model  Accuracy  Precision    Recall  F1 Score   ROC-AUC
0  Logistic Regression  0.906363   0.000000  0.000000  0.000000  0.500000
1        Decision Tree  0.907885   0.507367  0.559717  0.532258  0.751786
2        Random Forest  0.955696   1.000000  0.526855  0.690118  0.763428
3    Gradient Boosting  0.955696   1.000000  0.526855  0.690118  0.763428
4       MLP Classifier  0.906363   0.000000  0.000000  0.000000  0.500000


In [16]:
# 📌 Save the feature names used during training
feature_file = os.path.join(models_path, "trained_features.txt")

with open(feature_file, "w") as f:
    for feature in X_fraud_train.columns:
        f.write(feature + "\n")

print("✅ Feature list saved in trained_features.txt")


✅ Feature list saved in trained_features.txt
