In [None]:
# 03_Model_Training_Evaluation.ipynb

# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scripts.model_utils import load_data, balance_data, train_models, evaluate_model
from sklearn.model_selection import train_test_split

# Step 1: Load and prepare model-ready data
X, y = load_data(file_path='../data/model_ready.csv', label_col='PotentialFraud')

# Step 2: Train-test split on the ORIGINAL (un-resampled) data.
# The split must happen before any class balancing: resampling the full
# dataset first would let rows derived from test-side samples end up in the
# training set and would make the test set artificially balanced, inflating
# every reported metric.
# stratify=y keeps the class ratio identical in both partitions
# (assumes y holds discrete class labels -- TODO confirm against load_data).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 3: Handle class imbalance on the training partition only
# (balance_data uses SMOTE according to the original notebook comment).
# X_test / y_test are deliberately left untouched so evaluation reflects the
# real class distribution.
X_res, y_res = balance_data(X_train, y_train)

# Step 4: Train models on the balanced training data
models = train_models(X_res, y_res)

# Step 5: Score each trained model on the held-out split, keyed by model name
separator = "=" * 40
all_scores = {}
for model_name in models:
    model = models[model_name]
    print("\n" + separator)
    print(f"Evaluating: {model_name}")
    results = evaluate_model(model, X_test, y_test, model_name=model_name)
    all_scores[model_name] = results

# Step 6: Build the summary table -- one row per model, one column per metric.
# DataFrame(all_scores) has models as columns, so transpose before selecting
# the metric columns in their reporting order.
metric_columns = ['accuracy', 'precision', 'recall', 'f1', 'auc', 'specificity']
summary_df = pd.DataFrame(all_scores).transpose()[metric_columns]
print("\nModel Evaluation Summary:")
display(summary_df)

# Step 7: Persist the summary; the index (model names) is written by default
summary_df.to_csv("../outputs/model_evaluation_summary.csv")

# Step 8: Visualize the comparison
# Create the figure/axes once and hand the axes to pandas. Calling
# plt.figure(...) and then DataFrame.plot() without ax= makes pandas open a
# second figure, leaving the first one empty and the requested size unused.
fig, ax = plt.subplots(figsize=(10, 6))
summary_df[['accuracy', 'precision', 'recall', 'f1']].plot(kind='bar', ax=ax, rot=45)
ax.set_title("Model Performance Comparison")
ax.set_ylabel("Score")
fig.tight_layout()
# Save before show(): some backends release the figure once it is displayed.
fig.savefig("../outputs/model_performance_comparison.png")
plt.show()

print("Model training and evaluation completed.")
