In [3]:
import json
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, f1_score, auc)
from xgboost import XGBClassifier

In [5]:
# Load the pre-split feature matrices, then the label columns (flattened to
# 1-D arrays so they can be passed straight to the estimators and metrics).
# NOTE(review): 'X-test.csv' uses a hyphen unlike the other three file names —
# confirm this matches the file on disk before renaming anything.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/X_train.csv',
        '../data/processed/X-test.csv',
    )
)
y_train, y_test = (
    pd.read_csv(csv_path).values.ravel()
    for csv_path in (
        '../data/processed/y_train.csv',
        '../data/processed/y_test.csv',
    )
)

In [6]:
# Sanity check: report (rows, columns) of the train and test feature frames.
print("Train shape: {}, Test shape: {}".format(X_train.shape, X_test.shape))

Train shape: (398040, 30), Test shape: (85443, 30)


In [7]:
# Candidate classifiers keyed by the name used in logs and result tables.
# All three share the same seed so repeated runs are comparable.
RANDOM_STATE = 42
models = dict(
    Logistic_Regression=LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
    Random_Forest=RandomForestClassifier(random_state=RANDOM_STATE, n_estimators=100, n_jobs=-1),
    XGBoost=XGBClassifier(random_state=RANDOM_STATE, n_estimators=100, eval_metric='logloss'),
)

In [8]:
# Fit every candidate on the training split, score it on the test split and
# keep the fitted estimator, its predictions and its headline metrics in
# `results`, keyed by model name.
results = {}
separator = '=' * 60

for name, model in models.items():
    print("\n" + separator)
    print("Training {}...".format(name))
    print(separator)

    model.fit(X_train, y_train)

    # Hard labels feed the report and F1; the positive-class probability
    # (column 1 of predict_proba) feeds the ROC AUC.
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    auc_score = roc_auc_score(y_test, y_pred_proba)
    f1 = f1_score(y_test, y_pred)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Legitimate', 'Fraud']))
    print("\nAUC-ROC Score: {:.4f}".format(auc_score))
    print("F1 Score: {:.4f}".format(f1))

    results[name] = dict(
        model=model,
        y_pred=y_pred,
        y_pred_proba=y_pred_proba,
        auc=auc_score,
        f1=f1,
    )


Training Logistic_Regression...

Classification Report:
              precision    recall  f1-score   support

  Legitimate       1.00      0.98      0.99     85295
       Fraud       0.06      0.88      0.12       148

    accuracy                           0.98     85443
   macro avg       0.53      0.93      0.55     85443
weighted avg       1.00      0.98      0.99     85443


AUC-ROC Score: 0.9672
F1 Score: 0.1202

Training Random_Forest...

Classification Report:
              precision    recall  f1-score   support

  Legitimate       1.00      1.00      1.00     85295
       Fraud       0.89      0.78      0.83       148

    accuracy                           1.00     85443
   macro avg       0.95      0.89      0.92     85443
weighted avg       1.00      1.00      1.00     85443


AUC-ROC Score: 0.9690
F1 Score: 0.8345

Training XGBoost...

Classification Report:
              precision    recall  f1-score   support

  Legitimate       1.00      1.00      1.00     85295
    

In [10]:
# Side-by-side table of the headline metrics, one row per trained model
# (rows follow the insertion order of `results`).
header_rule = '=' * 60
print("\n" + header_rule)
print("MODEL COMPARISON")
print(header_rule)
comparison_df = pd.DataFrame({
    'Model': list(results),
    'AUC-ROC': [entry['auc'] for entry in results.values()],
    'F1': [entry['f1'] for entry in results.values()],
})
print(comparison_df.to_string(index=False))


MODEL COMPARISON
              Model  AUC-ROC       F1
Logistic_Regression 0.967180 0.120203
      Random_Forest 0.968996 0.834532
            XGBoost 0.975061 0.768254


In [12]:
# Pick the model with the highest test-set ROC AUC and pull out its fitted
# estimator and cached predictions for the detailed evaluation below.
best_model_name = max(results, key=lambda candidate: results[candidate]['auc'])
best_entry = results[best_model_name]
best_model = best_entry['model']
best_y_pred = best_entry['y_pred']
best_y_pred_proba = best_entry['y_pred_proba']
print("\n Best Model: {}".format(best_model_name))


 Best Model: XGBoost


In [13]:
# Detailed evaluation of the selected model: print a header naming it, then
# its confusion matrix on the test split.
print(f"\n{'='*60}")
# The header must be an f-string; without the `f` prefix the placeholder was
# printed literally as "{best_model_name}".
print(f"DETAILED EVALUATION: {best_model_name}")
print(f"{'='*60}")

# Rows are true classes, columns are predicted classes, so the four cells
# read off as TN, FP / FN, TP.
cm = confusion_matrix(y_test, best_y_pred)
print(f"\nConfusion Matrix:\n{cm}")
print(f"TN={cm[0,0]}, FP={cm[0,1]}, FN={cm[1,0]}, TP={cm[1,1]}")


DETAILED EVALUATION: {best_model_name}

Confusion Matrix:
[[85249    46]
 [   27   121]]
TN=85249, FP=46, FN=27, TP=121


In [14]:
# Render the confusion matrix as an annotated heatmap and write it to disk.
# The figure is closed after saving, so nothing is displayed inline.
fig, ax = plt.subplots(figsize=(6, 5))
class_names = ['Legitimate', 'Fraud']
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title(f'{best_model_name} - Confusion Matrix')
ax.set_ylabel('True label')
ax.set_xlabel('Predicted label')
fig.savefig('../results/plots/confusion_matrix.png', dpi=100, bbox_inches='tight')
plt.close(fig)

In [16]:
# ROC curve of the selected model from its positive-class probabilities,
# with the area under the curve shown in the legend.
fpr, tpr, thresholds = roc_curve(y_test, best_y_pred_proba)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
# Diagonal reference line: the expected curve of a random classifier.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title(f'{best_model_name} - ROC Curve')
ax.legend(loc='lower right')
# Saved to disk and closed, so nothing is displayed inline.
fig.savefig('../results/plots/roc_curve.png', dpi=100, bbox_inches='tight')
plt.close(fig)

In [17]:
# Feature-importance report; only runs when the selected estimator exposes
# `feature_importances_` (the attribute check skips models without it).
if hasattr(best_model, 'feature_importances_'):
    # Pair each training column with its importance, most important first.
    feature_importance = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    top_features = feature_importance.head(15)
    plt.barh(top_features['Feature'], top_features['Importance'])
    # barh places the first row at the bottom; flip the axis so the most
    # important feature is at the top of the chart.
    plt.gca().invert_yaxis()
    plt.xlabel('Importance')
    plt.title(f'{best_model_name} - Top 15 Feature Importances')
    plt.tight_layout()
    plt.savefig('../results/plots/feature_importance.png', dpi=100, bbox_inches='tight')
    plt.close()

    # Fixed typo in the printed heading ("Feautres" -> "Features").
    print('\nTop 10 Features:')
    print(feature_importance.head(10).to_string(index=False))


Top 10 Feautres:
Feature  Importance
    V14    0.631970
     V4    0.060161
     V8    0.024163
    V10    0.023101
    V23    0.017659
    V12    0.017013
    V25    0.015149
    V17    0.014496
    V11    0.014002
    V21    0.013021


In [18]:
# Persist the selected estimator. `joblib` and `Path` are imported in the
# notebook's top import cell. The target directory is created first so a
# fresh checkout without `../models` does not fail on the dump.
Path('../models').mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, '../models/best_model.pkl')

['../models/best_model.pkl']

In [19]:
# Summarise the selected model's test-set performance and write it as JSON.
# `json` and `Path` are imported in the notebook's top import cell.
# Every numeric value is cast to a built-in type so serialisation does not
# depend on the metric functions' return types.
metrics = {
    'best_model': best_model_name,
    'auc_roc': float(results[best_model_name]['auc']),
    'f1_score': float(results[best_model_name]['f1']),
    'test_set_size': len(y_test),
    'fraud_cases_in_test': int(y_test.sum()),
    # Confusion-matrix cells: row = true class, column = predicted class.
    'true_positives': int(cm[1,1]),
    'false_positives': int(cm[0,1]),
    'false_negatives': int(cm[1,0]),
    'true_negatives': int(cm[0,0])
}

# Make sure the output directory exists before opening the file for writing.
Path('../results').mkdir(parents=True, exist_ok=True)
with open('../results/metrics.json', 'w') as f:
    json.dump(metrics, f, indent=4)