In [1]:
# Notebook setup: standard imports, import-path tweak, autoreload, and the
# project classes used by the cells below.
import sys
import pandas as pd
# Append the relative directory '../src' to the module search path. Being
# relative, it resolves against the kernel's working directory (presumably
# this notebook's folder -- verify when launching the kernel from elsewhere).
sys.path.append('../src')

# Load IPython's autoreload extension and select mode 2, so edited modules are
# re-imported before a cell executes instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

# Project-local classes. These are presumably resolved through the '../src'
# entry added above, so they must stay below the sys.path.append call.
from preprocessing import FraudPreprocessor
from modeling import FraudModeler

In [2]:
# Step 1: read the two raw CSV inputs (paths are relative to the kernel's
# working directory).
fraud_csv_path = '../data/raw/Fraud_Data.csv'
ip_csv_path = '../data/raw/IpAddress_to_Country.csv'
fraud_data = pd.read_csv(fraud_csv_path)
ip_data = pd.read_csv(ip_csv_path)

# Step 2: hand both frames to the preprocessor; later cells drive the
# pipeline through this `processor` object.
processor = FraudPreprocessor(fraud_data, ip_data)

In [3]:
# 3. Run Cleaning & Geolocation
# The return values of these calls are discarded, so their results are
# presumably kept on `processor` itself -- confirm in src/preprocessing.py.
# The steps are run in this fixed order; do not reorder without checking the
# dependencies between them.
processor.clean_data()
processor.merge_geolocation()

# 4. Feature Engineering
processor.feature_engineering()

# 5. Transform (Scale/Encode)
# transform_data() is unpacked into three values; X and y are passed to
# FraudModeler in the next cell (presumably features and target -- verify),
# and feature_names is kept for later use.
X, y, feature_names = processor.transform_data()

Converting IPs to integers...
Merging geolocation data...


In [4]:
# 6. Instantiate Modeler
# Built from the X and y returned by processor.transform_data() above.
modeler = FraudModeler(X, y)

# 7. Split Data
# Called with its defaults. In the recorded run it reports a 120,889-row
# training set and a 30,223-row test set (roughly 80/20), 15 columns each.
modeler.split_data()

# 8. Handle Imbalance (SMOTE)
# In the recorded run the class counts go from [109568, 11321] to
# [109568, 109568]. The "before" counts sum to the training-set size, so the
# resampling appears to touch the training split only -- confirm in
# src/modeling.py.
modeler.handle_imbalance()

Training set shape: (120889, 15)
Test set shape: (30223, 15)
Class distribution before SMOTE: [109568  11321]
Class distribution after SMOTE: [109568 109568]


In [5]:
# Steps 9-10: train each model, then immediately print its evaluation report.
# Each entry pairs a FraudModeler training method with the model name that is
# passed to evaluate_model(); the baseline runs first, then the ensemble.
training_plan = (
    (modeler.train_baseline, 'LogisticRegression'),
    (modeler.train_random_forest, 'RandomForest'),
)
for fit_model, model_name in training_plan:
    fit_model()
    modeler.evaluate_model(model_name)

# Cross-validation is run for the Random Forest only.
# NOTE(review): in the recorded run the cross-validated mean F1 (0.8583) is
# far above the held-out class-1 F1 (0.63). One possible cause is
# cross-validating on SMOTE-resampled data -- confirm in src/modeling.py.
modeler.cross_validate('RandomForest')

Training Logistic Regression Baseline...
--- LogisticRegression Evaluation ---
              precision    recall  f1-score   support

           0       0.97      0.94      0.95     27393
           1       0.55      0.69      0.61      2830

    accuracy                           0.92     30223
   macro avg       0.76      0.82      0.78     30223
weighted avg       0.93      0.92      0.92     30223

AUC-PR: 0.6652
ROC-AUC: 0.8397
------------------------------
Training Random Forest...
--- RandomForest Evaluation ---
              precision    recall  f1-score   support

           0       0.97      0.95      0.96     27393
           1       0.60      0.67      0.63      2830

    accuracy                           0.93     30223
   macro avg       0.78      0.81      0.80     30223
weighted avg       0.93      0.93      0.93     30223

AUC-PR: 0.7132
ROC-AUC: 0.8446
------------------------------
--- Cross Validation (5-Fold) for RandomForest ---
Mean F1 Score: 0.8583
Standard Dev

### Model Comparison

In [6]:
# Build a side-by-side metrics table for every model registered on `modeler`.
# Prerequisite: the training cell above must have been run, otherwise
# `modeler.models` has nothing to compare (presumably it is filled by the
# train_* methods -- confirm in src/modeling.py).

# Column label shown in the table -> key in the mapping returned by
# modeler.get_metrics().
METRIC_COLUMNS = {'F1-Score': 'f1', 'AUC-PR': 'auc_pr', 'ROC-AUC': 'roc_auc'}


def build_comparison_row(model_name):
    """Return one table row for `model_name`: its name plus rounded metrics.

    Looks the model's metrics up via modeler.get_metrics(model_name) and
    rounds each reported value to 4 decimal places.

    Raises:
        KeyError: if the metrics mapping lacks one of the expected keys.
    """
    metrics = modeler.get_metrics(model_name)
    row = {'Model': model_name}
    for label, key in METRIC_COLUMNS.items():
        row[label] = round(metrics[key], 4)
    return row


results_data = [build_comparison_row(name) for name in modeler.models.keys()]

# Passing `columns` explicitly keeps the table schema stable even when no
# models have been trained yet (an empty list would otherwise give a frame
# with no columns at all).
comparison_df = pd.DataFrame(results_data, columns=['Model', *METRIC_COLUMNS])
display(comparison_df)

Unnamed: 0,Model,F1-Score,AUC-PR,ROC-AUC
0,LogisticRegression,0.61,0.6652,0.8397
1,RandomForest,0.6314,0.7132,0.8446
