In [24]:
# %% [markdown]
# # Task 2: Model Building - WORKING VERSION
# 
# This notebook loads the fraud dataset, derives time-based features, and then trains, cross-validates, compares, and saves the Task 2 models.

# %%
import pandas as pd
import numpy as np
import os

# Show what is available in the data directory.
# The dataset is read from ../data/raw/ further down, so the folder to inspect
# is ../data relative to the kernel's working directory, not ./data.
data_dir = '../data'
print("Current directory:", os.getcwd())
print(f"\nFiles in {data_dir}/ directory:")
if os.path.isdir(data_dir):
    # Sorted so the listing is stable from run to run.
    for item in sorted(os.listdir(data_dir)):
        item_path = os.path.join(data_dir, item)
        if os.path.isdir(item_path):
            print(f"📁 {item}/")
            # Only one level of nesting is shown.
            for file in sorted(os.listdir(item_path)):
                print(f"    {file}")
        else:
            print(f"📄 {item}")
else:
    # Report a missing folder explicitly instead of printing an empty listing.
    print(f"(not found: {os.path.abspath(data_dir)})")

# %%
# Load the raw fraud dataset.
# The path lives in a single variable so the message below always names the
# file that is actually read. It is relative to the kernel's working directory
# (notebooks/ in the recorded run).
data_path = '../data/raw/Fraud_Data.csv'
print(f"\nLoading data from: {data_path}")

df = pd.read_csv(data_path)
print(f"Data loaded: {df.shape}")

# %%
# Basic preprocessing
# Each derived column is only built when the timestamp column(s) it needs exist.
has_signup = 'signup_time' in df.columns
has_purchase = 'purchase_time' in df.columns

if has_signup:
    df['signup_time'] = pd.to_datetime(df['signup_time'])

if has_purchase:
    purchase_ts = pd.to_datetime(df['purchase_time'])
    df['purchase_time'] = purchase_ts
    # Hour of day (0-23) at which the purchase happened.
    df['purchase_hour'] = purchase_ts.dt.hour
    if has_signup:
        # Elapsed time from signup to purchase, expressed in hours.
        elapsed = purchase_ts - df['signup_time']
        df['hours_since_signup'] = elapsed.dt.total_seconds() / 3600
        # 1 when the purchase came less than one hour after signup, else 0.
        df['quick_purchase'] = (df['hours_since_signup'] < 1).astype(int)

# Select features
# Model inputs are all numeric columns except the label and the user identifier.
# NOTE(review): in the recorded run this also picks up the raw numeric
# `ip_address` column - confirm that using it as a feature is intended.
non_feature_cols = ('class', 'user_id')
numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
features = [col for col in numeric_cols if col not in non_feature_cols]

# Missing feature values are replaced with 0; the target is the `class` column.
X = df[features].fillna(0)
y = df['class']

print(f"Features: {features}")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Fraud rate: {y.mean():.4%}")

# %%
# Now run Task 2 models
import sys

# Make the parent directory importable so the `src` package resolves.
# It is registered as an absolute path, and only once: re-running this cell
# does not pile up duplicate entries, and a later change of working directory
# cannot redirect the import.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

from src.task2_models import Task2Models

trainer = Task2Models(random_state=42)

# X and y were already built above, so trainer.load_data() is skipped and the
# split is produced directly from them.
X_train, X_test, y_train, y_test = trainer.prepare_data(X, y)

# Report the size and positive-class share of each split.
print(f"\nTrain set: {X_train.shape}, fraud rate: {y_train.mean():.4%}")
print(f"Test set: {X_test.shape}, fraud rate: {y_test.mean():.4%}")

# %%
# 1. Baseline model
# Section banner: blank line, rule, title, rule - emitted in one call.
print(
    "\n" + "=" * 60,
    "1. BASELINE MODEL - LOGISTIC REGRESSION",
    "=" * 60,
    sep="\n",
)

# Train the baseline on the training split and evaluate it on the test split.
baseline_results = trainer.train_baseline(
    X_train,
    y_train,
    X_test,
    y_test,
)

# %%
# 2. Ensemble model
# Section banner: blank line, rule, title, rule - emitted in one call.
print(
    "\n" + "=" * 60,
    "2. ENSEMBLE MODEL - RANDOM FOREST",
    "=" * 60,
    sep="\n",
)

# Train and evaluate the ensemble on the same split as the baseline.
# The last argument presumably selects the ensemble variant - confirm against
# Task2Models.train_ensemble.
ensemble_results = trainer.train_ensemble(
    X_train,
    y_train,
    X_test,
    y_test,
    'random_forest',
)

# %%
# 3. Cross-validation
print("\n" + "="*60)
print("3. 5-FOLD CROSS-VALIDATION")
print("="*60)

# Cross-validate every model the trainer holds, on the full X / y.
# Each model's result is stored under its key so none is lost when the loop
# moves on; `cv_results` still ends up holding the last model's result for any
# later cell that reads it.
cv_results_by_model = {}
for model_key, model in trainer.models.items():
    # Human-readable name recorded by the trainer when the model was trained.
    model_name = trainer.results[model_key]['model_name']
    cv_results = trainer.cross_validate(model, X, y, model_name)
    cv_results_by_model[model_key] = cv_results

# %%
# 4. Model comparison and selection
# Section banner: blank line, rule, title, rule - emitted in one call.
print(
    "\n" + "=" * 60,
    "4. MODEL COMPARISON AND SELECTION",
    "=" * 60,
    sep="\n",
)

# The trainer returns a (model, name) pair for the model it selects.
selection = trainer.compare_and_select()
best_model, best_name = selection

# %%
# 5. Save best model
print("\n" + "="*60)
print("5. SAVE MODEL FOR TASK 3")
print("="*60)

# Compare against None explicitly. Plain truth-testing falls back to __len__
# when the object defines it, so a perfectly valid model could evaluate as
# falsy (or raise) and be skipped.
if best_model is not None:
    saved_path = trainer.save_model(best_model, best_name)
    print(f"\n✅ Best model saved to: {saved_path}")
    print("Ready for Task 3: Model Explainability")
else:
    # Make the skipped save visible instead of passing silently.
    print("\nNo model was selected - nothing was saved.")

# %%
# Closing banner. The check mark is written as the real character so it is not
# printed as mis-decoded bytes.
print("\n" + "="*60)
print("✅ TASK 2 COMPLETED")
print("="*60)

Current directory: c:\Users\It's Blue\fraud-detection\notebooks

Files in data/ directory:

Loading data from: data/Fraud_Data.csv
Data loaded: (151112, 11)
Features: ['purchase_value', 'age', 'ip_address', 'purchase_hour', 'hours_since_signup', 'quick_purchase']
X shape: (151112, 6)
y shape: (151112,)
Fraud rate: 9.3646%

Train set: (120889, 6), fraud rate: 9.3648%
Test set: (30223, 6), fraud rate: 9.3637%

1. BASELINE MODEL - LOGISTIC REGRESSION
BASELINE: Logistic Regression

Results for Logistic Regression:
  PR-AUC: 0.2715
  ROC-AUC: 0.7117
  F1-Score: 0.3035

  Confusion Matrix:
    TN: 19917, FP: 7476
    FN: 986, TP: 1844

2. ENSEMBLE MODEL - RANDOM FOREST

ENSEMBLE: RANDOM_FOREST
Performing basic hyperparameter check...
  Selected max_depth=5 (PR-AUC: 0.6194)

Results for Random Forest:
  PR-AUC: 0.6194
  ROC-AUC: 0.7644
  F1-Score: 0.6900

  Confusion Matrix:
    TN: 27392, FP: 1
    FN: 1339, TP: 1491

3. 5-FOLD CROSS-VALIDATION

5-Fold Cross-Validation for Logistic Regressio