In [None]:
import pickle
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    roc_curve, precision_recall_curve, average_precision_score,
    f1_score, precision_score, recall_score
)

# ===== 1. LOAD DATA =====
print("Loading data...")


def _read_labels(csv_path):
    """Read a label CSV and return all of its values as one flat 1-D array."""
    return pd.read_csv(csv_path).values.flatten()


# Features stay as DataFrames; labels are flattened to NumPy arrays.
X_train, y_train = pd.read_csv('X_train_fixed.csv'), _read_labels('y_train_fixed.csv')
X_val, y_val = pd.read_csv('X_val_fixed.csv'), _read_labels('y_val_fixed.csv')
X_test, y_test = pd.read_csv('X_test_fixed.csv'), _read_labels('y_test_fixed.csv')

# One summary line per split: row count and mean of the labels as a percentage
# (reads as "% fraud" assuming labels are 0/1 — TODO confirm against the CSVs).
for split_tag, split_features, split_labels in (
    ("Train:", X_train, y_train),
    ("Val:  ", X_val, y_val),
    ("Test: ", X_test, y_test),
):
    print(f"{split_tag} {split_features.shape[0]:,} samples ({split_labels.mean()*100:.2f}% fraud)")

# ===== 2. CALCULATE CLASS WEIGHT =====
# Ratio of negative to positive training samples, later passed to XGBoost as
# scale_pos_weight.
fraud_count = y_train.sum()
non_fraud_count = len(y_train) - fraud_count

# Without this guard a training split with no positive labels would divide by
# zero: NumPy only emits a warning and yields inf/nan, which would then be fed
# to the booster silently.
if fraud_count <= 0:
    raise ValueError(
        "y_train contains no fraud samples; cannot compute scale_pos_weight"
    )

scale_pos_weight = non_fraud_count / fraud_count
print(f"\nClass imbalance ratio: {scale_pos_weight:.2f}")

# ===== 3. CONFIGURE XGBOOST WITH CUDA =====
print("\nConfiguring XGBoost with CUDA...")

# Parameters for the native `xgb.train` API.
params = {
    # GPU settings
    'device': 'cuda',
    'tree_method': 'hist',

    # Model parameters
    'objective': 'binary:logistic',
    # Order matters: when early stopping is enabled, XGBoost monitors the LAST
    # metric in this list (here 'logloss').
    'eval_metric': ['auc', 'aucpr', 'logloss'],
    'scale_pos_weight': scale_pos_weight,

    # Tree parameters
    'max_depth': 6,
    'min_child_weight': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,

    # Regularization
    'reg_alpha': 0.1,      # L1
    'reg_lambda': 1.0,     # L2

    # Learning
    'learning_rate': 0.1,
    # NOTE: 'n_estimators' is intentionally not set here. It is an argument of
    # the scikit-learn wrapper; the native xgb.train API does not use it (and
    # warns that the parameter is not used). The number of boosting rounds is
    # controlled by `num_boost_round` in the xgb.train call.

    # Reproducibility
    'random_state': 42,
}

print("Parameters configured:")
for key, value in params.items():
    print(f"  {key}: {value}")

# ===== 4. CREATE DMatrix (XGBoost's optimized data structure) =====
print("\nCreating DMatrix for GPU...")
# Wrap each (features, labels) pair in a DMatrix, in train / val / test order.
dtrain, dval, dtest = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# ===== 5. TRAIN MODEL =====
print("\nTraining XGBoost model on GPU...")
start_time = time.time()

# Evaluation sets reported during training. Early stopping watches the LAST
# entry ('val') and, when several eval metrics are configured, the last metric.
evals = [(dtrain, 'train'), (dval, 'val')]

# `early_stopping_rounds=30` alone makes xgb.train return the booster from the
# final round, i.e. including the 30 rounds that did not improve validation.
# The explicit callback with save_best=True keeps the same stopping rule
# (patience of 30 rounds) but returns the booster truncated at the best
# iteration, so prediction, feature importance and the saved model all use the
# best model.
early_stop = xgb.callback.EarlyStopping(rounds=30, save_best=True)

model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=500,
    evals=evals,
    callbacks=[early_stop],
    verbose_eval=25  # Print every 25 rounds
)

training_time = time.time() - start_time
print(f"\n✓ Training complete in {training_time:.1f} seconds")
print(f"Best iteration: {model.best_iteration}")

# ===== 6. GET PREDICTIONS =====
print("\nGenerating predictions...")
# The native Booster.predict() scores with every tree in the booster unless an
# iteration range is given, so limit scoring to the best early-stopping
# iteration. (If the booster was already truncated at that iteration this is a
# no-op.)
best_range = (0, model.best_iteration + 1)
y_probs = model.predict(dtest, iteration_range=best_range)
# Validation scores are used only to choose the decision threshold below.
y_val_probs = model.predict(dval, iteration_range=best_range)
print(f"Probability range: {y_probs.min():.4f} to {y_probs.max():.4f}")
print(f"Probability mean: {y_probs.mean():.4f}")
print(f"Probability std: {y_probs.std():.4f}")

# ===== 7. THRESHOLD OPTIMIZATION =====
print("\n" + "="*70)
print("THRESHOLD OPTIMIZATION")
print("="*70)

thresholds = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]


def _sweep_thresholds(y_true, y_score, candidate_thresholds):
    """Compute confusion counts and precision/recall/F1 at each threshold.

    Args:
        y_true: 1-D array of ground-truth labels (1 = fraud, 0 = non-fraud).
        y_score: 1-D array of predicted fraud probabilities aligned with y_true.
        candidate_thresholds: iterable of cut-offs; score >= cut-off => fraud.

    Returns:
        A list with one dict per threshold, in input order, with the keys
        'threshold', 'precision', 'recall', 'f1', 'predicted_frauds',
        'true_positives', 'false_positives' and 'false_negatives'.
        Precision, recall and F1 are 0 when their denominator is 0.
    """
    rows = []
    for thresh in candidate_thresholds:
        preds = (y_score >= thresh).astype(int)

        tp = int(((preds == 1) & (y_true == 1)).sum())
        fp = int(((preds == 1) & (y_true == 0)).sum())
        fn = int(((preds == 0) & (y_true == 1)).sum())

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        rows.append({
            'threshold': thresh,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'predicted_frauds': int(preds.sum()),
            'true_positives': tp,
            'false_positives': fp,
            'false_negatives': fn
        })
    return rows


# Test-set sweep: shown in the table below and reused by the plots.
results = _sweep_thresholds(y_test, y_probs, thresholds)
# Validation-set sweep: used ONLY to select the operating threshold. Picking
# the threshold that maximizes F1 on the test set and then reporting test
# metrics at that threshold would bias the reported numbers upward.
val_results = _sweep_thresholds(y_val, y_val_probs, thresholds)

print(f"\n{'Thresh':<8} {'Precision':<12} {'Recall':<10} {'F1':<10} {'Pred Frauds':<14} {'TP':<10} {'FP':<12} {'FN':<10}")
print("-" * 100)
for r in results:
    print(f"{r['threshold']:<8.2f} {r['precision']:<12.4f} {r['recall']:<10.4f} {r['f1']:<10.4f} {r['predicted_frauds']:<14,} {r['true_positives']:<10,} {r['false_positives']:<12,} {r['false_negatives']:<10,}")

# Find optimal threshold: max F1 on validation (first one wins on ties), then
# report the held-out test metrics at that same threshold.
best_idx = max(range(len(thresholds)), key=lambda i: val_results[i]['f1'])
best_val_result = val_results[best_idx]
best_result = results[best_idx]
print(f"\n✓ Optimal Threshold (Max F1): {best_result['threshold']}")
print(f"  Selected on validation set (validation F1: {best_val_result['f1']:.4f})")
print("  Test-set metrics at this threshold:")
print(f"  Precision: {best_result['precision']:.4f}")
print(f"  Recall: {best_result['recall']:.4f}")
print(f"  F1: {best_result['f1']:.4f}")
print(f"  False Positives: {best_result['false_positives']:,}")

# ===== 8. DETAILED EVALUATION =====
chosen_threshold = best_result['threshold']

print("\n" + "="*70)
print(f"DETAILED EVALUATION AT THRESHOLD = {chosen_threshold}")
print("="*70)

# Hard 0/1 labels obtained by cutting the probabilities at the chosen threshold.
optimal_preds = (y_probs >= chosen_threshold).astype(int)

print("\nClassification Report:")
report_text = classification_report(
    y_test,
    optimal_preds,
    target_names=['Non-Fraud', 'Fraud'],
    digits=4,
)
print(report_text)

# Threshold-independent metrics computed from the raw probabilities.
roc_auc = roc_auc_score(y_test, y_probs)
pr_auc = average_precision_score(y_test, y_probs)
for metric_label, metric_value in (("ROC-AUC", roc_auc), ("PR-AUC", pr_auc)):
    print(f"{metric_label} Score: {metric_value:.4f}")

# ===== 9. FEATURE IMPORTANCE =====
print("\n" + "="*70)
print("FEATURE IMPORTANCE")
print("="*70)

# Gain-based scores as returned by the booster: {feature name: gain}.
importance = model.get_score(importance_type='gain')
importance_df = pd.DataFrame({
    'feature': list(importance.keys()),
    'importance': list(importance.values()),
})
# Highest-gain features first.
importance_df = importance_df.sort_values('importance', ascending=False)

print("\nTop 10 Features:")
top_ten = importance_df.head(10)
for feature_name, gain in zip(top_ten['feature'], top_ten['importance']):
    print(f"  {feature_name}: {gain:.2f}")

# ===== 10. VISUALIZATIONS =====
# One 2x3 dashboard: confusion matrix, ROC, PR curve, metrics-vs-threshold,
# top feature gains, and score distributions per class.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Plot 1: Confusion Matrix
cm = confusion_matrix(y_test, optimal_preds)
sns.heatmap(cm, annot=True, fmt=',d', cmap='Blues', ax=axes[0, 0],
            xticklabels=['Non-Fraud', 'Fraud'],
            yticklabels=['Non-Fraud', 'Fraud'])
axes[0, 0].set_ylabel('Actual')
axes[0, 0].set_xlabel('Predicted')
axes[0, 0].set_title(f'Confusion Matrix (threshold={best_result["threshold"]})', fontweight='bold')

# Plot 2: ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_probs)
axes[0, 1].plot(fpr, tpr, linewidth=2, label=f'XGBoost (AUC={roc_auc:.3f})')
axes[0, 1].plot([0, 1], [0, 1], 'k--', label='Random')
axes[0, 1].set_xlabel('False Positive Rate')
axes[0, 1].set_ylabel('True Positive Rate')
axes[0, 1].set_title('ROC Curve', fontweight='bold')
axes[0, 1].legend()
axes[0, 1].grid(alpha=0.3)

# Plot 3: Precision-Recall Curve
precisions_curve, recalls_curve, _ = precision_recall_curve(y_test, y_probs)
axes[0, 2].plot(recalls_curve, precisions_curve, linewidth=2, label=f'XGBoost (AP={pr_auc:.3f})')
# Baseline precision equals the positive-class share of the test labels.
axes[0, 2].axhline(y=y_test.mean(), color='k', linestyle='--', label=f'Baseline ({y_test.mean():.3f})')
axes[0, 2].set_xlabel('Recall')
axes[0, 2].set_ylabel('Precision')
axes[0, 2].set_title('Precision-Recall Curve', fontweight='bold')
axes[0, 2].legend()
axes[0, 2].grid(alpha=0.3)

# Plot 4: Threshold vs Metrics
thresholds_plot = [r['threshold'] for r in results]
precisions_plot = [r['precision'] for r in results]
recalls_plot = [r['recall'] for r in results]
f1s_plot = [r['f1'] for r in results]

axes[1, 0].plot(thresholds_plot, precisions_plot, 'b-o', label='Precision', linewidth=2)
axes[1, 0].plot(thresholds_plot, recalls_plot, 'g-s', label='Recall', linewidth=2)
axes[1, 0].plot(thresholds_plot, f1s_plot, 'r-^', label='F1', linewidth=2)
axes[1, 0].axvline(x=best_result['threshold'], color='black', linestyle='--', label=f'Optimal ({best_result["threshold"]})')
axes[1, 0].set_xlabel('Threshold')
axes[1, 0].set_ylabel('Score')
axes[1, 0].set_title('Metrics vs Threshold', fontweight='bold')
axes[1, 0].legend()
axes[1, 0].grid(alpha=0.3)

# Plot 5: Feature Importance
top_features = importance_df.head(10)
axes[1, 1].barh(top_features['feature'], top_features['importance'], color='steelblue')
axes[1, 1].set_xlabel('Importance (Gain)')
axes[1, 1].set_title('Top 10 Feature Importance', fontweight='bold')
axes[1, 1].invert_yaxis()  # largest bar on top

# Plot 6: Probability Distribution
axes[1, 2].hist(y_probs[y_test == 0], bins=50, alpha=0.7, label='Non-Fraud', color='#2ecc71', density=True)
axes[1, 2].hist(y_probs[y_test == 1], bins=50, alpha=0.7, label='Fraud', color='#e74c3c', density=True)
axes[1, 2].axvline(x=best_result['threshold'], color='black', linestyle='--', label=f'Threshold ({best_result["threshold"]})')
axes[1, 2].set_xlabel('Predicted Probability')
axes[1, 2].set_ylabel('Density')
axes[1, 2].set_title('Probability Distribution by Class', fontweight='bold')
axes[1, 2].legend()

plt.tight_layout()
# savefig does not create missing directories; make sure 'outputs/' exists so
# the run does not fail with FileNotFoundError after training has finished.
figure_path = Path('outputs/xgboost_evaluation.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

# ===== 11. SAVE MODEL =====
# save_model does not create missing directories; create 'models/' first so the
# trained booster is not lost to a missing-directory error at the very end.
model_path = Path('models/xgboost_fraud_model.json')
model_path.parent.mkdir(parents=True, exist_ok=True)
model.save_model(str(model_path))
print("\n✓ Model saved: models/xgboost_fraud_model.json")
print("✓ Visualization saved: outputs/xgboost_evaluation.png")

# ===== 12. COMPARISON SUMMARY =====
for banner_line in (
    "\n" + "="*70,
    "MODEL COMPARISON: Neural Network vs XGBoost",
    "="*70,
):
    print(banner_line)

# The Neural Network column holds fixed reference figures written into this
# cell; the XGBoost column is filled from the metrics computed in this run.
comparison_table = f"""
                    Neural Network    XGBoost
                    --------------    -------
ROC-AUC:            0.5868            {roc_auc:.4f}
PR-AUC:             0.0434            {pr_auc:.4f}
Best F1:            0.0832            {best_result['f1']:.4f}
Precision:          4.37%             {best_result['precision']*100:.2f}%
Recall:             86.32%            {best_result['recall']*100:.2f}%
False Positives:    508,936           {best_result['false_positives']:,}
"""
print(comparison_table)

In [1]:
import pandas as pd
# Quick sanity check on the raw dataset: parse only the first 100K rows to keep it fast.
df_sample = pd.read_csv('financial_fraud_detection_dataset.csv', nrows=100000)
print("All columns:", list(df_sample.columns))

print("\nSample fraud vs non-fraud comparison:")
# Per-class means of the numeric columns (first 100K rows, not a random sample).
class_means = df_sample.groupby('is_fraud').mean(numeric_only=True)
print(class_means)

All columns: ['transaction_id', 'timestamp', 'sender_account', 'receiver_account', 'amount', 'transaction_type', 'merchant_category', 'location', 'device_used', 'is_fraud', 'fraud_type', 'time_since_last_transaction', 'spending_deviation_score', 'velocity_score', 'geo_anomaly_score', 'payment_channel', 'ip_address', 'device_hash']

Sample fraud vs non-fraud comparison:
              amount  time_since_last_transaction  spending_deviation_score  \
is_fraud                                                                      
False     357.420245                    -4.924667                  0.002159   
True      397.088824                  -139.688917                 -0.134748   

          velocity_score  geo_anomaly_score  
is_fraud                                     
False          10.465738           0.500260  
True           10.151261           0.522773  
