In [1]:
# Standard imports
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    average_precision_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_curve,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

# Notebook-wide display and plotting defaults.
# Let pandas print up to 100 columns before it starts eliding them.
pd.options.display.max_columns = 100
# Apply the matplotlib style sheet first; the seaborn palette is set afterwards
# so that it determines the colour cycle used by later plots.
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

In [2]:
# Load the two pre-processed datasets (paths are relative to this notebook).
fraud_data = pd.read_csv('../../data/processed/fraud_data_with_features.csv')
credit_data = pd.read_csv('../../data/processed/creditcard_clean.csv')

# Report (rows, columns) for each frame as a quick sanity check on the load.
for shape_label, loaded_frame in (
    ('Fraud_Data shape:', fraud_data),
    ('Credit card data shape:', credit_data),
):
    print(shape_label, loaded_frame.shape)

Fraud_Data shape: (129146, 27)
Credit card data shape: (283726, 31)


In [3]:
# Fraud_Data: explicit list of model input columns; the label column is 'class'.
fraud_feature_columns = [
    'purchase_value',
    'age',
    'time_since_signup',
    'hour_of_day',
    'day_of_week',
    'user_transaction_count',
    'device_usage_count',
    'ip_usage_count',
    'country_transaction_count',
    'source_encoded',
    'browser_encoded',
    'sex_encoded',
    'country_encoded',
]
# Drop any row that is missing one of the selected features or the label.
fraud_required_columns = [*fraud_feature_columns, 'class']
fraud_data_clean = fraud_data.dropna(subset=fraud_required_columns)
X_fraud = fraud_data_clean.loc[:, fraud_feature_columns]
y_fraud = fraud_data_clean.loc[:, 'class']
print('Fraud_Data features shape:', X_fraud.shape)
print('Fraud_Data target shape:', y_fraud.shape)
print('Fraud_Data class distribution:')
print(y_fraud.value_counts(normalize=True))

# Credit card: every column except 'Time' and the label 'Class' is a feature.
credit_non_feature_columns = ('Time', 'Class')
credit_feature_columns = [
    column_name
    for column_name in credit_data.columns
    if column_name not in credit_non_feature_columns
]
X_credit = credit_data.loc[:, credit_feature_columns]
y_credit = credit_data.loc[:, 'Class']
print('Credit card features shape:', X_credit.shape)
print('Credit card target shape:', y_credit.shape)
print('Credit card class distribution:')
print(y_credit.value_counts(normalize=True))

Fraud_Data features shape: (129146, 13)
Fraud_Data target shape: (129146,)
Fraud_Data class distribution:
class
0    0.905007
1    0.094993
Name: proportion, dtype: float64
Credit card features shape: (283726, 29)
Credit card target shape: (283726,)
Credit card class distribution:
Class
0    0.998333
1    0.001667
Name: proportion, dtype: float64


In [4]:
def _split_scale_balance(X, y, test_size=0.2, random_state=42):
    """Split one dataset, standardise it and SMOTE-balance the training part.

    The steps run in this order so that nothing from the test split influences
    training:

    1. stratified train/test split (``stratify=y``),
    2. ``StandardScaler`` fitted on the training split only, then applied to
       both splits,
    3. ``SMOTE`` applied to the scaled training split only; the test split is
       never resampled.

    Parameters
    ----------
    X, y : features and labels accepted by ``train_test_split``.
    test_size : fraction of rows held out for testing.
    random_state : seed used for both the split and SMOTE.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, scaler, X_train_scaled,
        X_test_scaled, smote, X_train_balanced, y_train_balanced)``
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    smote = SMOTE(random_state=random_state)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)
    return (
        X_train, X_test, y_train, y_test,
        scaler, X_train_scaled, X_test_scaled,
        smote, X_train_balanced, y_train_balanced,
    )


# Fraud_Data
(
    X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test,
    scaler_fraud, X_fraud_train_scaled, X_fraud_test_scaled,
    smote_fraud, X_fraud_train_balanced, y_fraud_train_balanced,
) = _split_scale_balance(X_fraud, y_fraud)
print('Fraud_Data - Training set shape:', X_fraud_train_balanced.shape)
print('Fraud_Data - Test set shape:', X_fraud_test_scaled.shape)
print('Fraud_Data - Balanced training class distribution:')
print(pd.Series(y_fraud_train_balanced).value_counts(normalize=True))

# Credit card
(
    X_credit_train, X_credit_test, y_credit_train, y_credit_test,
    scaler_credit, X_credit_train_scaled, X_credit_test_scaled,
    smote_credit, X_credit_train_balanced, y_credit_train_balanced,
) = _split_scale_balance(X_credit, y_credit)
print('Credit card - Training set shape:', X_credit_train_balanced.shape)
print('Credit card - Test set shape:', X_credit_test_scaled.shape)
print('Credit card - Balanced training class distribution:')
print(pd.Series(y_credit_train_balanced).value_counts(normalize=True))

Fraud_Data - Training set shape: (187004, 13)
Fraud_Data - Test set shape: (25830, 13)
Fraud_Data - Balanced training class distribution:
class
0    0.5
1    0.5
Name: proportion, dtype: float64
Credit card - Training set shape: (453204, 29)
Credit card - Test set shape: (56746, 29)
Credit card - Balanced training class distribution:
Class
0    0.5
1    0.5
Name: proportion, dtype: float64


In [6]:
# Candidate classifiers, keyed by the display name used in the printed reports.
# Both are seeded so that repeated runs give the same fitted models.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
}

def evaluate_model(model, X_train, y_train, X_test, y_test, model_name, dataset_name):
    """Fit a fresh copy of ``model`` and report its test-set metrics.

    The estimator passed in is cloned before fitting, so each call returns its
    own independently fitted estimator and the caller's instance is left
    untouched. Fitting ``model`` directly would make every result produced
    from the same instance share one object, which always reflects the most
    recent ``fit`` call rather than the dataset it was reported for.

    Parameters
    ----------
    model : scikit-learn style estimator
        Must support ``sklearn.base.clone`` and expose ``fit``, ``predict``
        and ``predict_proba``.
    X_train, y_train : training features and labels passed to ``fit``.
    X_test, y_test : held-out features and labels used for the metrics.
    model_name, dataset_name : str
        Labels used only in the printed report.

    Returns
    -------
    dict
        ``'model'`` (the fitted clone), ``'f1_score'``, ``'auc_pr'`` (average
        precision) and ``'confusion_matrix'``.
    """
    # Clone keeps the hyper-parameters (including random_state) but drops any
    # fitted state, so metrics are the same as fitting the original instance.
    fitted_model = clone(model)
    fitted_model.fit(X_train, y_train)

    y_pred = fitted_model.predict(X_test)
    # Second predict_proba column = probability of classes_[1], i.e. label 1
    # for 0/1 targets.
    y_pred_proba = fitted_model.predict_proba(X_test)[:, 1]

    f1 = f1_score(y_test, y_pred)
    auc_pr = average_precision_score(y_test, y_pred_proba)
    cm = confusion_matrix(y_test, y_pred)

    print(f'\n{model_name} on {dataset_name}:')
    print(f'F1-Score: {f1:.4f}')
    print(f'AUC-PR: {auc_pr:.4f}')
    print('Confusion Matrix:')
    print(cm)
    return {'model': fitted_model, 'f1_score': f1, 'auc_pr': auc_pr, 'confusion_matrix': cm}

In [7]:
# Fraud_Data: fit every candidate on the SMOTE-balanced training split and
# score it on the scaled, un-resampled test split.
fraud_results = {}
for name, model in models.items():
    fraud_results[name] = evaluate_model(
        model=model,
        X_train=X_fraud_train_balanced,
        y_train=y_fraud_train_balanced,
        X_test=X_fraud_test_scaled,
        y_test=y_fraud_test,
        model_name=name,
        dataset_name='Fraud_Data',
    )


Logistic Regression on Fraud_Data:
F1-Score: 0.6243
AUC-PR: 0.6590
Confusion Matrix:
[[21998  1378]
 [  715  1739]]

Random Forest on Fraud_Data:
F1-Score: 0.6988
AUC-PR: 0.7085
Confusion Matrix:
[[23277    99]
 [ 1083  1371]]


In [8]:
# creditcard: fit every candidate on the SMOTE-balanced training split and
# score it on the scaled, un-resampled test split.
credit_results = {}
for name, model in models.items():
    credit_results[name] = evaluate_model(
        model=model,
        X_train=X_credit_train_balanced,
        y_train=y_credit_train_balanced,
        X_test=X_credit_test_scaled,
        y_test=y_credit_test,
        model_name=name,
        dataset_name='creditcard',
    )


Logistic Regression on creditcard:
F1-Score: 0.0993
AUC-PR: 0.6763
Confusion Matrix:
[[55158  1493]
 [   12    83]]

Random Forest on creditcard:
F1-Score: 0.8202
AUC-PR: 0.8110
Confusion Matrix:
[[56641    10]
 [   22    73]]


In [9]:
# Compare models: one row per (dataset, model) pair, datasets in listed order.
results_by_dataset = (('Fraud_Data', fraud_results), ('creditcard', credit_results))
comparison_data = [
    {
        'Dataset': dataset_name,
        'Model': model_name,
        'F1-Score': result['f1_score'],
        'AUC-PR': result['auc_pr'],
    }
    for dataset_name, results in results_by_dataset
    for model_name, result in results.items()
]
comparison_df = pd.DataFrame(comparison_data)
print('Model Performance Comparison:')
print(comparison_df)

# Report, per dataset, which model scores highest on each metric.
for dataset in ('Fraud_Data', 'creditcard'):
    dataset_results = comparison_df.loc[comparison_df['Dataset'] == dataset]
    # idxmax returns the row label of the maximum, which .loc can look up directly.
    best_f1_idx = dataset_results['F1-Score'].idxmax()
    best_auc_idx = dataset_results['AUC-PR'].idxmax()
    print(f'\n{dataset}:')
    print(f'Best F1-Score: {dataset_results.loc[best_f1_idx, "Model"]} ({dataset_results.loc[best_f1_idx, "F1-Score"]:.4f})')
    print(f'Best AUC-PR: {dataset_results.loc[best_auc_idx, "Model"]} ({dataset_results.loc[best_auc_idx, "AUC-PR"]:.4f})')

Model Performance Comparison:
      Dataset                Model  F1-Score    AUC-PR
0  Fraud_Data  Logistic Regression  0.624304  0.658970
1  Fraud_Data        Random Forest  0.698777  0.708472
2  creditcard  Logistic Regression  0.099342  0.676257
3  creditcard        Random Forest  0.820225  0.811033

Fraud_Data:
Best F1-Score: Random Forest (0.6988)
Best AUC-PR: Random Forest (0.7085)

creditcard:
Best F1-Score: Random Forest (0.8202)
Best AUC-PR: Random Forest (0.8110)


In [10]:
# Save best models
# Pick each dataset's winner from the recorded metrics instead of hard-coding a
# model name, so the saved artefact follows the results if they change.
# AUC-PR (average precision) is the selection metric because it does not
# depend on the 0.5 decision threshold that F1 is computed at.
best_fraud_name = max(fraud_results, key=lambda name: fraud_results[name]['auc_pr'])
best_credit_name = max(credit_results, key=lambda name: credit_results[name]['auc_pr'])
best_fraud_model = fraud_results[best_fraud_name]['model']
best_credit_model = credit_results[best_credit_name]['model']

# NOTE(review): both models were fitted on StandardScaler output, but
# scaler_fraud / scaler_credit are not persisted in this cell — confirm they
# are saved elsewhere, otherwise the pickled models cannot be fed raw features.
joblib.dump(best_fraud_model, '../../models/best_fraud_model.pkl')
joblib.dump(best_credit_model, '../../models/best_credit_model.pkl')
print('Best models saved to models/ directory')

Best models saved to models/ directory
