# Model Building

In [2]:
import json
import os
import pickle
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings("ignore")

In [3]:
# Load the processed dataset, then separate the target from the predictors.
df = pd.read_parquet('../data/processed/creditcard_2023_processed.parquet')
y = df['Class']
# 'id' and the target column 'Class' are both excluded from the feature matrix.
x = df.drop(columns=['id', 'Class'])

In [19]:
def save_model(model, model_name, folder_path='../models'):
    """Pickle ``model`` to ``<folder_path>/<model_name>.pkl``.

    Args:
        model: Object to serialize with ``pickle`` (here, a fitted estimator).
        model_name: Base name of the output file, without extension.
        folder_path: Destination directory; created, including parents, when
            it does not exist yet.

    An existing file with the same name is overwritten. The final path is
    printed once the file has been written.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory in between.
    os.makedirs(folder_path, exist_ok=True)

    # Construct the file path
    file_path = os.path.join(folder_path, f"{model_name}.pkl")

    # Save the model to the file
    with open(file_path, 'wb') as file:
        pickle.dump(model, file)

    print(f"Model saved to {file_path}")

def save_metrics(metrics, model_name, folder_path='../models'):
    """Write ``metrics`` as JSON to ``<folder_path>/<model_name>_metrics.json``.

    Args:
        metrics: JSON-serializable object (here, a dict of metric name to score).
        model_name: Base name of the output file; ``_metrics.json`` is appended.
        folder_path: Destination directory; created, including parents, when
            it does not exist yet.

    An existing file with the same name is overwritten. The final path is
    printed once the file has been written.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory in between.
    os.makedirs(folder_path, exist_ok=True)

    # Construct the file path
    file_path = os.path.join(folder_path, f"{model_name}_metrics.json")

    # Save the metrics to the JSON file
    with open(file_path, 'w') as file:
        json.dump(metrics, file, indent=4)

    print(f"Metrics saved to {file_path}")

def evaluate_model(model_name, model, params, X_train, y_train, X_val, y_val, n_iter=10):
    """Tune a model with RandomizedSearchCV, score it on the validation split, and save it.

    Args:
        model_name: Display name used in the printed report; its lower-cased,
            underscore-separated form is the base name of the saved files.
        model: Unfitted estimator handed to ``RandomizedSearchCV``.
        params: Hyperparameter search space for ``RandomizedSearchCV``.
        X_train, y_train: Data the 5-fold randomized search is fitted on.
        X_val, y_val: Held-out data used for the reported metrics.
        n_iter: Number of parameter settings sampled by the search.

    Returns:
        Tuple ``(best_model, metrics)``: the search's ``best_estimator_`` and a
        dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1_score``
        (the last three weighted-averaged across classes).

    Requires ``precision_score``, ``recall_score`` and ``f1_score`` from
    ``sklearn.metrics`` to be imported at module level.
    """
    random_search = RandomizedSearchCV(
        model,
        params,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        n_iter=n_iter,
        random_state=42,
    )
    random_search.fit(X_train, y_train)

    best_model = random_search.best_estimator_
    y_val_pred = best_model.predict(X_val)

    accuracy = accuracy_score(y_val, y_val_pred)
    metrics = {
        'accuracy': accuracy,
        'precision': precision_score(y_val, y_val_pred, average='weighted'),
        'recall': recall_score(y_val, y_val_pred, average='weighted'),
        'f1_score': f1_score(y_val, y_val_pred, average='weighted'),
    }

    print(f'\n------- {model_name} Validation Evaluation ---------')
    print(f'Best parameters: {random_search.best_params_}')
    print(f'Model Accuracy: {accuracy:.4f}')
    print('Confusion Matrix:\n', confusion_matrix(y_val, y_val_pred))
    print('Classification Report:\n', classification_report(y_val, y_val_pred))

    # Save the model and metrics under one shared file-name slug,
    # e.g. "Random Forest" -> "random_forest".
    file_stem = model_name.replace(" ", "_").lower()
    save_model(best_model, file_stem)
    save_metrics(metrics, file_stem)

    return best_model, metrics

# preparing data for training, testing and validation
# NOTE(review): prepare_data is not defined in the cells shown here; it must
# already exist in the session and return the six splits in this order.
X_train, y_train, X_val, y_val, X_test, y_test = prepare_data(x, y)

# models: (display name, unfitted estimator, hyperparameter search space)
models = [
    ("Logistic Regression", LogisticRegression(), {'C': [0.1, 1, 10]}),
    ("Decision Tree", DecisionTreeClassifier(), {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]}),
    ("Random Forest", RandomForestClassifier(), {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    }),
    # 'logloss' is the binary log-loss; the previous 'mlogloss' is the
    # multiclass variant and does not match the two-class (0/1) target.
    ("XGBoost", xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'), {
        'max_depth': [3, 5, 7],
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2]
    }),
    ("Support Vector Machine", SVC(), {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    })
]

# train, evaluate, and save each model
# After the loop, best_model and metrics hold the results of the last model only.
for model_name, model, params in models:
    best_model, metrics = evaluate_model(model_name, model, params, X_train, y_train, X_val, y_val)

print("\nAll models trained, evaluated, and saved.")


------- Logistic Regression Validation Evaluation ---------
Best parameters: {'C': 1}
Model Accuracy: 0.9635
Confusion Matrix:
 [[27867   564]
 [ 1514 26918]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.96     28431
           1       0.98      0.95      0.96     28432

    accuracy                           0.96     56863
   macro avg       0.96      0.96      0.96     56863
weighted avg       0.96      0.96      0.96     56863

Model saved to ../models\logistic_regression.pkl
Metrics saved to ../models\logistic_regression_metrics.json

------- Decision Tree Validation Evaluation ---------
Best parameters: {'min_samples_split': 2, 'max_depth': None}
Model Accuracy: 0.9695
Confusion Matrix:
 [[27461   970]
 [  763 27669]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.97      0.97     28431
           1       0.97      0.97      0.97     28432

   

# Based on the results:

All models performed well, with accuracies ranging from 96.35% to 99.25%.
XGBoost achieved the highest accuracy (99.25%), followed closely by SVM (99.16%) and Random Forest (98.57%).
Decision Tree (96.95%) and Logistic Regression (96.35%) performed adequately but were outperformed by the more complex models.
All models showed good balance between precision and recall for both classes.

Taken together, XGBoost appears to be the most effective for this credit card fraud detection task, but model choice should also consider factors like interpretability and computational resources. For optimal fraud detection, XGBoost or SVM would be recommended based on these results.