In [1]:
# train_fraud_models.py
import yaml
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix,
    roc_curve, precision_recall_curve
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from pathlib import Path
import sys
# Make project-local packages importable by putting the project root first on
# sys.path. The root is taken to be the parent of the current working
# directory; use `.parent.parent` instead if the notebook lives one level deeper.
_notebook_dir = Path.cwd().resolve()
PROJECT_ROOT = _notebook_dir.parent
sys.path[:0] = [str(PROJECT_ROOT)]

# Placeholder for custom preprocessing helpers from lib/functions.py (nothing is imported yet).

# from lib.functions import ()



# Load the YAML configuration that names the input CSV file, then read that CSV.
CONFIG_PATH = Path("../config.yaml")

try:
    with open(CONFIG_PATH, "r") as file:
        config = yaml.safe_load(file)
except FileNotFoundError as exc:
    # Stop here with an actionable message. Carrying on with config=None would
    # only surface on the next statement as an unrelated
    # "'NoneType' object is not subscriptable" TypeError.
    raise FileNotFoundError(
        f"Yaml configuration file not found: {CONFIG_PATH.resolve()}"
    ) from exc

# Any other failure (malformed YAML, permission error, ...) propagates unchanged
# rather than being reported as a missing file.

# The CSV location is read from the `input_data.file1` entry of the config.
df_fraud_dataset = pd.read_csv(config['input_data']['file1'])

In [2]:
# -----------------------
# Configuration
# -----------------------
TARGET_COL = "Fraud_Label"
TIMESTAMP_COL = "Timestamp"  # when this column exists, hour / day-of-week are derived from it

# Output locations: models and plots live in sub-folders of ARTIFACT_DIR.
ARTIFACT_DIR = Path("./artifacts_proto")
MODELS_DIR = ARTIFACT_DIR / "models"
PLOTS_DIR = ARTIFACT_DIR / "plots"
for _out_dir in (ARTIFACT_DIR, MODELS_DIR, PLOTS_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Upper bound on the number of rows used (drawn as a stratified sample) so runs
# stay fast. Set to None to use the full dataset.
MAX_ROWS = 3000

# Compact feature subset. Only the names that actually exist in the dataset are
# used downstream.
_NUMERIC_KEEP = [
    "Transaction_Amount", "Account_Balance", "IP_Address_Flag",
    "Previous_Fraudulent_Activity", "Daily_Transaction_Count",
    "Avg_Transaction_Amount_7d", "Failed_Transaction_Count_7d",
    "Card_Age", "Transaction_Distance", "Risk_Score", "Is_Weekend",
]
# Categorical columns (typically low / medium cardinality).
_CATEGORICAL_KEEP = [
    "Transaction_Type", "Device_Type", "Card_Type", "Authentication_Method",
]
KEEP_COLS = _NUMERIC_KEEP + _CATEGORICAL_KEEP

In [3]:
# -----------------------
# Utilities
# -----------------------
def safe_onehot(*, handle_unknown="ignore", prefer_dense=True):
    """
    Build a OneHotEncoder that works across scikit-learn versions.

    Parameters (keyword-only)
    -------------------------
    handle_unknown : forwarded unchanged to OneHotEncoder.
    prefer_dense : when truthy, request dense output; otherwise the encoder is
        created with its default output setting.

    Dense output is requested with ``sparse_output=False`` (newer sklearn); if
    that keyword is rejected with a TypeError, ``sparse=False`` (older sklearn)
    is used instead.
    """
    if not prefer_dense:
        return OneHotEncoder(handle_unknown=handle_unknown)

    shared_kwargs = {"handle_unknown": handle_unknown}
    try:
        return OneHotEncoder(sparse_output=False, **shared_kwargs)
    except TypeError:
        # The installed version does not know `sparse_output`; use the old name.
        return OneHotEncoder(sparse=False, **shared_kwargs)

def compute_metrics(y_true, y_pred, y_proba):
    """
    Collect the evaluation metrics for one model into a dict.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels, used for accuracy / precision / recall / F1.
    y_proba : scores passed to ROC-AUC and average precision ("pr_auc").

    Returns
    -------
    dict with keys accuracy, precision, recall, f1, roc_auc, pr_auc,
    n_test (number of labels, as int) and positive_rate_test (mean of
    y_true, as float).
    """
    metrics = {"accuracy": accuracy_score(y_true, y_pred)}

    # zero_division=0 is passed to each of these three label-based scorers.
    label_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for key, scorer in label_scorers:
        metrics[key] = scorer(y_true, y_pred, zero_division=0)

    metrics["roc_auc"] = roc_auc_score(y_true, y_proba)
    metrics["pr_auc"] = average_precision_score(y_true, y_proba)
    metrics["n_test"] = int(len(y_true))
    metrics["positive_rate_test"] = float(np.mean(y_true))
    return metrics

def plot_and_save_cm(cm, title, out_path):
    """
    Draw a confusion matrix as an annotated image and save it to disk.

    Parameters
    ----------
    cm : 2-D array of cell values; rows are labelled "True" and columns
        "Predicted".
    title : figure title.
    out_path : destination passed to ``savefig``.

    The figure is closed even if drawing or saving raises, so a failed save
    does not leave an open figure behind. The exception itself propagates.
    """
    fig, ax = plt.subplots()
    try:
        ax.imshow(cm, interpolation="nearest")
        ax.set_title(title)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        # Write each cell's value at x = column index j, y = row index i.
        for (i, j), cell_value in np.ndenumerate(cm):
            ax.text(j, i, str(cell_value), ha="center", va="center")
        fig.tight_layout()
        fig.savefig(out_path, bbox_inches="tight")
    finally:
        plt.close(fig)

def plot_and_save_curve(x, y, xlab, ylab, title, out_path):
    """
    Plot a single line (y against x) and save the figure to disk.

    Parameters
    ----------
    x, y : values for the horizontal and vertical axes.
    xlab, ylab : axis labels.
    title : figure title.
    out_path : destination passed to ``savefig``.

    The figure is closed even if drawing or saving raises, so a failed save
    does not leave an open figure behind. The exception itself propagates.
    """
    fig, ax = plt.subplots()
    try:
        ax.plot(x, y)
        ax.set_title(title)
        ax.set_xlabel(xlab)
        ax.set_ylabel(ylab)
        fig.tight_layout()
        fig.savefig(out_path, bbox_inches="tight")
    finally:
        plt.close(fig)

In [5]:
# -----------------------
# Load data
# -----------------------
assert TARGET_COL in df_fraud_dataset.columns, f"{TARGET_COL} not found."

# Use only the KEEP_COLS entries that actually exist in the dataset.
feature_cols = [c for c in KEEP_COLS if c in df_fraud_dataset.columns]
X = df_fraud_dataset[feature_cols].copy()
y = df_fraud_dataset[TARGET_COL].astype(int)

# Optional: derive hour-of-day and day-of-week features when a timestamp column exists.
if TIMESTAMP_COL in df_fraud_dataset.columns:
    # errors="coerce": unparseable timestamps become NaT instead of raising.
    ts = pd.to_datetime(df_fraud_dataset[TIMESTAMP_COL], errors="coerce")
    X["tx_hour"] = ts.dt.hour
    X["tx_dow"] = ts.dt.dayofweek
    for derived_col in ("tx_hour", "tx_dow"):
        if derived_col not in feature_cols:
            feature_cols.append(derived_col)

# Stratified sample for speed: draw the same fraction from every target class.
if MAX_ROWS and len(df_fraud_dataset) > MAX_ROWS:
    frac = MAX_ROWS / len(df_fraud_dataset)
    # `idx` holds row positions, which are valid for .iloc whatever the index labels are.
    # The helper column is selected *before* apply so the lambda only sees
    # positions and never the grouping column. Applying to the whole grouped
    # frame, grouping column included, is deprecated in newer pandas releases.
    # Each class is still sampled with random_state=42, as before.
    idx = (
        df_fraud_dataset[[TARGET_COL]]
        .assign(idx=np.arange(len(df_fraud_dataset)))
        .groupby(TARGET_COL, group_keys=False)["idx"]
        .apply(lambda positions: positions.sample(frac=frac, random_state=42))
        .to_numpy()
    )
    X = X.iloc[idx].reset_index(drop=True)
    y = y.iloc[idx].reset_index(drop=True)

# Hold out 20% of the rows for testing; stratify=y keeps the class ratio of y
# in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

# Partition the feature names by dtype: numeric columns vs. everything else.
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

# Numeric branch: fill gaps with the median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", safe_onehot()),
])

# Route each column group through its branch. Columns in neither list are dropped.
preprocess = ColumnTransformer(
    [
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ],
    remainder="drop",
)

# -----------------------
# Models to train
# -----------------------
_log_reg = LogisticRegression(max_iter=300, class_weight="balanced")
_rand_forest = RandomForestClassifier(
    n_estimators=100, random_state=42, n_jobs=-1, class_weight="balanced"
)
models = {
    "LogisticRegression": _log_reg,
    "RandomForest": _rand_forest,
}

results = []      # one metrics dict per model
pred_cache = {}   # model name -> (predicted labels, positive-class scores)

# NOTE(review): every pipeline below receives the same `preprocess` object, so
# each fit presumably re-fits that shared instance in place. This looks harmless
# here because both fits use X_train — confirm before reusing the pattern.
for name, estimator in models.items():
    pipe = Pipeline(
        [("preprocess", preprocess), ("model", estimator)]
    ).fit(X_train, y_train)

    # Hard labels, plus a score for the positive class (column 1 of predict_proba).
    y_pred = pipe.predict(X_test)
    y_proba = (
        pipe.predict_proba(X_test)[:, 1]
        if hasattr(pipe.named_steps["model"], "predict_proba")
        else y_pred.astype(float)  # fallback: use the hard labels as scores
    )

    # Record this model's metrics under its name.
    results.append({"model": name, **compute_metrics(y_test, y_pred, y_proba)})

    # Persist the fitted pipeline (preprocessing + model) as one file.
    joblib.dump(pipe, MODELS_DIR / f"{name}.pkl")

    # Keep the predictions so the best model's artifacts can be built later.
    pred_cache[name] = (y_pred, y_proba)

# Rank the models by PR-AUC, breaking ties by ROC-AUC and then F1 (all
# descending), and save the comparison table.
_ranking_keys = ["pr_auc", "roc_auc", "f1"]
cmp_df = (
    pd.DataFrame(results)
    .sort_values(by=_ranking_keys, ascending=False)
    .reset_index(drop=True)
)
cmp_df.to_csv(ARTIFACT_DIR / "model_comparison.csv", index=False)

# The best model is the first row of the ranked table.
# NOTE(review): the recorded run shows RandomForest scoring 1.0 on every metric.
# That may mean one of the features encodes the label — TODO audit KEEP_COLS
# before trusting this ranking.
best_model = cmp_df.loc[0, "model"]
y_pred_best, y_proba_best = pred_cache[best_model]

# Test-set features alongside the true label, the score and the predicted label.
scored = X_test.assign(
    Fraud_Label_true=y_test.values,
    score_proba=y_proba_best,
    pred_label=y_pred_best,
)
scored.to_csv(ARTIFACT_DIR / f"test_scores_{best_model}.csv", index=False)

# Confusion matrix for the best model.
cm = confusion_matrix(y_test, y_pred_best)
plot_and_save_cm(
    cm,
    title=f"{best_model} - Confusion Matrix",
    out_path=PLOTS_DIR / f"confusion_matrix_{best_model}.png",
)

# ROC curve for the best model.
fpr, tpr, _ = roc_curve(y_test.values, y_proba_best)
plot_and_save_curve(
    fpr,
    tpr,
    xlab="False Positive Rate",
    ylab="True Positive Rate",
    title=f"ROC Curve - {best_model}",
    out_path=PLOTS_DIR / f"roc_{best_model}.png",
)

# Precision-recall curve for the best model (recall on x, precision on y).
prec, rec, _ = precision_recall_curve(y_test.values, y_proba_best)
plot_and_save_curve(
    rec,
    prec,
    xlab="Recall",
    ylab="Precision",
    title=f"Precision-Recall Curve - {best_model}",
    out_path=PLOTS_DIR / f"pr_{best_model}.png",
)

print("Artifacts saved to:", ARTIFACT_DIR.resolve())
print(cmp_df)


  .apply(lambda x: x.sample(frac=frac, random_state=42)))["idx"].values


Artifacts saved to: D:\vscode101\W7PJ_Project_Fraud_Detection_Transactions\notebooks\artifacts_proto
                model  accuracy  precision    recall        f1   roc_auc  \
0        RandomForest  1.000000      1.000  1.000000  1.000000  1.000000   
1  LogisticRegression  0.781667      0.625  0.803109  0.702948  0.885348   

     pr_auc  n_test  positive_rate_test  
0  1.000000     600            0.321667  
1  0.793421     600            0.321667  
