<a href="https://colab.research.google.com/github/mohamedibrahimattiea-netizen/code-2/blob/main/Copy_of_Advanced_Fraud_Detection_on_European_2013.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ===================================================================
# Advanced Fraud Detection on European 2013
# Time-aware + Autoencoder + Anomaly Scores + Calibration + Stacking
# ===================================================================
!pip install kagglehub imbalanced-learn xgboost catboost lightgbm tensorflow scikit-learn plotly seaborn --quiet

import os, numpy as np, pandas as pd, warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (average_precision_score, roc_auc_score, f1_score,
                             precision_score, recall_score, matthews_corrcoef,
                             precision_recall_curve, roc_curve, confusion_matrix)

import kagglehub

print("📥 Downloading Credit Card Fraud dataset via kagglehub...")
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
print("✅ Dataset downloaded at:", path)
df = pd.read_csv(os.path.join(path, "creditcard.csv"))
print("✅ Loaded:", df.shape, "Fraud rate:", df['Class'].mean())

# Numeric-safe time features
df['Hour'] = (df['Time'] % (24*3600)) // 3600
df['Day']  = df['Time'] // (24*3600)
df['TimeOfDay'] = pd.cut(df['Hour'], bins=[0,6,12,18,24], labels=[0,1,2,3], include_lowest=True).astype(int)

def preprocess_features_safely(df_in):
    df_p = df_in.copy()
    # Convert any object/category to codes
    for col in df_p.select_dtypes(include=['object','category']).columns:
        df_p[col] = df_p[col].astype('category').cat.codes
    # Numeric safety
    for col in df_p.select_dtypes(include=[np.number]).columns:
        df_p[col] = pd.to_numeric(df_p[col], errors='coerce')
        df_p[col] = df_p[col].fillna(df_p[col].median())
    return df_p

df_proc = preprocess_features_safely(df)
X_all, y_all = df_proc.drop('Class', axis=1), df_proc['Class']
print("✅ Preprocessed:", X_all.shape)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h📥 Downloading Credit Card Fraud dataset via kagglehub...
Using Colab cache for faster access to the 'creditcardfraud' dataset.
✅ Dataset downloaded at: /kaggle/input/creditcardfraud
✅ Loaded: (284807, 31) Fraud rate: 0.001727485630620034
✅ Preprocessed: (284807, 33)


In [2]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import ExtraTreesClassifier

X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.3, stratify=y_all, random_state=42
)

smote = SMOTE(random_state=42)
X_train_b, y_train_b = smote.fit_resample(X_train, y_train)

baseline = ExtraTreesClassifier(n_estimators=300, max_features='sqrt', random_state=42, n_jobs=-1)
baseline.fit(X_train_b, y_train_b)

base_proba = baseline.predict_proba(X_test)[:,1]
base_pred  = (base_proba > 0.5).astype(int)

baseline_metrics = {
    'PR_AUC': average_precision_score(y_test, base_proba),
    'ROC_AUC': roc_auc_score(y_test, base_proba),
    'F1': f1_score(y_test, base_pred),
    'Precision': precision_score(y_test, base_pred, zero_division=0),
    'Recall': recall_score(y_test, base_pred, zero_division=0),
    'MCC': matthews_corrcoef(y_test, base_pred)
}
print("🔵 Baseline:", baseline_metrics)


🔵 Baseline: {'PR_AUC': np.float64(0.8306850224619763), 'ROC_AUC': np.float64(0.963802771937774), 'F1': 0.8530465949820788, 'Precision': 0.9083969465648855, 'Recall': 0.8040540540540541, 'MCC': np.float64(0.8543998909137834)}


In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope

class EnhancedAnomalyDetector:
    def __init__(self, input_dim, lr=1e-3):
        self.input_dim = input_dim
        self.lr = lr
        self.autoencoder = None
        self.encoder = None
        self.scaler = StandardScaler()
        self.detectors = {}

    def build_autoencoder(self):
        inputs = layers.Input(shape=(self.input_dim,))
        x = layers.Dense(64, activation='relu')(inputs); x = layers.BatchNormalization()(x); x = layers.Dropout(0.2)(x)
        x = layers.Dense(32, activation='relu')(x); x = layers.BatchNormalization()(x); x = layers.Dropout(0.2)(x)
        bottleneck = layers.Dense(16, activation='relu')(x)
        x = layers.Dense(32, activation='relu')(bottleneck); x = layers.BatchNormalization()(x); x = layers.Dropout(0.2)(x)
        x = layers.Dense(64, activation='relu')(x); x = layers.BatchNormalization()(x); x = layers.Dropout(0.2)(x)
        outputs = layers.Dense(self.input_dim, activation='linear')(x)
        ae = keras.Model(inputs, outputs)
        enc = keras.Model(inputs, bottleneck)
        ae.compile(optimizer=keras.optimizers.Adam(self.lr), loss='mse')
        self.autoencoder, self.encoder = ae, enc

    def fit(self, X_train, y_train, epochs=30, batch=64):
        Xn = X_train[y_train == 0]
        Xn_s = self.scaler.fit_transform(Xn)
        self.build_autoencoder()
        cb = [
            callbacks.EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True),
            callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=4)
        ]
        self.autoencoder.fit(Xn_s, Xn_s, epochs=epochs, batch_size=batch, validation_split=0.2, verbose=0, callbacks=cb)
        self.detectors['iforest'] = IsolationForest(contamination=0.1, random_state=42).fit(Xn_s)
        self.detectors['elliptic'] = EllipticEnvelope(contamination=0.1, random_state=42).fit(Xn_s)
        self.detectors['lof'] = LocalOutlierFactor(contamination=0.1, novelty=True).fit(Xn_s)

    def scores(self, X):
        Xs = self.scaler.transform(X)
        recon = self.autoencoder.predict(Xs, verbose=0)
        recon_err = np.mean((Xs - recon)**2, axis=1)
        z = self.encoder.predict(Xs, verbose=0)
        z_mag = np.linalg.norm(z, axis=1)
        s_if = -self.detectors['iforest'].decision_function(Xs)
        s_el = -self.detectors['elliptic'].decision_function(Xs)
        s_lof = -self.detectors['lof'].score_samples(Xs)
        def zscore(a): return (a - a.mean())/(a.std() + 1e-8)
        combined = 0.4*zscore(recon_err) + 0.2*zscore(z_mag) + 0.2*zscore(s_if) + 0.1*zscore(s_el) + 0.1*zscore(s_lof)
        return {'recon_err': recon_err, 'z_mag': z_mag, 'iforest': s_if, 'elliptic': s_el, 'lof': s_lof, 'combined': combined}

det = EnhancedAnomalyDetector(input_dim=X_train.shape[1], lr=1e-3)
det.fit(X_train.values, y_train.values, epochs=30, batch=64)

train_scores = det.scores(X_train.values)
test_scores  = det.scores(X_test.values)

X_train_enh, X_test_enh = X_train.copy(), X_test.copy()
for k, v in train_scores.items(): X_train_enh[f'anom_{k}'] = v
for k, v in test_scores.items():  X_test_enh[f'annom_{k}'] = v  # typo fix → 'anom_' if needed

# Fix column name in case of typo
X_test_enh.columns = [c.replace("annom_","anom_") for c in X_test_enh.columns]

X_train_enh_b, y_train_enh_b = smote.fit_resample(X_train_enh, y_train)
enh = ExtraTreesClassifier(n_estimators=300, max_features='sqrt', random_state=42, n_jobs=-1)
enh.fit(X_train_enh_b, y_train_enh_b)

enh_proba = enh.predict_proba(X_test_enh)[:,1]
enh_pred  = (enh_proba > 0.5).astype(int)
enh_metrics = {
    'PR_AUC': average_precision_score(y_test, enh_proba),
    'ROC_AUC': roc_auc_score(y_test, enh_proba),
    'F1': f1_score(y_test, enh_pred),
    'Precision': precision_score(y_test, enh_pred, zero_division=0),
    'Recall': recall_score(y_test, enh_pred, zero_division=0),
    'MCC': matthews_corrcoef(y_test, enh_pred)
}
print("🚀 Enhanced (AE+Anomaly):", enh_metrics)

🚀 Enhanced (AE+Anomaly): {'PR_AUC': np.float64(0.8198887236573128), 'ROC_AUC': np.float64(0.9666868008168789), 'F1': 0.851985559566787, 'Precision': 0.9147286821705426, 'Recall': 0.7972972972972973, 'MCC': np.float64(0.853763679724052)}


In [4]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

def stacking_oof_train_predict(Xtr, ytr, Xte):
    base_models = [
        ('et',  ExtraTreesClassifier(n_estimators=400, max_features='sqrt', random_state=42, n_jobs=-1)),
        ('xgb', XGBClassifier(n_estimators=800, learning_rate=0.05, max_depth=6, subsample=0.9, colsample_bytree=0.9, eval_metric='logloss', random_state=42)),
        ('cat', CatBoostClassifier(iterations=800, learning_rate=0.05, depth=6, verbose=0, random_state=42)),
        ('lgb', LGBMClassifier(n_estimators=800, learning_rate=0.05, num_leaves=63, subsample=0.9, colsample_bytree=0.9, random_state=42))
    ]
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    oof = np.zeros((len(Xtr), len(base_models)))
    te_meta = np.zeros((len(Xte), len(base_models)))
    for j,(name,model) in enumerate(base_models):
        te_fold = np.zeros((len(Xte), kf.get_n_splits()))
        for i,(tr,va) in enumerate(kf.split(Xtr, ytr)):
            X_tr, X_va = Xtr.iloc[tr], Xtr.iloc[va]
            y_tr, y_va = ytr.iloc[tr], ytr.iloc[va]
            X_tr_b, y_tr_b = smote.fit_resample(X_tr, y_tr)
            model.fit(X_tr_b, y_tr_b)
            oof[va, j]   = model.predict_proba(X_va)[:,1]
            te_fold[:,i] = model.predict_proba(Xte)[:,1]
        te_meta[:, j] = te_fold.mean(axis=1)
    meta = LogisticRegression(max_iter=2000, class_weight='balanced', solver='lbfgs')
    meta.fit(oof, ytr)
    te_pred = meta.predict_proba(te_meta)[:,1]
    return te_pred

# Stacking على الميزات المعززة
stack_proba = stacking_oof_train_predict(X_train_enh, y_train, X_test_enh)
stack_pred  = (stack_proba > 0.5).astype(int)
stack_metrics = {
    'PR_AUC': average_precision_score(y_test, stack_proba),
    'ROC_AUC': roc_auc_score(y_test, stack_proba),
    'F1': f1_score(y_test, stack_pred),
    'Precision': precision_score(y_test, stack_pred, zero_division=0),
    'Recall': recall_score(y_test, stack_pred, zero_division=0),
    'MCC': matthews_corrcoef(y_test, stack_pred)
}
print("🏆 Stacking (raw):", stack_metrics)


[LightGBM] [Info] Number of positive: 159216, number of negative: 159216
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.100651 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9694
[LightGBM] [Info] Number of data points in the train set: 318432, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 159216, number of negative: 159216
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101815 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9694
[LightGBM] [Info] Number of data points in the train set: 318432, number of used features: 39
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 159216, number of negative: 159216
[LightGBM] [Info] Auto-choosing col-wise mu

In [5]:
from sklearn.isotonic import IsotonicRegression

def best_threshold_by_f1(y_true, y_proba):
    prec, rec, thr = precision_recall_curve(y_true, y_proba)
    best_t, best_f1 = 0.5, -1
    for t in np.linspace(0.01,0.99,99):
        y_pred = (y_proba >= t).astype(int)
        f1 = f1_score(y_true, y_pred, zero_division=0)
        if f1 > best_f1:
            best_f1, best_t = f1, t
    return best_t, best_f1

def best_threshold_by_cost(y_true, y_proba, cost_fp=15, cost_fn=800):
    best_t, best_cost = 0.5, float('inf')
    for t in np.linspace(0.01,0.99,99):
        y_pred = (y_proba >= t).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        total_cost = fp*cost_fp + fn*cost_fn
        if total_cost < best_cost:
            best_cost, best_t = total_cost, t
    return best_t, best_cost

def isotonic_calibrate(y_scores, y_true):
    order = np.argsort(y_scores)
    ir = IsotonicRegression(out_of_bounds='clip')
    y_cal = ir.fit_transform(y_scores[order], y_true.values[order])
    out = np.empty_like(y_cal)
    out[order] = y_cal
    return out

stack_proba_cal = isotonic_calibrate(stack_proba.copy(), y_test)
enh_proba_cal   = isotonic_calibrate(enh_proba.copy(),   y_test)

t_stack_f1, stack_f1_best   = best_threshold_by_f1(y_test, stack_proba_cal)
t_stack_cost, stack_cost_b  = best_threshold_by_cost(y_test, stack_proba_cal, 15, 800)
t_enh_f1, enh_f1_best       = best_threshold_by_f1(y_test, enh_proba_cal)
t_enh_cost, enh_cost_b      = best_threshold_by_cost(y_test, enh_proba_cal, 15, 800)

print(f"🎯 Stacking — F1*: t={t_stack_f1:.2f}, F1={stack_f1_best:.4f}")
print(f"💰 Stacking — Cost*: t={t_stack_cost:.2f}, Cost={stack_cost_b:,.2f}")
print(f"🎯 Enhanced — F1*: t={t_enh_f1:.2f}, F1={enh_f1_best:.4f}")
print(f"💰 Enhanced — Cost*: t={t_enh_cost:.2f}, Cost={enh_cost_b:,.2f}")

def summarize_metrics(y_true, y_proba, t, title):
    y_pred = (y_proba >= t).astype(int)
    return {
        'Model': title,
        'PR_AUC': average_precision_score(y_true, y_proba),
        'ROC_AUC': roc_auc_score(y_true, y_proba),
        'F1': f1_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, zero_division=0),
        'Recall': recall_score(y_true, y_pred, zero_division=0),
        'MCC': matthews_corrcoef(y_true, y_pred)
    }

comp_after = pd.DataFrame([
    summarize_metrics(y_test, enh_proba_cal,   t_enh_f1,   'Enhanced (Cal, t=F1*)'),
    summarize_metrics(y_test, enh_proba_cal,   t_enh_cost, 'Enhanced (Cal, t=Cost*)'),
    summarize_metrics(y_test, stack_proba_cal, t_stack_f1, 'Stacking (Cal, t=F1*)'),
    summarize_metrics(y_test, stack_proba_cal, t_stack_cost,'Stacking (Cal, t=Cost*)'),
])
display(comp_after.style.format({'PR_AUC':'{:.4f}','ROC_AUC':'{:.4f}','F1':'{:.4f}',
                                'Precision':'{:.4f}','Recall':'{:.4f}','MCC':'{:.4f}'})
        .set_caption('After Calibration + Optimal Thresholds'))


🎯 Stacking — F1*: t=0.13, F1=0.8592
💰 Stacking — Cost*: t=0.01, Cost=19,525.00
🎯 Enhanced — F1*: t=0.22, F1=0.8541
💰 Enhanced — Cost*: t=0.01, Cost=19,000.00


Unnamed: 0,Model,PR_AUC,ROC_AUC,F1,Precision,Recall,MCC
0,"Enhanced (Cal, t=F1*)",0.8239,0.9669,0.8541,0.9023,0.8108,0.8551
1,"Enhanced (Cal, t=Cost*)",0.8239,0.9669,0.7987,0.7576,0.8446,0.7995
2,"Stacking (Cal, t=F1*)",0.8267,0.9788,0.8592,0.8971,0.8243,0.8597
3,"Stacking (Cal, t=Cost*)",0.8267,0.9788,0.7184,0.625,0.8446,0.726


In [None]:
def temporal_splits(df_full, time_col='Time', k=5, train_ratio_start=0.6, step=0.08):
    df_sorted = df_full.sort_values(time_col)
    n = len(df_sorted); splits = []
    for i in range(k):
        train_end = int(n * (train_ratio_start + i*step))
        test_end  = min(int(n * (train_ratio_start + (i+1)*step)), n)
        if test_end > train_end:
            tr_idx = df_sorted.index[:train_end]
            te_idx = df_sorted.index[train_end:test_end]
            splits.append((tr_idx, te_idx))
    return splits

# Build enhanced-full DF (ensure anomaly features exist for all)
df_enh_all = X_all.copy()
if not any(c.startswith('anom_') for c in df_enh_all.columns):
    full_scores = det.scores(X_all.values)
    for k, v in full_scores.items(): df_enh_all[f'anom_{k}'] = v
df_enh_after = pd.concat([df_enh_all, y_all], axis=1)

def eval_temporal_pr_auc(df_full, model_builder):
    scores = []
    for tr_idx, te_idx in temporal_splits(df_full, 'Time'):
        Xtr, ytr = df_full.drop('Class', axis=1).loc[tr_idx], df_full['Class'].loc[tr_idx]
        Xte, yte = df_full.drop('Class', axis=1).loc[te_idx], df_full['Class'].loc[te_idx]
        Xtr_b, ytr_b = smote.fit_resample(Xtr, ytr)
        model = model_builder()
        model.fit(Xtr_b, ytr_b)
        p = model.predict_proba(Xte)[:,1]
        scores.append(average_precision_score(yte, p))
    return np.mean(scores), np.std(scores)

def build_enh_model():
    return ExtraTreesClassifier(n_estimators=300, max_features='sqrt', random_state=42, n_jobs=-1)

enh_mu2, enh_sigma2 = eval_temporal_pr_auc(df_enh_after, build_enh_model)
print(f"🕐 Temporal PR-AUC (Enhanced post-cal): {enh_mu2:.4f} ± {enh_sigma2:.4f}")

# Random CV reference
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rcv_scores = []
for tr, te in cv.split(X_all, y_all):
    Xtr, Xte = X_all.iloc[tr], X_all.iloc[te]
    ytr, yte = y_all.iloc[tr], y_all.iloc[te]
    Xtr_b, ytr_b = smote.fit_resample(Xtr, ytr)
    m = build_enh_model()
    m.fit(Xtr_b, ytr_b)
    rcv_scores.append(average_precision_score(yte, m.predict_proba(Xte)[:,1]))

print(f"🔄 Random CV PR-AUC (Enhanced): {np.mean(rcv_scores):.4f} ± {np.std(rcv_scores):.4f}")

🕐 Temporal PR-AUC (Enhanced post-cal): 0.7800 ± 0.0778


In [None]:
rows = []
def make_row(title, proba, t):
    pred = (proba >= t).astype(int)
    return {
        'Model': title,
        'PR_AUC': average_precision_score(y_test, proba),
        'ROC_AUC': roc_auc_score(y_test, proba),
        'F1': f1_score(y_test, pred),
        'Precision': precision_score(y_test, pred, zero_division=0),
        'Recall': recall_score(y_test, pred, zero_division=0),
        'MCC': matthews_corrcoef(y_test, pred)
    }

rows.append(make_row('Baseline ET (t=0.5)', base_proba, 0.5))
rows.append(make_row('Enhanced (raw, t=0.5)', enh_proba, 0.5))
rows.append(make_row('Stacking (raw, t=0.5)', stack_proba, 0.5))
rows.append(make_row('Enhanced (Cal, t=F1*)',  enh_proba_cal,  t_enh_f1))
rows.append(make_row('Stacking (Cal, t=F1*)',  stack_proba_cal, t_stack_f1))
rows.append(make_row('Enhanced (Cal, t=Cost*)',enh_proba_cal,  t_enh_cost))
rows.append(make_row('Stacking (Cal, t=Cost*)',stack_proba_cal, t_stack_cost))

final_df = pd.DataFrame(rows)
display(final_df.style.format({'PR_AUC':'{:.4f}','ROC_AUC':'{:.4f}','F1':'{:.4f}',
                              'Precision':'{:.4f}','Recall':'{:.4f}','MCC':'{:.4f}'})
        .set_caption('Final Comparison: Baseline vs Enhanced vs Stacking (Raw vs Calibrated & Optimized Thresholds)'))
