## 0) Configuration

In [None]:
# --- Input data and schema -------------------------------------------------
CSV_PATH = 'Fraud.csv'
TARGET_COL = 'isFraud'
TIME_COL = 'step'
# Identifier columns; these are dropped before building the feature matrix.
ID_COLS = ['nameOrig', 'nameDest']

# --- Reproducibility and hold-out size --------------------------------------
RANDOM_STATE = 42
TEST_SIZE = 0.2

# --- Cost-sensitive thresholding --------------------------------------------
# Candidate decision thresholds: 0.01, 0.02, ..., 0.99.
THRESH_GRID = [pct / 100 for pct in range(1, 100)]
# Cost charged per false positive / false negative in the threshold search.
FP_COST = 5
FN_COST = 500


## 1) Setup

In [None]:
import warnings, math, os, numpy as np, pandas as pd, matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve, precision_recall_curve, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.utils import class_weight
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LinearRegression
# Silence every warning for the rest of the session.
# NOTE(review): this also hides solver-convergence and deprecation warnings.
warnings.filterwarnings('ignore')
# Print NumPy floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)
# Allow up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

# Optional dependencies: each import is attempted once and the outcome is
# recorded in a *_OK flag that later cells check before using the package.
# Any exception during import (not only ImportError) marks it unavailable.
try:
    import shap
    SHAP_OK = True
except Exception:
    SHAP_OK = False
try:
    from lightgbm import LGBMClassifier
    LGB_OK = True
except Exception:
    LGB_OK = False
try:
    from xgboost import XGBClassifier
    XGB_OK = True
except Exception:
    XGB_OK = False


## 2) Load Data

In [None]:
def _read_transactions(path):
    """Read the CSV at ``path``, preferring the pyarrow engine.

    If the pyarrow read raises for any reason (for example pyarrow is not
    installed), the file is read again with pandas' default engine; an error
    from that second read propagates to the caller.
    """
    try:
        return pd.read_csv(path, engine='pyarrow')
    except Exception:
        return pd.read_csv(path)


df = _read_transactions(CSV_PATH)
# Keep the dimensions and a three-row preview for display.
df_shape, df_head = df.shape, df.head(3)
df_shape, df_head

## 3) Quick EDA & Leakage Checks
Summarizes missing‑value rates, class balance, and duplicated rows, and lists columns whose names suggest possible target leakage.

In [None]:
# Share of positive labels (mean of the target column).
class_ratio = df[TARGET_COL].mean()
# Fraction of missing values per column, worst first.
na_rates = df.isna().mean().sort_values(ascending=False)
# Number of rows that exactly duplicate an earlier row.
dup_rows = df.duplicated().sum()

# Non-target columns whose name contains "fraud" or "flag" (case-insensitive);
# these are only listed here for manual inspection, nothing is dropped.
leak_like = [
    name for name in df.columns
    if name != TARGET_COL and any(token in name.lower() for token in ('fraud', 'flag'))
]

overview = pd.Series({
    'rows': df.shape[0],
    'cols': df.shape[1],
    'dup_rows': dup_rows,
    'positive_rate': class_ratio,
})
display(overview)
display(na_rates.head(15).to_frame('na_rate'))
leak_like

## 4) Feature Engineering
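Build dense numeric features: deltas, ratios, error terms. Also derive `isMerchantDest` using the merchant naming convention (destination names starting with `M`). `type` is kept as a categorical column here; it is one‑hot encoded later by the preprocessor.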

In [None]:
def _feature_or_nan(required, compute):
    """Return ``compute()`` when all ``required`` columns exist in ``df``, else NaN."""
    return compute() if set(required).issubset(df.columns) else np.nan


# 1 when the destination name starts with "M", else 0 (0 everywhere if the
# column is absent).
if 'nameDest' in df.columns:
    df['isMerchantDest'] = df['nameDest'].astype(str).str.startswith('M').astype(int)
else:
    df['isMerchantDest'] = 0

# Force the raw amount/balance columns to numeric; unparseable values become NaN.
num_cols = [
    name
    for name in ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
    if name in df.columns
]
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')

# Balance movement on each side of the transaction.
df['orig_delta'] = _feature_or_nan(
    ['oldbalanceOrg', 'newbalanceOrig'],
    lambda: df['oldbalanceOrg'] - df['newbalanceOrig'],
)
df['dest_delta'] = _feature_or_nan(
    ['oldbalanceDest', 'newbalanceDest'],
    lambda: df['newbalanceDest'] - df['oldbalanceDest'],
)
# Deviation of the reported new balance from "old balance -/+ amount".
df['orig_error'] = _feature_or_nan(
    ['newbalanceOrig', 'oldbalanceOrg', 'amount'],
    lambda: df['newbalanceOrig'] - (df['oldbalanceOrg'] - df['amount']),
)
df['dest_error'] = _feature_or_nan(
    ['newbalanceDest', 'oldbalanceDest', 'amount'],
    lambda: df['newbalanceDest'] - (df['oldbalanceDest'] + df['amount']),
)
# Amount relative to the originator's prior balance; a zero balance is turned
# into NaN before dividing, and NaN/inf results are then set to 0.
df['amount_over_oldOrg'] = _feature_or_nan(
    ['amount', 'oldbalanceOrg'],
    lambda: df['amount'] / df['oldbalanceOrg'].replace(0, np.nan),
)
df['amount_over_oldOrg'] = df['amount_over_oldOrg'].replace([np.inf, -np.inf], np.nan).fillna(0)

cat_cols = [name for name in ['type'] if name in df.columns]
drop_cols = [name for name in ID_COLS if name in df.columns]
# Every column except the identifiers and the target.
_excluded = set(drop_cols) | {TARGET_COL}
X_cols_base = [name for name in df.columns if name not in _excluded]
X_cols_base

## 5) Data Cleaning — Missing, Outliers, Multicollinearity

In [None]:
# Missing balances and balance-derived features are filled with the sentinel -1.
_sentinel_cols = [
    'oldbalanceDest', 'newbalanceDest', 'oldbalanceOrg', 'newbalanceOrig',
    'orig_delta', 'dest_delta', 'orig_error', 'dest_error',
]
for c in _sentinel_cols:
    if c in df.columns:
        df[c] = df[c].fillna(-1)

# Winsorize `amount` to its [0.1%, 99.5%] quantile range.
# NOTE(review): the quantiles come from the full frame (the train/validation
# split happens in a later cell), and the amount-derived features were computed
# before this clipping -- confirm both are intended.
if 'amount' in df.columns:
    q_low = df['amount'].quantile(0.001)
    q_hi = df['amount'].quantile(0.995)
    df['amount'] = df['amount'].clip(lower=q_low, upper=q_hi)
else:
    q_low = q_hi = None

# Variance inflation factors: regress each feature on the others and report
# 1 / (1 - R^2); the 1e-9 term guards against division by zero when R^2 == 1.
num_for_vif = [
    name
    for name in ['amount', 'orig_delta', 'dest_delta', 'orig_error', 'dest_error', 'amount_over_oldOrg']
    if name in df.columns
]
vif_df = []
if len(num_for_vif) >= 2:
    # Work on at most 500k sampled rows with inf/NaN replaced by 0.
    Xv = (
        df[num_for_vif]
        .replace([np.inf, -np.inf], 0)
        .fillna(0)
        .sample(min(500000, len(df)), random_state=RANDOM_STATE)
    )
    for col in num_for_vif:
        others = Xv.drop(columns=[col])
        target = Xv[col]
        r2 = LinearRegression().fit(others, target).score(others, target)
        vif_df.append((col, float(1 / (1 - r2 + 1e-9))))
vif_df = pd.DataFrame(vif_df, columns=['feature', 'vif']).sort_values('vif', ascending=False)
vif_df

## 6) Train/Validation Split
Time‑aware if `step` exists, else stratified random split.

In [None]:
if TIME_COL in df.columns:
    # Time-aware split: order rows by the time column and hold out the last
    # TEST_SIZE fraction of rows for validation.
    df = df.sort_values(TIME_COL)
    cut = int(len(df) * (1 - TEST_SIZE))
    train_df = df.iloc[:cut]
    val_df = df.iloc[cut:]
else:
    # No time column: stratified random split on the target.
    train_df, val_df = train_test_split(
        df,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=df[TARGET_COL],
    )

# Separate labels from features; identifiers are removed together with the target.
_non_features = [TARGET_COL] + drop_cols
y_train, y_val = train_df[TARGET_COL].values, val_df[TARGET_COL].values
X_train, X_val = train_df.drop(columns=_non_features), val_df.drop(columns=_non_features)

# Numeric columns are passed through untouched; categorical columns are
# one-hot encoded (categories unseen at fit time are ignored at transform time).
num_features = X_train.select_dtypes(include=['number']).columns.tolist()
cat_features = [name for name in cat_cols if name in X_train.columns]
_one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
pre = ColumnTransformer(
    [
        ('num', 'passthrough', num_features),
        ('cat', _one_hot, cat_features),
    ],
    remainder='drop',
)
X_train.shape, X_val.shape, len(num_features), len(cat_features)

## 7) Models
Logistic Regression (interpretable), Gradient Boosting (strong baseline), and optional LightGBM/XGBoost if available.

In [None]:
def _with_pre(clf):
    """Wrap ``clf`` in a pipeline behind the shared preprocessor ``pre``."""
    return Pipeline([('pre', pre), ('clf', clf)])


# Candidate models. Every pipeline holds the same `pre` object, which is
# refitted on X_train each time a pipeline is fitted.
models = [
    ('logreg', _with_pre(LogisticRegression(max_iter=200, class_weight='balanced', n_jobs=None, solver='lbfgs'))),
    ('hgb', _with_pre(HistGradientBoostingClassifier(max_depth=None, learning_rate=0.1, max_iter=200, random_state=RANDOM_STATE))),
]
if LGB_OK:
    models.append((
        'lgbm',
        _with_pre(LGBMClassifier(random_state=RANDOM_STATE, n_estimators=500, learning_rate=0.05, objective='binary')),
    ))
if XGB_OK:
    models.append((
        'xgb',
        _with_pre(XGBClassifier(random_state=RANDOM_STATE, n_estimators=500, learning_rate=0.05, subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0, objective='binary:logistic', eval_metric='auc')),
    ))

# Fit each candidate and score its validation probabilities.
# Each result row is (name, ROC-AUC, PR-AUC, validation probabilities, fitted pipeline).
results = []
for name, pipe in models:
    pipe.fit(X_train, y_train)
    p_val = pipe.predict_proba(X_val)[:, 1]
    roc = roc_auc_score(y_val, p_val)
    pr = average_precision_score(y_val, p_val)
    results.append((name, roc, pr, p_val, pipe))

# Leaderboard ordered by PR-AUC, then ROC-AUC, best first.
res_df = pd.DataFrame(
    [row[:3] for row in results],
    columns=['model', 'roc_auc', 'pr_auc'],
).sort_values(['pr_auc', 'roc_auc'], ascending=False)
res_df

## 8) Curves: ROC and Precision‑Recall (Validation)

In [None]:
# Pick the model with the highest PR-AUC, using ROC-AUC as the tie-breaker.
best = max(results, key=lambda row: (row[2], row[1]))
best_name, best_roc, best_pr, best_pval, best_pipe = best

# ROC curve with the chance diagonal for reference.
fpr, tpr, thr = roc_curve(y_val, best_pval)
plt.figure()
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC: ' + best_name)

# Precision-recall curve.
pr, rc, thr2 = precision_recall_curve(y_val, best_pval)
plt.figure()
plt.plot(rc, pr)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('PR: ' + best_name)
best_name, best_roc, best_pr

## 9) Cost‑Sensitive Threshold Optimization

In [None]:
def eval_threshold(y_true, p, t, fp_cost=None, fn_cost=None):
    """Score a decision threshold on binary labels and predicted probabilities.

    Rows with ``p >= t`` are predicted positive. The confusion counts are
    computed directly with NumPy, so inputs that contain a single class (for
    example no positives at all) still produce all four counts instead of
    failing to unpack a 1x1 confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Binary ground-truth labels; any non-zero value is treated as positive.
    p : array-like
        Predicted scores/probabilities, same length as ``y_true``.
    t : float
        Decision threshold.
    fp_cost, fn_cost : number, optional
        Cost per false positive / false negative. When omitted, the
        module-level ``FP_COST`` / ``FN_COST`` are used.

    Returns
    -------
    dict
        Keys ``t``, ``tp``, ``fp``, ``tn``, ``fn``, ``precision``, ``recall``,
        ``f1`` and ``cost``. Ratios fall back to a denominator of 1 when the
        true denominator is 0, so they are 0 rather than undefined.
    """
    fp_cost = FP_COST if fp_cost is None else fp_cost
    fn_cost = FN_COST if fn_cost is None else fn_cost

    actual = np.asarray(y_true).astype(bool)
    flagged = np.asarray(p) >= t

    tp = int(np.count_nonzero(flagged & actual))
    fp = int(np.count_nonzero(flagged & ~actual))
    fn = int(np.count_nonzero(~flagged & actual))
    tn = int(actual.size) - tp - fp - fn

    cost = fp * fp_cost + fn * fn_cost
    return {
        't': t,
        'tp': tp,
        'fp': fp,
        'tn': tn,
        'fn': fn,
        'precision': tp / max(1, tp + fp),
        'recall': tp / max(1, tp + fn),
        'f1': (2 * tp) / max(1, 2 * tp + fp + fn),
        'cost': cost,
    }
# Evaluate every candidate threshold on the validation predictions.
grid = [eval_threshold(y_val, best_pval, t) for t in THRESH_GRID]
# Rank by total cost. A stable sort keeps THRESH_GRID order among ties, so the
# chosen threshold is deterministic (the lowest threshold with minimal cost).
gdf = pd.DataFrame(grid).sort_values('cost', kind='mergesort')
star = gdf.iloc[0].to_dict()
t_star = float(star['t'])
# Plot in threshold order: `gdf` is ordered by cost, and drawing a line through
# the points in that order would zig-zag across the x-axis.
_by_threshold = gdf.sort_values('t')
plt.figure(); plt.plot(_by_threshold['t'], _by_threshold['cost']); plt.xlabel('Threshold'); plt.ylabel('Expected Cost'); plt.title('Cost vs Threshold')
star, t_star

## 10) Final Validation Metrics at Cost‑Optimal Threshold

In [None]:
# Hard predictions at the cost-optimal threshold.
yhat_star = (best_pval >= t_star).astype(int)
cm = confusion_matrix(y_val, yhat_star)
report = classification_report(y_val, yhat_star, output_dict=False)

# Confusion-matrix heat map with the count written in each cell
# (rows are actual classes, columns are predicted classes).
plt.figure()
plt.imshow(cm, interpolation='nearest')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
for (i, j), count in np.ndenumerate(cm):
    plt.text(j, i, count, ha='center', va='center')
report

## 11) Feature Importance & Explanations

In [None]:
_pre_fitted = best_pipe.named_steps['pre']
_clf_fitted = best_pipe.named_steps['clf']

# Names of the transformed features, in the order the preprocessor emits them:
# numeric passthrough columns first, then the one-hot encoded categories.
num_features = [c for c in X_train.columns if c in _pre_fitted.transformers_[0][2]]
cat_features = _pre_fitted.transformers_[1][2] if len(_pre_fitted.transformers_) > 1 else []
ohe = _pre_fitted.named_transformers_['cat'] if len(cat_features) > 0 else None
ohe_names = ohe.get_feature_names_out(cat_features).tolist() if ohe is not None else []
feature_names = num_features + ohe_names

# Preferred explanation: SHAP on the fitted classifier, using transformed
# validation rows. The rows are wrapped in a DataFrame so the plot is labelled
# with feature names instead of column positions.
shap_done = False
if SHAP_OK and hasattr(_clf_fitted, 'predict_proba'):
    try:
        X_shap = X_val.sample(min(10000, len(X_val)), random_state=RANDOM_STATE)
        Xs = pd.DataFrame(_pre_fitted.transform(X_shap), columns=feature_names)
        explainer = shap.Explainer(_clf_fitted)
        sv = explainer(Xs)
        shap.plots.beeswarm(sv, max_display=20)
        shap_done = True
    except Exception as e:
        # Best effort: report the failure and fall back below instead of
        # silently producing no explanation at all.
        print(f'SHAP explanation failed ({type(e).__name__}: {e}); using permutation importance instead.')

# Fallback: permutation importance of the whole pipeline. Used when SHAP is not
# installed or did not succeed for this model.
if not shap_done:
    # Sample row positions so features and labels stay aligned (y_val is a
    # positional array matching the rows of X_val).
    _n_perm = min(5000, len(X_val))
    _pos = np.random.RandomState(RANDOM_STATE).choice(len(X_val), size=_n_perm, replace=False)
    X_perm = X_val.iloc[_pos]
    y_perm = np.asarray(y_val)[_pos]
    # Score with average precision, the metric used to pick the best model;
    # the default (accuracy) barely moves when positives are rare.
    perm = permutation_importance(
        best_pipe, X_perm, y_perm,
        scoring='average_precision', n_repeats=3, random_state=RANDOM_STATE,
    )
    # The pipeline is permuted on its raw input columns, so importances line up
    # with X_perm.columns, not with the transformed `feature_names`.
    imp = pd.Series(perm.importances_mean, index=X_perm.columns).sort_values(ascending=False)[:20]
    plt.figure(); imp.plot(kind='bar'); plt.title('Permutation Importance (Top 20)'); plt.tight_layout()
    # A bare expression inside an `if` block is not rendered by the notebook.
    display(imp)

## 12) Variable Selection Strategy
1) Start with domain‑derived engineered features (deltas, errors, ratios). 2) Encode `type`. 3) Remove identifiers. 4) Winsorize extreme `amount`. 5) Examine VIF to avoid redundant numeric features. 6) Keep features that improve PR‑AUC and reduce cost on validation. 7) Prefer parsimonious sets if two variants tie on cost.

The next cell runs a compact, single‑pass forward‑style selection with a `HistGradientBoostingClassifier` on top of the same preprocessing, to check which additional features lower the minimum validation cost.

In [None]:
base_feats = [c for c in ['amount','isMerchantDest','amount_over_oldOrg'] if c in X_train.columns]
cand_feats = [c for c in X_train.columns if c not in base_feats]


def _min_validation_cost(cols):
    """Fit a HistGradientBoostingClassifier on ``cols`` and return the lowest
    total cost over ``THRESH_GRID`` on the validation set."""
    # `type` is one-hot encoded; object-dtype columns are left out of the
    # numeric passthrough and end up dropped by the transformer.
    num_f = [f for f in cols if f not in ['type'] and X_train[f].dtype != 'O']
    cat_f = ['type'] if 'type' in cols else []
    pre2 = ColumnTransformer([('num','passthrough',num_f), ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_f)], remainder='drop')
    pipe2 = Pipeline([('pre', pre2), ('clf', HistGradientBoostingClassifier(max_iter=200, random_state=RANDOM_STATE))])
    pipe2.fit(X_train[cols], y_train)
    p = pipe2.predict_proba(X_val[cols])[:,1]
    return min(eval_threshold(y_val, p, t)['cost'] for t in THRESH_GRID)


# Selected features are tracked in an ordered list so the column order given to
# the model is reproducible (set iteration order is not); `sel` mirrors it as a set.
_sel_order = list(base_feats)
sel = set(_sel_order)

# Cost of the base feature set alone. Without this reference the first
# candidate would be accepted unconditionally.
baseline_cost = _min_validation_cost(_sel_order) if _sel_order else None
best_cost = baseline_cost

# Single pass over the candidates: keep one only if adding it to the current
# selection strictly lowers the best validation cost seen so far.
history = []
for c in cand_feats:
    cmin = _min_validation_cost(_sel_order + [c])
    history.append((c, cmin))
    if best_cost is None or cmin < best_cost:
        best_cost = cmin
        _sel_order.append(c)
        sel.add(c)
sel, sorted(history, key=lambda x: x[1])[:10]


## 13) Export Artifacts

In [None]:
import joblib
MODEL_PATH = '/mnt/data/fraud_model.joblib'
# Make sure the target directory exists: joblib.dump does not create missing
# parent directories and would otherwise fail on a fresh machine.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
# Persist the fitted best pipeline (preprocessor + classifier).
# NOTE(review): the cost-optimal threshold `t_star` is not stored with the
# model; consumers of this artifact need it from elsewhere.
joblib.dump(best_pipe, MODEL_PATH)
MODEL_PATH

## 14) Actionable Insights — Key Drivers
- Large `amount` relative to `oldbalanceOrg`
- Mismatch in balance equations via `orig_error` and `dest_error`
- Destination merchant indicator `isMerchantDest`
- Transaction `type` patterns (e.g., TRANSFER → CASH_OUT)

These align with known fraud modus operandi where compromised accounts attempt rapid fund transfer and cash‑out, often to merchant‑like accounts.

Use the importance/SHAP view above to tailor the final narrative to your fitted model.

## 15) Prevention Recommendations
- Real‑time holds for high‑risk scores at threshold *t★*
- Velocity checks over `step` window: cumulative amount and count by `nameOrig` and `nameDest`
- Step‑up authentication for risky `type` flows and first‑time counterparties
- Dest account risk graph: prior fraud adjacency, merchant patterns, sudden degree spikes
- Adaptive limits: per‑user dynamic cap tied to behavioral baseline
- Periodic model recalibration and concept‑drift monitoring

## 16) How To Measure Impact
- Launch as shadow mode, then phased A/B rollout
- Primary KPIs: fraud loss, recovery rate, manual review load, customer friction rate
- Secondary KPIs: model precision at *t★*, recall, PR‑AUC; false‑positive appeal rate
- Use interleaved evaluation across time bands to avoid step‑based drift leakage
- Weekly drift dashboard on score distribution and population stability index