In [1]:
# ============================================
# 0. Imports and basic setup
# ============================================

import warnings

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SkPipeline
from imblearn.pipeline import Pipeline as ImbPipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, PowerTransformer

from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedRandomForestClassifier

from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    precision_score,
    recall_score,
)
# Silence every warning category for the rest of the session;
# delete this line if you want to see all warnings again.
warnings.simplefilter("ignore")

# One seed for NumPy's global generator and for every `random_state=` argument.
RANDOM_STATE = 42
np.random.seed(seed=RANDOM_STATE)

# ============================================
# 1. Synthetic fraud / mule dataset generation
# ============================================

# Draw the baseline (pre-fraud) feature arrays, one value per synthetic row.
# Every draw uses NumPy's global generator, so the statement order below
# determines the exact values produced for a given seed -- do not reorder.

# n rows in total; n_fraud of them (n * fraud_rate, truncated to int) get the fraud label.
n = 50_000
fraud_rate = 0.05
n_fraud = int(n * fraud_rate)

# ---- basic profile ----
# randint excludes the upper bound: ages fall in 18..74, tenure in 1..239.
ages = np.random.randint(18, 75, size=n)
# Cities are drawn uniformly (no `p` given).
cities = np.random.choice(
    ['Delhi', 'Mumbai', 'Bengaluru', 'Hyderabad', 'Kolkata',
     'Chennai', 'Pune', 'Ahmedabad', 'Jaipur', 'Other'],
    size=n
)
account_tenure = np.random.randint(1, 240, size=n)
avg_balance = np.random.uniform(500, 500_000, size=n)
# KYC type is drawn with explicit weights: 60% / 25% / 15%.
kyc_types = np.random.choice(['Full KYC', 'Minimum KYC', 'eKYC'],
                             size=n, p=[0.6, 0.25, 0.15])

# ---- base transactional patterns ----
# Gamma draws give right-skewed, strictly positive amounts.
total_inflow_24h = np.random.gamma(shape=2.0, scale=10_000, size=n)
total_outflow_24h = np.random.gamma(shape=2.0, scale=9_000, size=n)

# Poisson draws give non-negative integer counts.
count_inflow_24h = np.random.poisson(1.5, size=n)
count_outflow_24h = np.random.poisson(1.8, size=n)
count_unique_creditors_24h = np.random.poisson(1.0, size=n)

time_diff_first_inflow_to_outflow = np.random.exponential(scale=120, size=n)  # minutes
# Baseline rows are capped at 80 here.
percent_inflow_cashed_out_1h = np.random.uniform(0, 80, size=n)

velocity_inflow_1h = np.random.poisson(0.8, size=n)
velocity_outflow_1h = np.random.poisson(0.9, size=n)

# 0/1 flags; the second probability in `p` is the chance of a 1.
device_change = np.random.choice([0, 1], size=n, p=[0.96, 0.04])
new_payee = np.random.choice([0, 1], size=n, p=[0.9, 0.1])
intl_ip = np.random.choice([0, 1], size=n, p=[0.97, 0.03])

txn_amount = np.random.gamma(shape=2.0, scale=8_000, size=n)
# Hour of day as an integer in 0..23.
txn_hour = np.random.randint(0, 24, size=n)

# Merchant categories with explicit weights (they sum to 1.0).
merchant_categories = np.random.choice(
    ['Groceries', 'Electronics', 'Food', 'Travel', 'Utility',
     'Entertainment', 'Gaming', 'Crypto', 'Wallet', 'Other'],
    size=n,
    p=[0.20, 0.15, 0.15, 0.08, 0.10, 0.10, 0.06, 0.05, 0.06, 0.05]
)

# ---- target: is_fraud (mule) ----
# Label exactly n_fraud distinct rows (sampling without replacement) as 1.
y = np.zeros(n, dtype=int)
fraud_idx = np.random.choice(np.arange(n), size=n_fraud, replace=False)
y[fraud_idx] = 1

# ---- inject stronger fraud patterns for rows labelled 1 ----
# higher inflow/outflow, many creditors, quick cash-out, night time, riskier merchants
# Each statement rewrites the baseline values at fraud_idx in place; the draws
# again come from the global generator, so statement order affects the result.
# Scale 24h totals by a random factor between 2 and 5.
total_inflow_24h[fraud_idx] *= np.random.uniform(2, 5, size=n_fraud)
total_outflow_24h[fraud_idx] *= np.random.uniform(2, 5, size=n_fraud)

# Add extra Poisson-distributed events on top of the baseline counts.
count_inflow_24h[fraud_idx] += np.random.poisson(2, size=n_fraud)
count_outflow_24h[fraud_idx] += np.random.poisson(3, size=n_fraud)
count_unique_creditors_24h[fraud_idx] += np.random.poisson(2, size=n_fraud)

# Shrink the inflow-to-outflow delay to 1%-30% of its baseline value.
time_diff_first_inflow_to_outflow[fraud_idx] *= np.random.uniform(0.01, 0.3, size=n_fraud)
# Overwrite (not scale): fraud rows cash out between 70 and 100.
percent_inflow_cashed_out_1h[fraud_idx] = np.random.uniform(70, 100, size=n_fraud)

velocity_inflow_1h[fraud_idx] += np.random.poisson(2, size=n_fraud)
velocity_outflow_1h[fraud_idx] += np.random.poisson(3, size=n_fraud)

# Flags are redrawn with a 70% chance of 1; new_payee is set to 1 for every fraud row.
device_change[fraud_idx] = np.random.choice([0, 1], size=n_fraud, p=[0.3, 0.7])
new_payee[fraud_idx] = 1
intl_ip[fraud_idx] = np.random.choice([0, 1], size=n_fraud, p=[0.3, 0.7])

txn_amount[fraud_idx] *= np.random.uniform(2, 4, size=n_fraud)
txn_hour[fraud_idx] = np.random.choice([0, 1, 2, 3, 4], size=n_fraud)  # night hours

# Fraud rows are restricted to five merchant categories with their own weights.
merchant_categories[fraud_idx] = np.random.choice(
    ['Crypto', 'Wallet', 'Gaming', 'Travel', 'Electronics'],
    size=n_fraud,
    p=[0.3, 0.3, 0.15, 0.15, 0.10]
)

# ---- build DataFrame ----
# Assemble the generated arrays into one frame: 20 feature columns plus the label.
df = pd.DataFrame({
    "age": ages,
    "city": cities,
    "account_tenure_months": account_tenure,
    "avg_monthly_balance": avg_balance,
    "kyc_type": kyc_types,
    "total_inflow_24hr": total_inflow_24h,
    "count_inflow_24hr": count_inflow_24h,
    "count_unique_creditors_24hr": count_unique_creditors_24h,
    "total_outflow_24hr": total_outflow_24h,
    "count_outflow_24hr": count_outflow_24h,
    "time_diff_first_inflow_to_outflow": time_diff_first_inflow_to_outflow,
    "percent_inflow_cashed_out_1hr": percent_inflow_cashed_out_1h,
    "velocity_inflow_1hr": velocity_inflow_1h,
    "velocity_outflow_1hr": velocity_outflow_1h,
    "device_change_last_48hr": device_change,
    "new_payee_added_last_7d": new_payee,
    "international_ip_flag": intl_ip,
    "txn_amount": txn_amount,
    "txn_hour": txn_hour,
    "merchant_category": merchant_categories,
    "is_fraud": y,
})

# add some missing values to mimic real data
# Each feature column gets its own independent mask; the label is never blanked.
# Series.mask returns a NEW column that is up-cast as needed (int64 -> float64),
# instead of writing NaN into an integer column in place. The in-place form is a
# deprecated silent upcast in pandas (PDEP-6) and is slated to become an error;
# the notebook's blanket warning filter was hiding that deprecation.
missing_rate = 0.05   # 5% missing
for col in df.columns.drop("is_fraud"):
    mask = np.random.rand(n) < missing_rate
    df[col] = df[col].mask(mask)

print(df.head())
print("Event rate (overall):", df["is_fraud"].mean())

# ============================================
# 2. Train-test split (stratified)
# ============================================

# Separate the label from the predictors.
target_col = "is_fraud"
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows; stratifying on y keeps the event rate
# the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

for split_name, split_labels in (("Train", y_train), ("Test", y_test)):
    print(f"{split_name} event rate:", split_labels.mean())

# ============================================
# 3. Preprocessing: ColumnTransformer
# ============================================

# ---- column groups: one list per preprocessing branch ----
numeric_features = [
    # customer profile
    "age", "account_tenure_months", "avg_monthly_balance",
    # 24-hour flow totals and counts
    "total_inflow_24hr", "count_inflow_24hr", "count_unique_creditors_24hr",
    "total_outflow_24hr", "count_outflow_24hr",
    # cash-out timing and 1-hour velocity
    "time_diff_first_inflow_to_outflow", "percent_inflow_cashed_out_1hr",
    "velocity_inflow_1hr", "velocity_outflow_1hr",
    # the transaction itself
    "txn_amount", "txn_hour",
]
nominal_cats = ["city", "merchant_category"]
ordinal_cats = ["kyc_type"]
# Category order handed to the ordinal encoder: index 0, 1, 2 in this sequence.
kyc_order = [["eKYC", "Minimum KYC", "Full KYC"]]
binary_int_features = [
    "device_change_last_48hr",
    "new_payee_added_last_7d",
    "international_ip_flag",
]


def _mode_imputer():
    """Return a fresh imputer that fills gaps with each column's most frequent value."""
    return SimpleImputer(strategy="most_frequent")


# ---- one small pipeline per column group ----
# numeric: median-impute, then power-transform and standardize
num_pipeline = SkPipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("power", PowerTransformer(standardize=True)),
])

# nominal: mode-impute, then dense one-hot; unseen categories encode as all zeros
nom_pipeline = SkPipeline([
    ("imputer", _mode_imputer()),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# ordinal: mode-impute, then integer codes following kyc_order; unseen values become -1
ord_pipeline = SkPipeline([
    ("imputer", _mode_imputer()),
    ("ord", OrdinalEncoder(
        categories=kyc_order,
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )),
])

# binary flags: mode-impute only
bin_pipeline = SkPipeline([("imputer", _mode_imputer())])

# ---- route each column group to its pipeline; unlisted columns are dropped ----
column_branches = [
    ("num", num_pipeline, numeric_features),
    ("nom", nom_pipeline, nominal_cats),
    ("ord", ord_pipeline, ordinal_cats),
    ("bin", bin_pipeline, binary_int_features),
]
preprocess = ColumnTransformer(column_branches, remainder="drop")

# ============================================
# 4. Final pipeline with SMOTE + BalancedRandomForest
# ============================================

# The imblearn Pipeline is required here because it accepts a resampling
# step (SMOTE) between the preprocessing and the classifier.
model_steps = [
    ("preprocess", preprocess),
    ("smote", SMOTE(random_state=RANDOM_STATE)),
    ("clf", BalancedRandomForestClassifier(random_state=RANDOM_STATE)),
]
pipe = ImbPipeline(model_steps)

# ============================================
# 5. GridSearchCV (optimize recall)
# ============================================

# Search space, keyed as "<step name>__<parameter>".
smote_grid = {
    "smote__sampling_strategy": [0.5, 0.8, 1.0],
    "smote__k_neighbors": [3, 5],
}
forest_grid = {
    "clf__n_estimators": [100, 200],
    "clf__max_depth": [5, None],
    "clf__max_features": ["sqrt", "log2"],
}
param_grid = {**smote_grid, **forest_grid}

# Exhaustive 3-fold search scored on recall:
# we care about catching as many frauds as possible.
gscv = GridSearchCV(
    pipe,
    param_grid,
    scoring="recall",
    cv=3,
    n_jobs=-1,
    verbose=2,
)
gscv.fit(X_train, y_train)

print("Best params:", gscv.best_params_)
print("Best CV recall:", gscv.best_score_)

# Pipeline refitted on the full training set with the winning parameters.
best_model = gscv.best_estimator_

# ============================================
# 6. Evaluate best model at default threshold 0.5
# ============================================

# Column 1 of predict_proba holds the score for class 1 (fraud).
y_proba = best_model.predict_proba(X_test)[:, 1]
y_pred_default = best_model.predict(X_test)

default_report = classification_report(
    y_test, y_pred_default, digits=4, zero_division=0
)
print("\n=== Test performance at default threshold 0.5 ===")
print(default_report)
print("ROC AUC:", roc_auc_score(y_test, y_proba))

# ============================================
# 7. Threshold tuning: maximize recall with a minimum precision
# ============================================

# Tune the decision threshold on out-of-fold predictions for the TRAINING data.
# Choosing the threshold on the test set and then reporting test metrics at that
# threshold would make the "tuned" test numbers optimistically biased, so the
# test set is only used for the final report at the bottom of this section.
# cross_val_predict clones best_model, so the fitted model itself is untouched.
y_proba_oof = cross_val_predict(
    best_model, X_train, y_train,
    cv=3, method="predict_proba", n_jobs=-1
)[:, 1]

thresholds = np.arange(0.01, 0.91, 0.01)
recalls = []
precisions = []

for t in thresholds:
    y_pred_t = (y_proba_oof >= t).astype(int)
    recalls.append(recall_score(y_train, y_pred_t))
    precisions.append(precision_score(y_train, y_pred_t, zero_division=0))

recalls = np.array(recalls)
precisions = np.array(precisions)

# business rule: choose threshold with max recall, but precision >= 0.05
# NOTE: 0.05 equals the simulated fraud rate, so flagging every row already
# meets this floor; raise min_precision to make the constraint meaningful.
min_precision = 0.05
valid = precisions >= min_precision

# fallback: if no threshold meets the floor, consider all of them
if valid.any():
    candidates = np.flatnonzero(valid)
else:
    candidates = np.arange(len(thresholds))

# Several thresholds usually share the same (maximal) recall. A plain argmax
# would return the lowest of them, i.e. the one with the worst precision,
# so break ties by picking the highest precision instead.
top_recall = recalls[candidates].max()
tied = candidates[recalls[candidates] == top_recall]
idx = tied[np.argmax(precisions[tied])]

best_threshold = thresholds[idx]
best_recall = recalls[idx]
best_precision = precisions[idx]

print("\nChosen threshold based on recall-max with precision >= {:.2f}: {:.2f}".format(
    min_precision, best_threshold))
print("Recall at chosen threshold (out-of-fold, train):", best_recall)
print("Precision at chosen threshold (out-of-fold, train):", best_precision)

# final predictions with tuned threshold, applied once to the untouched test set
y_pred_tuned = (y_proba >= best_threshold).astype(int)

print("\n=== Test performance at tuned threshold ===")
print(classification_report(y_test, y_pred_tuned, digits=4, zero_division=0))
print("ROC AUC (unchanged):", roc_auc_score(y_test, y_proba))


    age       city  account_tenure_months  avg_monthly_balance     kyc_type  \
0  56.0  Hyderabad                  113.0        388321.944668  Minimum KYC   
1   NaN      Delhi                   12.0        246884.328770  Minimum KYC   
2  46.0    Chennai                  108.0        362940.083765     Full KYC   
3  32.0  Hyderabad                  161.0        283517.916350     Full KYC   
4  60.0  Bengaluru                  226.0        356497.736977  Minimum KYC   

   total_inflow_24hr  count_inflow_24hr  count_unique_creditors_24hr  \
0       34309.708317                1.0                          4.0   
1       27178.216145                4.0                          0.0   
2       36727.285662                2.0                          2.0   
3       27645.079123                3.0                          2.0   
4       40563.375725                2.0                          1.0   

   total_outflow_24hr  count_outflow_24hr  ...  percent_inflow_cashed_out_1hr  \
0       120

In [3]:
best_model = gscv.best_estimator_

# Pull the already-fitted preprocessing and classifier steps out of the tuned pipeline.
# These are the same objects that best_model holds (references, not copies).
preprocess_only = best_model.named_steps["preprocess"]
clf_only = best_model.named_steps["clf"]

# Build final production pipeline (NO SMOTE).
# SMOTE is a resampler: it only acts while fitting and is skipped at predict time,
# so leaving it out does not change any prediction.
final_pipe = SkPipeline(
    steps=[
        ("preprocess", preprocess_only),
        ("clf", clf_only),
    ]
)

# Deliberately NOT calling final_pipe.fit(...): refitting without SMOTE would
#   * throw away the model that the grid search selected and that was evaluated above
#     (its hyper-parameters were tuned WITH resampling), and
#   * overwrite the fitted steps inside best_model, because the objects are shared.
# Sanity check: the slimmed-down pipeline must score exactly like the tuned one.
assert np.allclose(
    final_pipe.predict_proba(X_test)[:, 1],
    best_model.predict_proba(X_test)[:, 1],
), "final_pipe diverged from best_model"

final_pipe


In [5]:
import joblib
# Serialize the pipeline to disk; the call returns the list of file names written.
model_path = "mule_fraud_model.pkl"
joblib.dump(final_pipe, model_path)

['mule_fraud_model.pkl']

In [7]:
import joblib
import pandas as pd

# NOTE: joblib.load unpickles arbitrary Python objects -- only load files that
# you created yourself or otherwise trust.
model = joblib.load("mule_fraud_model.pkl")

# Print the model's feature names, trying the pipeline layout first.
# Only the "this object does not have that" errors are caught, so real problems
# (and Ctrl-C) are no longer swallowed by a bare `except:`.
try:
    # For sklearn pipelines
    print(model.named_steps["preprocess"].get_feature_names_out())
except (AttributeError, KeyError, TypeError, ValueError):
    # For models trained without pipelines
    try:
        print(model.feature_names_in_)
    except AttributeError:
        print("Could not extract feature list. Tell ChatGPT your model type.")


['num__age' 'num__account_tenure_months' 'num__avg_monthly_balance'
 'num__total_inflow_24hr' 'num__count_inflow_24hr'
 'num__count_unique_creditors_24hr' 'num__total_outflow_24hr'
 'num__count_outflow_24hr' 'num__time_diff_first_inflow_to_outflow'
 'num__percent_inflow_cashed_out_1hr' 'num__velocity_inflow_1hr'
 'num__velocity_outflow_1hr' 'num__txn_amount' 'num__txn_hour'
 'nom__city_Ahmedabad' 'nom__city_Bengaluru' 'nom__city_Chennai'
 'nom__city_Delhi' 'nom__city_Hyderabad' 'nom__city_Jaipur'
 'nom__city_Kolkata' 'nom__city_Mumbai' 'nom__city_Other' 'nom__city_Pune'
 'nom__merchant_category_Crypto' 'nom__merchant_category_Electronics'
 'nom__merchant_category_Entertainment' 'nom__merchant_category_Food'
 'nom__merchant_category_Gaming' 'nom__merchant_category_Groceries'
 'nom__merchant_category_Other' 'nom__merchant_category_Travel'
 'nom__merchant_category_Utility' 'nom__merchant_category_Wallet'
 'ord__kyc_type' 'bin__device_change_last_48hr'
 'bin__new_payee_added_last_7d' 'bin_

In [9]:
# Number of raw predictor columns in X (the label column was dropped when X was built).
X.shape[1]

20