In [None]:
# Mount Google Drive into the Colab filesystem; the dataset CSVs read later in
# this notebook live under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install catboost



# 1. Import Libraries

In [None]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
import numpy as np
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

# 2. Dataset

In [None]:
# Both CSV files sit in the same Drive folder, so the folder is named once.
DATA_DIR = "/content/drive/MyDrive/Mercor Fraud Detection/mercor-fraud-detection"

df_train = pd.read_csv(f"{DATA_DIR}/Graph_train.csv")
df_test = pd.read_csv(f"{DATA_DIR}/Graph_test.csv")

In [None]:
# Keep only the rows that carry a fraud label; rows with a missing `is_fraud`
# value are discarded before the feature/label split.
has_label = df_train["is_fraud"].notna()
df_train = df_train[has_label]

In [None]:
# Columns that are identifiers or targets rather than model inputs.
train_non_features = ["user_hash", "is_fraud", "high_conf_nf"]
test_non_features = ["user_hash"]

y_train = df_train["is_fraud"]
X_train = df_train.drop(columns=train_non_features)

X_test = df_test.drop(columns=test_non_features)

In [None]:
# Freeze the training column order and force the test frame into the same
# order, so both matrices present features to the models identically.
feature_order = list(X_train.columns)

X_train = X_train.loc[:, feature_order]
X_test = X_test.loc[:, feature_order]

In [None]:
# Feature standardiser instance.
# NOTE(review): it is never fitted or applied in the cells visible here —
# confirm whether it is still needed.
scaler = StandardScaler()

In [None]:
# Display the feature-matrix and label shapes together as a quick sanity check
# that both have the same number of rows.
X_train.shape, y_train.shape

((112966, 93), (112966,))

# 3. Stacking

## 3.1 LightGBM, XGBoost, CatBoost

In [None]:
# Hyper-parameters for the three base learners of the stack. All three use
# 800 boosting rounds and a learning rate of 0.02.

# LightGBM
lgb_params_cpu = {
    "n_estimators": 800,
    "learning_rate": 0.02,
    "num_leaves": 64,
    # Silence LightGBM's [Info]/[Warning] lines printed on every fit, matching
    # the quiet settings already used for XGBoost and CatBoost below.
    "verbosity": -1,
    "n_jobs": -1
}

# XGBoost
xgb_params_cpu = {
    "n_estimators": 800,
    "learning_rate": 0.02,
    "max_depth": 7,
    "verbosity": 0,
    "n_jobs": -1
}

# CatBoost
cat_params_cpu = {
    "iterations": 800,
    "learning_rate": 0.02,
    "depth": 7,
    "verbose": 0
}

In [None]:
# (name, hyper-parameter dict) pairs for the base learners, in the order their
# predictions will occupy the columns of the stacking matrices.
base_models = [
    ("lgbm", lgb_params_cpu),
    ("xgb", xgb_params_cpu),
    ("catboost", cat_params_cpu),
]

In [None]:
# Number of stratified folds shared by every base learner.
NFOLDS = 5
skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

# One column per base learner: out-of-fold predictions for the training rows
# and fold-averaged predictions for the test rows.
n_base_models = len(base_models)
oof_mat = np.zeros((X_train.shape[0], n_base_models))
test_mat = np.zeros((X_test.shape[0], n_base_models))

In [None]:
# Map each base-learner name used in `base_models` to its estimator class.
model_classes = {
    "lgbm": LGBMClassifier,
    "xgb": XGBClassifier,
    "catboost": CatBoostClassifier,
}

for i, (name, params) in enumerate(base_models):
    print(f"\n=== Training {name} ===")

    if name not in model_classes:
        raise ValueError(f"Unknown model: {name}")

    # Test predictions are accumulated with `+=` below, so clear this model's
    # column first; otherwise re-running the cell adds onto the previous run.
    test_mat[:, i] = 0.0

    # K-Fold
    for fold, (tr_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        X_tr, y_tr = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
        X_val_fold = X_train.iloc[val_idx]

        # Fresh estimator per fold so no fitted state carries over between folds.
        model = model_classes[name](**params)

        # Deliberately no eval_set. No early stopping is configured, so for
        # LightGBM/XGBoost it only costs time, and CatBoost enables
        # use_best_model by default when an eval_set is given, which would
        # pick the tree count on the very fold that is scored next and leak
        # the held-out labels into the out-of-fold predictions.
        model.fit(X_tr, y_tr)

        # Predict OOF: each training row is scored by the model that never saw it.
        oof_mat[val_idx, i] = model.predict_proba(X_val_fold)[:, 1]

        # Predict test: average the fold models' probabilities.
        test_mat[:, i] += model.predict_proba(X_test)[:, 1] / NFOLDS

        print(f"  Fold {fold+1} done.")

    # Compute OOF AUC over all training rows for this base learner.
    auc = roc_auc_score(y_train, oof_mat[:, i])
    print(f"  OOF AUC [{name}]: {auc:.5f}")


=== Training lgbm ===
[LightGBM] [Info] Number of positive: 27544, number of negative: 62828
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.293288 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12836
[LightGBM] [Info] Number of data points in the train set: 90372, number of used features: 93
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.304785 -> initscore=-0.824616
[LightGBM] [Info] Start training from score -0.824616
  Fold 1 done.
[LightGBM] [Info] Number of positive: 27545, number of negative: 62828
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042939 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12717
[LightGBM] [Info] Number of data points in the train set: 90373, number

## 3.2 Meta Model

In [None]:
# Hyper-parameters for the XGBoost meta-learner that is trained on the base
# learners' out-of-fold predictions.
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases —
# confirm whether it is still required for the version in use.
meta_params = dict(
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="auc",
    use_label_encoder=False,
    verbosity=0,
)

In [None]:
# Stratified 5-fold splitter (shuffled, seed 42) used to produce out-of-fold
# predictions from the meta-model.
meta_skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Out-of-fold meta predictions: for each split, fit a meta-learner on the other
# folds' base predictions and score the held-out rows.
meta_oof = np.zeros(len(y_train))
for fit_rows, holdout_rows in meta_skf.split(oof_mat, y_train):
    fold_meta = XGBClassifier(**meta_params)
    fold_meta.fit(oof_mat[fit_rows], y_train.iloc[fit_rows])
    holdout_proba = fold_meta.predict_proba(oof_mat[holdout_rows])
    meta_oof[holdout_rows] = holdout_proba[:, 1]

# Final meta-learner: fit on every training row's base predictions, then score
# the fold-averaged base predictions for the test set.
meta_model_full = XGBClassifier(**meta_params)
meta_model_full.fit(oof_mat, y_train)
meta_test = meta_model_full.predict_proba(test_mat)[:, 1]

In [None]:
# Blend for the test set: 60% meta-model probability, 40% plain average of the
# base-model probabilities.
meta_weight, base_weight = 0.6, 0.4

mean_base_test = test_mat.mean(axis=1)
final_prediction = meta_weight * meta_test + base_weight * mean_base_test

In [None]:
# Same 60/40 blend applied to the out-of-fold predictions, giving a
# training-set score that is comparable to the test-set blend.
meta_weight, base_weight = 0.6, 0.4

mean_base_oof = oof_mat.mean(axis=1)
oof_final_prediction = meta_weight * meta_oof + base_weight * mean_base_oof

# 4. Evaluation

In [None]:
def compute_total_cost(y_true, p_pred, t1, t2,
                       fn_cost=600, tp_manual_cost=5,
                       fp_manual_cost=150, fp_block_cost=300):
    """Total cost of a three-way decision policy driven by two thresholds.

    Scores below ``t1`` are auto-passed, scores in ``[t1, t2)`` go to manual
    handling, and scores at or above ``t2`` are auto-blocked.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; 1 marks fraud and 0 marks non-fraud.
    p_pred : array-like
        Predicted scores, one per entry of ``y_true``.
    t1, t2 : float
        Lower and upper decision thresholds.
    fn_cost : number, default 600
        Cost of each fraud case that is auto-passed.
    tp_manual_cost : number, default 5
        Cost of each fraud case routed to the manual band.
    fp_manual_cost : number, default 150
        Cost of each non-fraud case routed to the manual band.
    fp_block_cost : number, default 300
        Cost of each non-fraud case that is auto-blocked.

    Returns
    -------
    number
        Sum of the four cost components. Auto-blocked fraud and auto-passed
        non-fraud contribute nothing.
    """
    # Coerce up front so lists, tuples and pandas objects all behave the same.
    y_true = np.asarray(y_true)
    p_pred = np.asarray(p_pred)

    is_fraud = y_true == 1
    is_legit = y_true == 0

    auto_pass = p_pred < t1
    manual = (p_pred >= t1) & (p_pred < t2)
    auto_block = p_pred >= t2

    cost = 0
    cost += np.sum(is_fraud & auto_pass) * fn_cost
    cost += np.sum(is_fraud & manual) * tp_manual_cost
    cost += np.sum(is_legit & manual) * fp_manual_cost
    cost += np.sum(is_legit & auto_block) * fp_block_cost

    return cost

In [None]:
def search_best_thresholds(y_true, p_pred, t1_grid=np.linspace(0.01, 0.5, 50), t2_grid=np.linspace(0.5, 0.99, 50)):
    """Exhaustively search threshold pairs for the lowest total cost.

    Every ``(t1, t2)`` combination from the two grids with ``t1 < t2`` is
    scored with ``compute_total_cost``; the first pair reaching the lowest
    cost is kept.

    Returns
    -------
    tuple
        ``(best_cost, best_t1, best_t2)``. If no pair is evaluated the result
        is ``(inf, None, None)``.
    """
    best = (np.inf, None, None)
    for lower in t1_grid:
        for upper in t2_grid:
            # Only pairs with the lower threshold strictly below the upper one are scored.
            if lower >= upper:
                continue
            candidate = compute_total_cost(y_true, p_pred, lower, upper)
            # Strict comparison keeps the earliest pair on ties.
            if candidate < best[0]:
                best = (candidate, lower, upper)
    return best

In [None]:
def decision_counts(y_true, p_pred, t1, t2):
    """Count the rows that fall into each costed decision bucket.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; 1 marks fraud and 0 marks non-fraud.
    p_pred : array-like
        Predicted scores, one per entry of ``y_true``.
    t1, t2 : float
        Lower and upper decision thresholds.

    Returns
    -------
    dict
        ``FN``: fraud with score below ``t1``;
        ``TP_manual``: fraud with score in ``[t1, t2)``;
        ``FP_manual``: non-fraud with score in ``[t1, t2)``;
        ``FP_autoblock``: non-fraud with score at or above ``t2``.
    """
    # Coerce like compute_total_cost does, so plain lists work here too
    # instead of failing on the element-wise comparisons.
    y_true = np.asarray(y_true)
    p_pred = np.asarray(p_pred)

    is_fraud = y_true == 1
    is_legit = y_true == 0
    manual = (p_pred >= t1) & (p_pred < t2)

    return {
        "FN": np.sum(is_fraud & (p_pred < t1)),
        "TP_manual": np.sum(is_fraud & manual),
        "FP_manual": np.sum(is_legit & manual),
        "FP_autoblock": np.sum(is_legit & (p_pred >= t2)),
    }

In [None]:
# Grid-search the two decision thresholds on the blended out-of-fold scores;
# keeps the lowest total cost found and the (t1, t2) pair that produced it.
best_cost, t1, t2 = search_best_thresholds(y_train, oof_final_prediction)

In [None]:
# Show how many labelled rows land in each costed bucket at the selected thresholds.
decision_counts(y_train, oof_final_prediction, t1, t2)

{'FN': np.int64(5632),
 'TP_manual': np.int64(20353),
 'FP_manual': np.int64(25849),
 'FP_autoblock': np.int64(55)}

In [None]:
# Cross-check the searched cost against the per-bucket counts. The counts are
# recomputed here instead of being typed in from an earlier output, so the
# check stays valid whenever the models or thresholds change.
counts = decision_counts(y_train, oof_final_prediction, t1, t2)
recomputed_cost = (
    counts["FN"] * 600
    + counts["TP_manual"] * 5
    + counts["FP_manual"] * 150
    + counts["FP_autoblock"] * 300
)
assert recomputed_cost == best_cost, "decision counts disagree with compute_total_cost"
int(recomputed_cost)

7374815

# 5. Submission

In [None]:
# Pair every test user with its blended prediction and write the submission
# file without the index column.
submission_columns = {
    "user_hash": df_test["user_hash"].values,
    "prediction": final_prediction,
}
submission = pd.DataFrame(submission_columns)

submission.to_csv("submission.csv", index=False)