In [None]:
# =========================
# CELL 1 – Imports & paths
# =========================

import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score
)

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

import joblib

DATA_DIR = Path("../data")
MODEL_DIR = Path("../models")
MODEL_DIR.mkdir(exist_ok=True)
import os

for root, dirs, files in os.walk("/", topdown=True):
    for f in files:
        if "provider_level_dataset" in f.lower():
            print(os.path.join(root, f))



In [None]:
import os

for root, dirs, files in os.walk("/", topdown=True):
    for f in files:
        if "provider_level_dataset" in f.lower():
            print(os.path.join(root, f))


In [None]:
# ================================
# CELL 2 – Load provider-level data
# ================================

provider_df = pd.read_csv(DATA_DIR / "/content/provider_level_dataset.csv")
provider_df.head()


Unnamed: 0,InscClaimAmtReimbursed_sum,InscClaimAmtReimbursed_mean,InscClaimAmtReimbursed_max,ClaimDuration_days_mean,ClaimDuration_days_max,is_inpatient_sum,is_outpatient_sum,BeneID_nunique,ClaimID_nunique,total_claims,unique_beneficiaries,inpatient_share,outpatient_share,claims_per_beneficiary,is_fraud
0,104640,4185.6,42000,1.44,14,5,20,24,25,25,24,0.2,0.8,1.041667,0
1,605670,4588.409091,57000,3.674242,27,62,70,117,132,132,117,0.469697,0.530303,1.128205,1
2,52170,350.134228,3300,1.42953,20,0,149,138,149,149,138,0.0,1.0,1.07971,0
3,280910,241.124464,4080,1.088412,20,0,1165,495,1165,1165,495,0.0,1.0,2.353535,1
4,33710,468.194444,10000,0.958333,20,3,69,58,72,72,58,0.041667,0.958333,1.241379,0


In [None]:
import os

print("Current Directory:", os.getcwd())
print("Files here:", os.listdir())


Current Directory: /content
Files here: ['.config', 'sample_data']


In [None]:
# =======================================
# CELL 3 – Train / test split (provider)
# =======================================

# Target
y = provider_df["is_fraud"].astype(int)

# Features: drop target + any identifier columns
id_cols = [c for c in provider_df.columns if c.lower() in {"provider"}]
X = provider_df.drop(columns=["is_fraud"] + id_cols, errors="ignore")

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
y_train.value_counts(normalize=True), y_test.value_counts(normalize=True)


Train shape: (4328, 14) Test shape: (1082, 14)


(is_fraud
 0    0.906423
 1    0.093577
 Name: proportion, dtype: float64,
 is_fraud
 0    0.906654
 1    0.093346
 Name: proportion, dtype: float64)

In [None]:
# =========================================
# CELL 4 – Helper: evaluation on a split
# =========================================

def evaluate_model(clf, X_tr, y_tr, X_te, y_te, model_name="model"):
    """
    Returns a dictionary of core metrics plus prints them.
    """
    y_pred = clf.predict(X_te)
    y_proba = clf.predict_proba(X_te)[:, 1]

    metrics = {
        "model": model_name,
        "precision": precision_score(y_te, y_pred, zero_division=0),
        "recall": recall_score(y_te, y_pred, zero_division=0),
        "f1": f1_score(y_te, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y_te, y_proba),
        "pr_auc": average_precision_score(y_te, y_proba),
    }

    print(f"\n=== {model_name} ===")
    for k, v in metrics.items():
        if k != "model":
            print(f"{k}: {v:.4f}")
    return metrics


In [None]:
# =====================================
# CELL 5 – Define base pipelines & grids
# =====================================

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

class_ratio = y_train.value_counts()
pos_weight = class_ratio[0] / class_ratio[1]  # for XGBoost

models = {}

# Logistic Regression
log_reg_pipe = Pipeline(steps=[
    ("smote", SMOTE(random_state=42)),
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=2000, class_weight="balanced", n_jobs=-1))
])

log_reg_param_dist = {
    "clf__C": np.logspace(-2, 2, 10),
    "clf__penalty": ["l2"],
    "clf__solver": ["lbfgs", "liblinear"],
}

models["logistic_regression"] = (log_reg_pipe, log_reg_param_dist)

# Random Forest
rf_pipe = Pipeline(steps=[
    ("smote", SMOTE(random_state=42)),
    ("clf", RandomForestClassifier(
        class_weight="balanced",
        n_estimators=200,
        random_state=42,
        n_jobs=-1
    ))
])

rf_param_dist = {
    "clf__n_estimators": [200, 300, 500],
    "clf__max_depth": [None, 5, 10, 20],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 4],
    "clf__max_features": ["sqrt", "log2", 0.5],
}

models["random_forest"] = (rf_pipe, rf_param_dist)

# XGBoost
xgb_pipe = Pipeline(steps=[
    ("smote", SMOTE(random_state=42)),
    ("clf", XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        random_state=42,
        n_estimators=300,
        tree_method="hist",
        use_label_encoder=False,
        scale_pos_weight=pos_weight,
        n_jobs=-1
    ))
])

xgb_param_dist = {
    "clf__max_depth": [3, 4, 5, 6],
    "clf__learning_rate": [0.01, 0.05, 0.1, 0.2],
    "clf__subsample": [0.6, 0.8, 1.0],
    "clf__colsample_bytree": [0.6, 0.8, 1.0],
    "clf__gamma": [0, 0.5, 1],
    "clf__reg_lambda": [1, 5, 10],
}

models["xgboost"] = (xgb_pipe, xgb_param_dist)


In [None]:
# ============================================
# CELL 6 – Faster hyperparameter search
# ============================================

results = []
best_models = {}

for name, (pipe, param_dist) in models.items():
    print(f"\n\n######## Tuning {name} ########")

    # make Random Forest & XGBoost lighter
    if name in ["random_forest", "xgboost"]:
        n_iter = 8      # fewer combinations
        cv_local = 3    # fewer CV folds
    else:
        n_iter = 15
        cv_local = 5

    search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_dist,
        n_iter=n_iter,
        scoring="f1",
        n_jobs=-1,
        cv=cv_local,
        verbose=1,
        random_state=42,
    )
    search.fit(X_train, y_train)

    print(f"Best params for {name}:", search.best_params_)
    best_model = search.best_estimator_
    best_models[name] = best_model

    # Evaluate on hold-out test set
    metrics = evaluate_model(best_model, X_train, y_train, X_test, y_test, model_name=name)
    results.append(metrics)




######## Tuning logistic_regression ########
Fitting 5 folds for each of 15 candidates, totalling 75 fits




Best params for logistic_regression: {'clf__solver': 'liblinear', 'clf__penalty': 'l2', 'clf__C': np.float64(1.6681005372000592)}

=== logistic_regression ===
precision: 0.4537
recall: 0.9208
f1: 0.6078
roc_auc: 0.9621
pr_auc: 0.7757


######## Tuning random_forest ########
Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best params for random_forest: {'clf__n_estimators': 300, 'clf__min_samples_split': 10, 'clf__min_samples_leaf': 1, 'clf__max_features': 'log2', 'clf__max_depth': 10}

=== random_forest ===
precision: 0.5153
recall: 0.8317
f1: 0.6364
roc_auc: 0.9632
pr_auc: 0.7650


######## Tuning xgboost ########
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best params for xgboost: {'clf__subsample': 0.8, 'clf__reg_lambda': 5, 'clf__max_depth': 4, 'clf__learning_rate': 0.2, 'clf__gamma': 0, 'clf__colsample_bytree': 0.6}

=== xgboost ===
precision: 0.4511
recall: 0.8218
f1: 0.5825
roc_auc: 0.9522
pr_auc: 0.7394


In [None]:
# ====================================
# CELL 7 – Compare model performances
# ====================================

results_df = pd.DataFrame(results).set_index("model")
results_df


Unnamed: 0_level_0,precision,recall,f1,roc_auc,pr_auc
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
logistic_regression,0.453659,0.920792,0.607843,0.962142,0.775664
random_forest,0.515337,0.831683,0.636364,0.963212,0.764987
xgboost,0.451087,0.821782,0.582456,0.952201,0.7394


In [None]:
# ======================================
# CELL 8 – Save best models to disk
# ======================================

for name, model in best_models.items():
    path = MODEL_DIR / f"{name}_best.joblib"
    joblib.dump(model, path)
    print(f"Saved {name} to {path}")


Saved logistic_regression to ../models/logistic_regression_best.joblib
Saved random_forest to ../models/random_forest_best.joblib
Saved xgboost to ../models/xgboost_best.joblib


In [21]:
import os

print("MODEL_DIR:", MODEL_DIR)
print("Files in MODEL_DIR:", os.listdir(MODEL_DIR))


MODEL_DIR: ../models
Files in MODEL_DIR: ['xgboost_best.joblib', 'logistic_regression_best.joblib', 'random_forest_best.joblib']


In [22]:
from pathlib import Path
import joblib
import os

# Save models to a fixed folder in this runtime
MODEL_DIR = Path("/content/models")
MODEL_DIR.mkdir(exist_ok=True)

for name, model in best_models.items():
    save_path = MODEL_DIR / f"{name}_best.joblib"
    joblib.dump(model, save_path)
    print("Saved:", save_path)

print("Files now in /content/models:", os.listdir("/content/models"))


Saved: /content/models/logistic_regression_best.joblib
Saved: /content/models/random_forest_best.joblib
Saved: /content/models/xgboost_best.joblib
Files now in /content/models: ['xgboost_best.joblib', 'logistic_regression_best.joblib', 'random_forest_best.joblib']
