In [1]:
# === Imports ===
import pandas as pd, numpy as np, json
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import warnings; warnings.filterwarnings("ignore")

READY = Path("../data/ready")
label_col = "Is Laundering"

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "Random Forest": RandomForestClassifier(n_estimators=200, n_jobs=-1, class_weight="balanced"),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM (RBF)": SVC(kernel="rbf", probability=True, class_weight="balanced"),
}

for file in ["HI_Small_ready.parquet","LI_Small_ready.parquet","HI_Medium_ready.parquet","LI_Medium_ready.parquet"]:
    print(f"\n📊 Dataset → {file}")
    df = pd.read_parquet(READY / file)
    X, y = df.drop(columns=[label_col]), df[label_col]
    Xtr, Xval, ytr, yval = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    sc = StandardScaler(); Xtr = sc.fit_transform(Xtr); Xval = sc.transform(Xval)

    results = {}
    for name, mdl in models.items():
        mdl.fit(Xtr, ytr)
        p = mdl.predict(Xval)
        proba = mdl.predict_proba(Xval)[:,1] if hasattr(mdl,"predict_proba") else p
        results[name] = {
            "roc_auc": roc_auc_score(yval, proba),
            "report": classification_report(yval, p, digits=4, output_dict=True),
            "confusion": confusion_matrix(yval, p).tolist()
        }
        print(f"{name}: ROC-AUC={results[name]['roc_auc']:.4f}")

    out = READY / f"baseline_results_{file.replace('.parquet','.json')}"
    with open(out,"w") as f: json.dump(results,f,indent=2)
    print("✅ Saved:",out)



📊 Dataset → HI_Small_ready.parquet


FileNotFoundError: [Errno 2] No such file or directory: '..\\data\\ready\\HI_Small_ready.parquet'

In [1]:
from pathlib import Path
READY = Path("../data/ready")
list(READY.glob("*_ready.parquet"))


[WindowsPath('../data/ready/HI_Medium_ready.parquet'),
 WindowsPath('../data/ready/HI_Small_ready.parquet'),
 WindowsPath('../data/ready/LI_Medium_ready.parquet'),
 WindowsPath('../data/ready/LI_Small_ready.parquet')]