In [1]:
# Import libraries
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from joblib import dump


In [2]:
# Load preprocessed data
data = pd.read_csv("../data/processed/data_labeled.csv")


In [3]:
data = data.drop(columns=[
    'TransactionId', 'BatchId', 'AccountId', 
    'SubscriptionId', 'CustomerId', 'FraudResult','CountryCode','CurrencyCode'
])


In [4]:
data.head()

Unnamed: 0,Amount,Value,TotalTransactionAmount,AverageTransactionAmount,TransactionCount,TransactionAmountStdDev,hour_sin,hour_cos,day_sin,day_cos,...,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,PricingStrategy_0,PricingStrategy_1,PricingStrategy_2,PricingStrategy_4,is_high_risk
0,-0.046371,-0.072291,0.532237,-0.050092,-0.718149,-0.146743,0.5,0.866025,0.101168,-0.994869,...,-0.143114,-0.075205,-0.796656,0.82474,-0.105245,-0.063568,-0.141085,0.44503,-0.406434,0
1,-0.054643,-0.080251,-0.546125,-0.087889,1.444841,-0.161439,0.5,0.866025,0.101168,-0.994869,...,-0.143114,-0.075205,1.255247,-1.212503,-0.105245,-0.063568,-0.141085,0.44503,-0.406434,0
2,-0.050426,-0.076352,0.526214,-0.071753,-0.722639,-0.182299,0.5,0.866025,0.101168,-0.994869,...,-0.143114,-0.075205,-0.796656,0.82474,-0.105245,-0.063568,-0.141085,0.44503,-0.406434,1
3,0.107717,0.096648,0.535874,0.033881,-0.720955,0.040415,0.707107,0.707107,0.101168,-0.994869,...,6.987414,-0.075205,-0.796656,0.82474,-0.105245,-0.063568,-0.141085,0.44503,-0.406434,0
4,-0.059704,-0.075183,-0.546125,-0.087889,1.444841,-0.161439,0.707107,0.707107,0.101168,-0.994869,...,-0.143114,-0.075205,1.255247,-1.212503,-0.105245,-0.063568,-0.141085,0.44503,-0.406434,0


In [5]:
# Separate features and target
X = data.drop("is_high_risk", axis=1)
y = data["is_high_risk"]

In [6]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Models with hyperparameter grids for tuning
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

models = {
    "RandomForestClassifier": {
        "model": RandomForestClassifier(),
        "params": {
            "clf__n_estimators": [50, 100],
            "clf__max_depth": [None, 10, 20]
        }
    },
    "GradientBoosting": {
        "model": GradientBoostingClassifier(),
        "params": {
            "clf__n_estimators": [100, 200],
            "clf__learning_rate": [0.05, 0.1],
            "clf__max_depth": [3, 5]
        }
    },
    "XGBoost": {
        "model": XGBClassifier(use_label_encoder=False, eval_metric='logloss',class_weight='balanced'),
        "params": {
            "clf__n_estimators": [100, 200],
            "clf__max_depth": [3, 5, 7],
            "clf__learning_rate": [0.05, 0.1]
        }
    },
}



In [17]:
best_model = None
best_score = 0
best_model_name = ""

# Loop through each model config
for model_name, config in models.items():
    # Create pipeline with scaling + model
    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", config["model"])
    ])

    # Grid Search with 5-fold CV
    clf = GridSearchCV(pipeline, config["params"], cv=5, n_jobs=-1, scoring='recall')

    # Start MLflow run
    with mlflow.start_run(run_name=model_name):
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Log parameters and metrics to MLflow
        mlflow.log_param("model", model_name)
        mlflow.log_params(clf.best_params_)
        mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))
        mlflow.log_metric("precision", precision_score(y_test, y_pred))
        mlflow.log_metric("recall", recall_score(y_test, y_pred))
        mlflow.log_metric("f1_score", f1_score(y_test, y_pred))
        mlflow.log_metric("roc_auc", roc_auc_score(y_test, y_pred))

        # Log model artifact
        mlflow.sklearn.log_model(clf.best_estimator_, model_name)

        # Save best model
        if accuracy_score(y_test, y_pred) > best_score:
            best_score = accuracy_score(y_test, y_pred)
            best_model = clf.best_estimator_
            best_model_name = model_name


Parameters: { "class_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [18]:
# Save the best-performing model
dump(best_model, f"{best_model_name}_best_model.joblib")
print(f"Saved best model: {best_model_name}")

Saved best model: XGBoost
