In [0]:
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import mlflow
from datetime import datetime
from sklearn.model_selection import cross_validate, StratifiedKFold
from dataikuapi.dss.ml import DSSPredictionMLTaskSettings
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline

import optuna

from catboost import CatBoostClassifier, Pool

In [0]:
# Replace these constants by your own values
# ID of the Dataiku managed folder handed to project.setup_mlflow() for experiment tracking
XP_TRACKING_FOLDER_ID = "37rtxeoO"
# Name of the MLflow experiment that every run in this notebook is logged under
MLFLOW_EXPERIMENT_NAME = "credit-card-transaction-fraud-mlflow-exp"
# Code environment name passed when importing the MLflow model as a Saved Model version
MLFLOW_CODE_ENV_NAME = "py_38_scoring"
# Name used to look up (or create) the Dataiku Saved Model
SAVED_MODEL_NAME = "credit-card-transaction-fraud-classifier-mlflow"
# Dataset used for the Saved Model version's feature metadata and its evaluation
EVALUATION_DATASET = "transactions_known_test"
# Prefix of the MLflow run names built in this notebook
MODEL_NAME = "catboost"

In [0]:
# Recipe inputs: open both Dataiku datasets, then load each into a pandas DataFrame
transactions_known_train = dataiku.Dataset("transactions_known_train")
transactions_known_test = dataiku.Dataset("transactions_known_test")

train_df = transactions_known_train.get_dataframe()
test_df = transactions_known_test.get_dataframe()

In [0]:
# Columns the model is allowed to see. 'authorized_flag' is the prediction target
# rather than a feature; keeping it in this list keeps it out of the ignored columns,
# which are derived as "every training column not named here".
columns_to_inlcude = [
    'authorized_flag',
    'purchase_dow',
    'purchase_weekend',
    'item_category',
    'purchase_amount',
    'signature_provided',
    'card_reward_program',
    'card_latitude',
    'card_longitude',
    'card_fico_score',
    'card_age',
    'merchant_subsector_description',
    'merchant_latitude',
    'merchant_longitude',
    'purchase_amount_abs_z_score',
]

In [0]:
# Every training column that was not explicitly selected is passed to CatBoost
# as an ignored feature.
columns_to_ignore = [
    column_name
    for column_name in train_df.columns
    if column_name not in columns_to_inlcude
]

In [0]:
# Wire MLflow experiment tracking into this Dataiku project
client = dataiku.api_client()
project = client.get_default_project()
mlflow_model_cc_transaction_fraud_folder = dataiku.Folder(XP_TRACKING_FOLDER_ID)

mlflow_extension = project.get_mlflow_extension()
# Set up MLflow against the managed folder before any mlflow.* call is made
mlflow_handle = project.setup_mlflow(
    managed_folder=mlflow_model_cc_transaction_fraud_folder
)

# Make the experiment active, then fetch its descriptor (its id is needed to search runs)
mlflow.set_experiment(experiment_name=MLFLOW_EXPERIMENT_NAME)
mlflow_experiment = mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

In [0]:
# Name the run with current timestamp
def now_str() -> str:
    """Return the current local time as a ``YYYY_MM_DD_HH_MM`` string (minute resolution)."""
    current_time = datetime.now()
    return f"{current_time:%Y_%m_%d_%H_%M}"

# Run name shared by the tuning runs: "<MODEL_NAME>_<timestamp>"
run_name = "_".join((MODEL_NAME, now_str()))

In [0]:
# Catboost likes feature-type (categorical, numeric) indices.
# Dtypes are classified with pandas' predicates: the `np.int` / `np.float` aliases
# no longer exist in NumPy >= 1.24 (AttributeError), and on older versions they only
# matched the platform-default integer/float widths (e.g. int32 columns looked "non-int").
# NOTE: these positions refer to train_df, which still contains the target column;
# the training cells recompute them on the feature-only frames.
_is_int_column = np.array(
    [pd.api.types.is_integer_dtype(dtype) for dtype in train_df.dtypes], dtype=bool
)
_is_float_column = np.array(
    [pd.api.types.is_float_dtype(dtype) for dtype in train_df.dtypes], dtype=bool
)
nonint_features_indices = np.where(~_is_int_column)[0]
nonfloat_features_indices = np.where(~_is_float_column)[0]
# A column is categorical when it is neither integer- nor float-typed
categorical_features_indices = [value for value in nonint_features_indices if value in nonfloat_features_indices]

In [0]:
# Define the Optuna objective function for hyperparameter tuning
# MLFlow run within this function
def objective(trial):
    """Train one CatBoost candidate for an Optuna trial and return its ROC AUC.

    The call is wrapped in its own MLflow run, which receives the tags, the
    complete parameter set, the fitted model and the hold-out metrics.

    Reads the module-level ``train_df``, ``run_name`` and ``columns_to_ignore``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC on the 20% hold-out split of ``train_df``, rounded to 4 decimals.
    """
    with mlflow.start_run(run_name=run_name):
        mlflow.set_tag("model", "catboost")
        mlflow.set_tag("stage", "optuna hyperparam tuning")
        mlflow.set_tag("run_name", run_name)

        X = train_df.drop('authorized_flag', axis=1)
        y = train_df['authorized_flag']

        # Categorical columns are those that are neither integer- nor float-typed.
        # pandas' dtype predicates are used because the np.int / np.float aliases
        # no longer exist in NumPy >= 1.24 and never matched non-default widths.
        is_numeric_column = np.array(
            [
                pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype)
                for dtype in X.dtypes
            ],
            dtype=bool,
        )
        categorical_features_indices = np.where(~is_numeric_column)[0].tolist()

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        train_data = Pool(data=X_train, label=y_train, cat_features=categorical_features_indices)
        test_data = Pool(data=X_test, label=y_test, cat_features=categorical_features_indices)

        # Hyperparameters space here
        param = {
            "objective": "Logloss",
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.7, 0.9),
            "depth": trial.suggest_int("depth", 1, 12),
            "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
            "bootstrap_type": trial.suggest_categorical(
                "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
            ),
            "auto_class_weights": trial.suggest_categorical("auto_class_weights", ["Balanced","SqrtBalanced"]),
            "ignored_features": columns_to_ignore,
            "iterations":10,
            "used_ram_limit":"6gb"

        }

        # Parameters that only exist for a specific bootstrap type
        if param["bootstrap_type"] == "Bayesian":
            param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
        elif param["bootstrap_type"] == "Bernoulli":
            param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

        # Use MLFlow to log chosen parameters. Logged only once the dict is complete,
        # so the bootstrap-specific parameters above are recorded too.
        mlflow.log_params(param)

        cat_cls = CatBoostClassifier(**param)
        cat_cls.fit(train_data, eval_set = test_data, verbose=0)
        mlflow.catboost.log_model(cat_cls, artifact_path=f"{run_name}")

        preds = cat_cls.predict(X_test)
        pred_labels = np.rint(preds)
        # ROC AUC ranks examples by score, so it needs the positive-class probability;
        # hard labels would collapse the curve to a single operating point.
        positive_class_scores = cat_cls.predict_proba(X_test)[:, 1]
        roc_auc = round(roc_auc_score(y_test, positive_class_scores),4)
        accuracy = round(accuracy_score(y_test, pred_labels),4)

        mlflow.log_metric("roc_auc", roc_auc)
        mlflow.log_metric("accuracy", accuracy)

        return roc_auc

In [0]:
# Launch the Optuna search: maximize the value returned by `objective`,
# capped at 15 trials and a 600-second timeout.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=15, timeout=600)

# Name of the MLflow run that retrains with the best parameters
run_name_final = f"{run_name}_final"

In [0]:
# Show the hyperparameters of the best trial
print("Best params", study.best_params, sep="\n")

In [0]:
# Run one more MLFlow experiment to train the model on the full train/test sets
# using the best parameters
with mlflow.start_run(run_name=run_name_final) as run:
    run_id = run.info.run_id

    mlflow.set_tag("model", "catboost")
    mlflow.set_tag("stage", "final train")
    mlflow.set_tag("run_name", run_name_final)

    X_train = train_df.drop('authorized_flag', axis=1)
    y_train = train_df['authorized_flag']

    # Classify dtypes with pandas' predicates: the np.int / np.float aliases no longer
    # exist in NumPy >= 1.24 and never matched non-default integer/float widths.
    is_int_column = np.array(
        [pd.api.types.is_integer_dtype(dtype) for dtype in X_train.dtypes], dtype=bool
    )
    is_float_column = np.array(
        [pd.api.types.is_float_dtype(dtype) for dtype in X_train.dtypes], dtype=bool
    )
    nonint_features_indices = np.where(~is_int_column)[0]
    nonfloat_features_indices = np.where(~is_float_column)[0]
    # A column is categorical when it is neither integer- nor float-typed
    categorical_features_indices = [value for value in nonint_features_indices if value in nonfloat_features_indices]

    X_test = test_df.drop('authorized_flag', axis=1)
    y_test = test_df['authorized_flag']

    train_data = Pool(data=X_train, label=y_train, cat_features=categorical_features_indices)
    test_data = Pool(data=X_test, label=y_test, cat_features=categorical_features_indices)

    # study.best_params only holds the sampled hyperparameters, so the fixed settings
    # are added explicitly. `ignored_features` in particular must be repeated here,
    # otherwise the final model would train on the columns excluded during tuning.
    final_params = {
        **study.best_params,
        "eval_metric": "AUC",
        "iterations": 10,
        "ignored_features": columns_to_ignore,
    }
    mlflow.log_params(final_params)

    cat_cls = CatBoostClassifier(**final_params)
    cat_cls.fit(train_data, eval_set = test_data)
    mlflow.catboost.log_model(cat_cls, artifact_path=f"{run_name_final}")

    preds = cat_cls.predict(X_test)
    pred_labels = np.rint(preds)
    # ROC AUC needs a ranking score (positive-class probability), not hard labels
    positive_class_scores = cat_cls.predict_proba(X_test)[:, 1]
    roc_auc = round(roc_auc_score(y_test, positive_class_scores),4)
    accuracy = round(accuracy_score(y_test, pred_labels),4)

    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("accuracy", accuracy)

In [0]:
# Get the MLFlow details of the final, best trained model:
# fetch the runs recorded under this notebook's experiment; the result is
# filtered by the 'tags.run_name' column further down to find the final run.
experiment_id = mlflow_experiment.experiment_id
experiment_results_df = mlflow.search_runs(experiment_id)

In [0]:
# Display the table of runs returned by mlflow.search_runs for inspection
experiment_results_df

In [0]:
# Keep only the run(s) tagged with the final run name and take the first match
is_final_run = experiment_results_df['tags.run_name'] == run_name_final
latest_run_results_df = experiment_results_df[is_final_run]
best_run_id = latest_run_results_df['run_id'].iloc[0]
# Location of the logged model inside the managed folder.
# NOTE(review): the leading "credit_card_transac" segment is hard-coded; presumably it is
# the experiment's directory in the tracking folder - confirm it matches your experiment.
model_path = f"credit_card_transac/{best_run_id}/artifacts/{run_name_final}"

In [0]:
# Reuse the Dataiku Saved Model named SAVED_MODEL_NAME, or create it when missing
sm_id = None
for candidate in project.list_saved_models():
    if candidate["name"] == SAVED_MODEL_NAME:
        sm_id = candidate["id"]
        print(f"Found Saved Model {candidate['name']} with id {candidate['id']}")
        break

if not sm_id:
    # No match: create an MLflow pyfunc Saved Model for binary classification
    sm = project.create_mlflow_pyfunc_model(
        name=SAVED_MODEL_NAME,
        prediction_type=DSSPredictionMLTaskSettings.PredictionTypes.BINARY,
    )
    sm_id = sm.id
    print(f"Saved Model not found, created new one with id {sm_id}")
else:
    sm = project.get_saved_model(sm_id)

In [0]:
# Import the final trained model into the Dataiku Saved Model (Green Diamond),
# using the final run name as the version id
mlflow_version = sm.import_mlflow_version_from_managed_folder(
    version_id=run_name_final,
    managed_folder=XP_TRACKING_FOLDER_ID,
    path=model_path,
    code_env_name=MLFLOW_CODE_ENV_NAME,
)

# Make this Saved Model version the active one
imported_version_id = mlflow_version.version_id
sm.set_active_version(imported_version_id)

In [0]:
# Set model metadata: target column, class labels, and the dataset the feature list is taken from
mlflow_version.set_core_metadata(
    'authorized_flag',
    [0, 1],
    get_features_from_dataset=EVALUATION_DATASET,
)

In [0]:
# Evaluate the performance of this new version on EVALUATION_DATASET,
# to populate the performance screens of the saved model version in DSS
mlflow_version.evaluate(EVALUATION_DATASET)