# Train machine learning model

This notebook outlines a workflow for training a machine learning model with the goal of identifying optimal hyperparameters. The `UCI Credit Card Client Default` [dataset](https://archive.ics.uci.edu/dataset/350/default+of+credit+card+clients) will be used to develop a machine learning model to predict the likelihood of credit default.


#### Import dependencies, define notebook parameters and constants


In [None]:
import json
from typing import Dict, Tuple, Union

import mlflow
import pandas as pd
from hyperopt import STATUS_OK, fmin, hp, tpe
from mlflow.models.signature import infer_signature
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [None]:
# register the notebook's text widgets as (widget name, default value) pairs
for _widget_name, _widget_default in (
    ("experiment_name", "/online-inference-containers-examples"),
    ("curated_dataset_table", "hive_metastore.default.credit_default_uci_curated"),
):
    dbutils.widgets.text(_widget_name, _widget_default)

In [None]:
# target column, kept as a one-element list so it can be concatenated
# with the feature lists when selecting dataframe columns
TARGET = ["default_payment_next_month"]

# categorical feature columns: three demographic fields followed by the
# six numbered repayment status columns
CATEGORICAL_FEATURES = ["sex", "education", "marriage"] + [
    f"repayment_status_{month}" for month in range(1, 7)
]

# numeric feature columns: credit limit and age, then the six numbered
# bill amounts, then the six numbered payment amounts
NUMERIC_FEATURES = (
    ["credit_limit", "age"]
    + [f"bill_amount_{month}" for month in range(1, 7)]
    + [f"payment_amount_{month}" for month in range(1, 7)]
)

# all features, categorical first and numeric second
FEATURES = [*CATEGORICAL_FEATURES, *NUMERIC_FEATURES]

#### Define functions to build the model


In [None]:
def make_classifer_pipeline(params: Dict[str, Union[str, int]]) -> Pipeline:
    """Assemble the preprocessing steps and random forest into one sklearn pipeline.

    Args:
        params: Keyword arguments passed straight to ``RandomForestClassifier``
            (``n_jobs=-1`` is always added on top of them).

    Returns:
        An unfitted ``Pipeline`` with a ``preprocessor`` step followed by a
        ``classifier`` step.
    """
    # categorical columns: fill gaps with the literal "missing", then one-hot encode
    impute_categories = SimpleImputer(strategy="constant", fill_value="missing")
    encode_categories = OneHotEncoder(handle_unknown="ignore")
    categorical_steps = Pipeline(
        steps=[("imputer", impute_categories), ("ohe", encode_categories)]
    )

    # numeric columns: fill gaps with the column median
    numeric_steps = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

    # route each group of columns through its own transformer
    column_preprocessing = ColumnTransformer(
        transformers=[
            ("categorical", categorical_steps, CATEGORICAL_FEATURES),
            ("numeric", numeric_steps, NUMERIC_FEATURES),
        ]
    )

    # final estimator, configured from the supplied hyperparameters
    forest = RandomForestClassifier(**params, n_jobs=-1)

    return Pipeline(
        [
            ("preprocessor", column_preprocessing),
            ("classifier", forest),
        ]
    )

In [None]:
# define objective function
def hyperparameter_tuning(params):
    """Hyperopt objective: train one candidate model and report its loss.

    Runs inside a nested MLflow run. Reads the curated dataset, holds out 20%
    of the rows for validation, fits the classifier pipeline once with
    ``params``, logs the validation metrics and the fitted model, and returns
    the negated ROC AUC so that minimising the loss maximises ROC AUC.

    Args:
        params: Hyperparameters forwarded to ``make_classifer_pipeline``.

    Returns:
        Dict with ``loss`` (negative validation ROC AUC) and ``status``.
    """
    mlflow.sklearn.autolog(silent=True)

    with mlflow.start_run(nested=True):
        # read and process curated data
        df = spark.read.table(dbutils.widgets.get("curated_dataset_table")).toPandas()

        # split into train and test datasets; the fixed random_state gives
        # every trial the same split
        feature_columns = CATEGORICAL_FEATURES + NUMERIC_FEATURES
        df_train, df_test = train_test_split(
            df[feature_columns + TARGET],
            test_size=0.20,
            random_state=2024,
        )

        # separate features and target variables
        x_train, y_train = df_train[feature_columns], df_train[TARGET]
        x_test, y_test = df_test[feature_columns], df_test[TARGET]
        y_true = y_test.values.ravel()

        # train the model exactly once: the forest is not seeded, so refitting
        # would yield a different model and make predictions and probabilities
        # come from different estimators
        estimator = make_classifer_pipeline(params)
        estimator.fit(x_train, y_train.values.ravel())

        # hard class predictions drive the threshold-based metrics
        y_pred = estimator.predict(x_test)
        # ROC AUC is a ranking metric and needs scores, not hard labels:
        # use the predicted probability of the positive (greater-label) class
        y_score = estimator.predict_proba(x_test)[:, 1]

        # calculate evaluation metrics
        validation_metrics = {
            "validation_accuracy_score": accuracy_score(y_true, y_pred),
            "validation_roc_auc_score": roc_auc_score(y_true, y_score),
            "validation_f1_score": f1_score(y_true, y_pred),
            "validation_precision_score": precision_score(y_true, y_pred),
            "validation_recall_score": recall_score(y_true, y_pred),
        }

        # log evaluation metrics
        for metric_name, metric_value in validation_metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # log model
        signature = infer_signature(x_train, y_pred)
        mlflow.sklearn.log_model(
            estimator,
            "model",
            signature=signature,
            input_example=x_test.iloc[0].to_dict(),
        )

        # hyperopt minimises the loss, so negate the score to maximise it
        return {
            "loss": -validation_metrics["validation_roc_auc_score"],
            "status": STATUS_OK,
        }

In [None]:
def train_model():
    """Run the hyperparameter search under a single parent MLflow run.

    Points MLflow at the Databricks tracking server, selects the experiment
    named by the ``experiment_name`` widget, and evaluates 10 candidate
    configurations with ``hyperparameter_tuning`` as the objective. Each
    evaluation opens its own nested run beneath the parent run.

    Returns:
        The parent MLflow run object; its ``info.run_id`` identifies the
        parent of the nested trial runs.
    """
    # set mlflow tracking uri
    mlflow.set_tracking_uri("databricks")

    # start model training run
    mlflow.set_experiment(dbutils.widgets.get("experiment_name"))
    with mlflow.start_run(run_name="credit-default-uci-train") as run:
        # define search space
        search_space = {
            "n_estimators": hp.choice("n_estimators", range(100, 1000)),
            "max_depth": hp.choice("max_depth", range(1, 25)),
            "criterion": hp.choice("criterion", ["gini", "entropy"]),
        }

        # hyperparameter tuning; the return value is deliberately discarded
        # because every trial records its own metrics and model in MLflow
        fmin(
            fn=hyperparameter_tuning,
            space=search_space,
            algo=tpe.suggest,
            max_evals=10,
        )

    # leaving the `with` block ends the run, so no explicit end_run() is needed
    return run

#### Train the machine learning model


In [None]:
# run the hyperparameter search and keep the parent run
run = train_model()

# retrieve the runs whose parent is that run, ordered by validation ROC AUC
# (highest first), and keep the top row as the best run
_child_runs = mlflow.search_runs(
    filter_string=f"tags.mlflow.parentRunId='{run.info.run_id}'",
    order_by=["metrics.validation_roc_auc_score DESC"],
)
best_run = _child_runs.iloc[0]

#### Return notebook outputs


In [None]:
# serialise the id of the best run as the notebook's JSON output
json_output = json.dumps(
    {
        "output": {
            "BEST_RUN_ID": best_run.run_id,
        }
    }
)

# exit the notebook, passing the JSON string as its value
dbutils.notebook.exit(json_output)