# Model Training Pipeline Development
- Author: Marcellinus Aditya Witarsah
- Date: 05 June 2024

In [1]:
# Imports
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import logging
import time
import pickle
import os
import joblib
from pathlib import Path
from abc import ABC
from abc import abstractmethod
from scipy import stats
from typing import Tuple
from typing import Union
from dataclasses import dataclass
from src.utils.common import logger
from src.utils.common import read_yaml, create_directories
from src.constants import CONFIG_FILE_PATH, SCHEMA_FILE_PATH, PARAMS_FILE_PATH
from sklearn.linear_model import LogisticRegression
from optbinning import Scorecard
from optbinning import BinningProcess

(CVXPY) Jun 08 01:35:06 PM: Encountered unexpected exception importing solver GLOP:
RuntimeError('Unrecognized new version of ortools (9.10.4067). Expected < 9.10.0. Please open a feature request on cvxpy to enable support for this version.')
(CVXPY) Jun 08 01:35:06 PM: Encountered unexpected exception importing solver PDLP:
RuntimeError('Unrecognized new version of ortools (9.10.4067). Expected < 9.10.0. Please open a feature request on cvxpy to enable support for this version.')


In [2]:
# Move the working directory one level up from wherever the kernel started.
# Run this cell once per kernel session only: every additional execution
# climbs one more directory and breaks the relative paths used below.
os.chdir(os.pardir)

# Configuration

In [3]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainingConfig:
    """
    Immutable bundle of settings consumed by the model training stage.

    Instances are frozen, so fields cannot be reassigned after construction.

    Attributes:
        root_dir (str): Directory holding the model training artifacts.
        model_path (str): Location where the trained model is stored.
        train_data_path (str): Location of the training data set.
        test_data_path (str): Location of the test data set.
        experiment_name (str): Experiment name used for tracking.
        run_name (str): Run name used for tracking.
        target_column (str): Name of the column holding the target.
        binning_process (dict): Settings passed to the binning process.
        logistic_regression (dict): Settings passed to logistic regression.
        scorecard (dict): Settings passed to the scorecard.
    """
    # Filesystem locations
    root_dir: str
    model_path: str
    train_data_path: str
    test_data_path: str

    # Experiment tracking labels
    experiment_name: str
    run_name: str

    # Data schema
    target_column: str

    # Per-component hyperparameters
    binning_process: dict
    logistic_regression: dict
    scorecard: dict

In [4]:
# src/config/configuration_manager.py
from src.utils.common import read_yaml, create_directories
class ConfigurationManager:
    """
    Load the project's YAML files and expose stage-specific configuration.

    On construction the three YAML files are parsed and the artifacts root
    directory named in the configuration file is created.

    Attributes:
        config (dict): Parsed content of the configuration file.
        params (dict): Parsed content of the parameters file.
        schema (dict): Parsed content of the schema file.
    """
    def __init__(
        self,
        config_filepath: str = CONFIG_FILE_PATH,
        params_filepath: str = PARAMS_FILE_PATH,
        schema_filepath: str = SCHEMA_FILE_PATH
    ):
        """
        Parse the YAML files and create the artifacts root directory.

        Args:
            config_filepath (str): Location of the configuration YAML file.
            params_filepath (str): Location of the parameters YAML file.
            schema_filepath (str): Location of the schema YAML file.
        """
        config_path = Path(config_filepath)
        self.config = read_yaml(config_path)

        params_path = Path(params_filepath)
        self.params = read_yaml(params_path)

        schema_path = Path(schema_filepath)
        self.schema = read_yaml(schema_path)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_training_config(self) -> ModelTrainingConfig:
        """
        Assemble the settings needed by the model training stage.

        Creates the stage's root directory as a side effect.

        Returns:
            ModelTrainingConfig: Paths and tracking names from the config file,
                the target column name from the schema file, and the
                per-component settings from the params file.
        """
        training_section = self.config.model_training
        hyperparameters = self.params
        target_schema = self.schema.TARGET_COLUMN

        create_directories([training_section.root_dir])

        return ModelTrainingConfig(
            root_dir=training_section.root_dir,
            model_path=training_section.model_path,
            train_data_path=training_section.train_data_path,
            test_data_path=training_section.test_data_path,
            experiment_name=training_section.experiment_name,
            run_name=training_section.run_name,
            target_column=target_schema.name,
            binning_process=hyperparameters.binning_process,
            logistic_regression=hyperparameters.logistic_regression,
            scorecard=hyperparameters.scorecard,
        )


In [5]:
configuration_manager = ConfigurationManager()
configuration_manager.get_model_training_config()

2024-06-08 13:35:13,457 - credit-scorecard-logger - INFO - yaml file: config.yaml loaded successfully
2024-06-08 13:35:13,468 - credit-scorecard-logger - INFO - yaml file: params.yaml loaded successfully
2024-06-08 13:35:13,471 - credit-scorecard-logger - INFO - yaml file: schema.yaml loaded successfully
2024-06-08 13:35:13,471 - credit-scorecard-logger - INFO - Created directory at: artifacts
2024-06-08 13:35:13,472 - credit-scorecard-logger - INFO - Created directory at: artifacts/model_training


ModelTrainingConfig(root_dir='artifacts/model_training', model_path='artifacts/model_training/model.joblib', train_data_path='artifacts/data_preprocessing/train.csv', test_data_path='artifacts/data_preprocessing/test.csv', experiment_name='credit-scorecard', run_name='woe-logreg-scorecard-model', target_column='loan_status', binning_process=ConfigBox({'categorical_variables': ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'], 'selection_criteria': {'iv': {'min': 0.02, 'max': 1}}}), logistic_regression=ConfigBox({'random_state': 42}), scorecard=ConfigBox({'scaling_method': 'pdo_odds', 'scaling_method_params': {'pdo': 20, 'odds': 1, 'scorecard_points': 500}, 'intercept_based': True}))

# Modelling

In [6]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc, precision_recall_curve
from scipy.stats import ks_2samp

def roc_auc(y_true: Union[list, np.array], y_pred_proba: Union[list, np.array]) -> float:
    """
    Compute the area under the ROC curve.

    Thin wrapper around scikit-learn's `roc_auc_score`.

    Args:
        y_true (Union[list, np.array]): True labels.
        y_pred_proba (Union[list, np.array]): Predicted probability of target class `1`.
    Returns:
        float: ROC AUC score.
    """
    score = roc_auc_score(y_true, y_pred_proba)
    return score

def pr_auc(y_true: Union[list, np.array], y_pred_proba: Union[list, np.array]) -> float:
    """
    Compute the area under the precision-recall curve.

    Args:
        y_true (Union[list, np.array]): True labels.
        y_pred_proba (Union[list, np.array]): Predicted probability of target class `1`.
    Returns:
        float: PR AUC score.
    """
    curve = precision_recall_curve(y_true, y_pred_proba)
    precision_values, recall_values = curve[0], curve[1]
    # Recall goes on the x axis and precision on the y axis.
    return auc(recall_values, precision_values)

def gini(y_true: Union[list, np.array], y_pred_proba: Union[list, np.array]) -> float:
    """
    Compute the Gini coefficient, defined here as `2 * ROC AUC - 1`.

    Args:
        y_true (Union[list, np.array]): True labels.
        y_pred_proba (Union[list, np.array]): Predicted probability of target class `1`.
    Returns:
        float: Gini coefficient.
    """
    auc_score = roc_auc_score(y_true, y_pred_proba)
    return 2 * auc_score - 1

def ks(y_true: Union[list, np.array], y_pred_proba: Union[list, np.array]) -> float:
    """
    Calculate the Kolmogorov-Smirnov (KS) statistic between the predicted
    probabilities of the two classes.

    Args:
        y_true (Union[list, np.array]): True labels (`0` for not default, `1` for default).
        y_pred_proba (Union[list, np.array]): Predicted probability of target class `1`.
    Returns:
        float: KS statistic.
    """
    # Coerce to arrays first: with plain lists `y_true == 0` is a single bool,
    # which would index one element instead of building a boolean mask.
    labels = np.asarray(y_true)
    scores = np.asarray(y_pred_proba)

    scores_not_default = scores[labels == 0]
    scores_default = scores[labels == 1]
    ks_stat, _ = ks_2samp(scores_not_default, scores_default)
    return float(ks_stat)

In [7]:
from sklearn.calibration import calibration_curve
def plot_calibration_curve(y_true: np.array, y_pred_proba: np.array, model_name: str, figsize: Tuple[int, int], path: Union[str, Path], n_bins: int = 10) -> plt.Figure:
    """
    Plot a calibration curve and save it to disk.

    Args:
        y_true (np.array): True binary labels (0 for not default, 1 for default).
        y_pred_proba (np.array): Predicted probabilities for the positive class (default).
        model_name (str): Name of the model for labeling the plot.
        figsize (Tuple[int, int]): Size of the plot.
        path (Union[str, Path]): Destination file the figure is saved to.
        n_bins (int): Number of bins to use for calibration curve.
    Return:
        plt.Figure: The Matplotlib figure. It has already been saved to `path`
            and closed by the time it is returned.
    """
    prob_true, prob_pred = calibration_curve(y_true, y_pred_proba, n_bins=n_bins)

    # Scope the style to this figure so the caller's global Matplotlib
    # settings are not changed as a side effect of plotting.
    with plt.style.context('fivethirtyeight'):
        fig, ax = plt.subplots(figsize=figsize)
        try:
            ax.plot([0, 1], [0, 1], linestyle='--', label='Perfectly calibrated')
            ax.plot(prob_pred, prob_true, marker='o', label=model_name)
            ax.set_xlabel('Mean predicted probability')
            ax.set_ylabel('Fraction of positives')
            ax.set_title('Calibration plot')
            ax.legend()
            ax.grid(True)

            # Save inside the style context so the style's savefig settings apply.
            fig.savefig(path)
        finally:
            # Always release the figure, even when drawing or saving fails.
            plt.close(fig)

    return fig

In [8]:
from sklearn.model_selection import train_test_split
import pandas as pd
from pathlib import Path
import logging
from typing import Tuple
from dotenv import load_dotenv, find_dotenv
import mlflow
from urllib.parse import urlparse
from mlflow.models import infer_signature
# Load environment variables from a .env file before ModelTraining.train reads
# its MLflow settings through os.getenv.
# NOTE(review): presumably find_dotenv() locates the nearest .env file; confirm
# against the python-dotenv documentation.
load_dotenv(find_dotenv())

class ModelTraining:
    """
    Class to handle the model training process.

    Fits an optbinning `Scorecard` (a `BinningProcess` plus a scikit-learn
    `LogisticRegression`) on the training data, evaluates it on the train and
    test data, and logs parameters, metrics, the model, and calibration plots
    to MLflow.
    """

    def __init__(self, config: ModelTrainingConfig):
        """
        Instantiate `ModelTraining` class.

        Args:
            config (ModelTrainingConfig): Configuration for model training.
        """
        self.config = config

    def train(self) -> None:
        """
        Train, evaluate, and log model to MLFlow Registry.

        The tracking URI is read from the `MLFLOW_TRACKING_URI` environment
        variable. The historically misspelled `MLFLOW_TRAKING_URI` is still
        honoured as a fallback so existing `.env` files keep working.
        """
        logger.info("Train model")
        X_train, y_train = self._read_features_and_target(self.config.train_data_path)

        # Train:
        scorecard_model = self._build_scorecard(X_train.columns.values)
        scorecard_model.fit(X_train, y_train)

        # Prediction on Train Data (last probability column):
        y_pred_proba_train = scorecard_model.predict_proba(X_train)[:, -1]

        # Prediction on Test Data:
        X_test, y_test = self._read_features_and_target(self.config.test_data_path)
        y_pred_proba_test = scorecard_model.predict_proba(X_test)[:, -1]

        # Track Experiment using MLFlow:
        logger.info("Initialize MLFlow Tracking ...")
        tracking_uri = os.getenv("MLFLOW_TRACKING_URI") or os.getenv("MLFLOW_TRAKING_URI")
        mlflow.set_tracking_uri(tracking_uri)
        mlflow.set_experiment(self.config.experiment_name)

        logger.info("Start Tracking ...")
        # Track Experiment:
        with mlflow.start_run(run_name=self.config.run_name):
            logger.info("Log Params")
            mlflow.log_params(self.config.binning_process)
            mlflow.log_params(self.config.logistic_regression)
            mlflow.log_params(self.config.scorecard)

            # Log Train Metrics, then Test Metrics:
            logger.info("Log Metrics")
            self._log_metrics("train", y_train, y_pred_proba_train)
            self._log_metrics("test", y_test, y_pred_proba_test)

            # Log Models:
            logger.info("Log Models")
            signature = infer_signature(X_train.iloc[:1, :], y_pred_proba_train[:1])
            mlflow.sklearn.log_model(
                scorecard_model,
                "model",
                signature=signature,
                registered_model_name="credit-score-model",
            )

            # Log Plots:
            logger.info("Log Artifacts")
            self._log_calibration_plot("train", y_train, y_pred_proba_train)
            self._log_calibration_plot("test", y_test, y_pred_proba_test)

    def _read_features_and_target(self, data_path: str) -> Tuple[pd.DataFrame, pd.Series]:
        """Read a CSV file and split it into features and the target column."""
        data = pd.read_csv(data_path)
        features = data.drop(columns=[self.config.target_column])
        target = data[self.config.target_column]
        return features, target

    def _build_scorecard(self, feature_names) -> "Scorecard":
        """Assemble the unfitted scorecard from the configured components."""
        binning_process = BinningProcess(
            feature_names,
            **self.config.binning_process
        )
        logreg_model = LogisticRegression(**self.config.logistic_regression)
        return Scorecard(
            binning_process=binning_process,
            estimator=logreg_model,
            **self.config.scorecard
        )

    @staticmethod
    def _log_metrics(prefix: str, y_true, y_pred_proba) -> None:
        """Log ROC AUC, PR AUC, Gini and KS to MLflow as `<prefix>_<metric>`."""
        metric_functions = (
            ("roc_auc", roc_auc),
            ("pr_auc", pr_auc),
            ("gini", gini),
            ("ks", ks),
        )
        for metric_name, metric_function in metric_functions:
            mlflow.log_metric(f"{prefix}_{metric_name}", metric_function(y_true, y_pred_proba))

    def _log_calibration_plot(self, prefix: str, y_true, y_pred_proba) -> None:
        """Save a calibration plot under `root_dir` and log it as an MLflow artifact."""
        plot_path = str(Path(self.config.root_dir) / f"{prefix}_model_calibration.png")
        plot_calibration_curve(y_true, y_pred_proba, "Logistic Regression", (10, 7), plot_path)
        mlflow.log_artifact(plot_path)

In [9]:
# Run the model training stage end to end with the project configuration.
try:
    configuration_manager = ConfigurationManager()
    model_training = ModelTraining(
        config=configuration_manager.get_model_training_config()
    )
    model_training.train()
except Exception as e:
    # Log the full traceback and re-raise, so a failed training run is visible
    # instead of looking like a cell that completed successfully.
    logger.exception(e)
    raise

2024-06-08 13:35:20,247 - credit-scorecard-logger - INFO - yaml file: config.yaml loaded successfully
2024-06-08 13:35:20,250 - credit-scorecard-logger - INFO - yaml file: params.yaml loaded successfully
2024-06-08 13:35:20,252 - credit-scorecard-logger - INFO - yaml file: schema.yaml loaded successfully
2024-06-08 13:35:20,252 - credit-scorecard-logger - INFO - Created directory at: artifacts
2024-06-08 13:35:20,252 - credit-scorecard-logger - INFO - Created directory at: artifacts/model_training
2024-06-08 13:35:20,252 - credit-scorecard-logger - INFO - Train model
2024-06-08 13:35:21,291 - credit-scorecard-logger - INFO - Initialize MLFlow Tracking ...
2024-06-08 13:35:21,829 - credit-scorecard-logger - INFO - Start Tracking ...
2024-06-08 13:35:22,534 - credit-scorecard-logger - INFO - Log Params
2024-06-08 13:35:23,680 - credit-scorecard-logger - INFO - Log Metrics
2024-06-08 13:35:27,201 - credit-scorecard-logger - INFO - Log Models


Successfully registered model 'credit-score-model'.
2024/06/08 13:35:43 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: credit-score-model, version 1


2024-06-08 13:35:43,065 - credit-scorecard-logger - INFO - Log Artifacts


Created version '1' of model 'credit-score-model'.


# Testing
Restart the kernel, then run the cells below from a clean session to test the packaged pipeline modules.

In [1]:
import os

# Move the working directory one level up from wherever the kernel started.
# Run once per kernel session only: each extra execution climbs another level.
os.chdir(os.pardir)

In [2]:
from src.utils.common import logger
from src.config.configuration_manager import ConfigurationManager
from src.models.model_training import ModelTraining

class ModelTrainingPipeline:
    """
    Orchestrates the model training stage: load configuration, then train.
    """

    def __init__(self):
        """
        Create the pipeline together with its `ConfigurationManager`.
        """
        self.configuration_manager = ConfigurationManager()

    def run(self):
        """
        Build a `ModelTraining` from the stage configuration and train it.
        """
        training_config = self.configuration_manager.get_model_training_config()
        trainer = ModelTraining(config=training_config)
        trainer.train()

if __name__ == "__main__":
    STAGE_NAME = "Model Training Stage"
    try:
        logger.info(f">>>>>> {STAGE_NAME} Started <<<<<<")
        model_training_pipeline = ModelTrainingPipeline()
        model_training_pipeline.run()
        logger.info(f">>>>>> {STAGE_NAME} Completed <<<<<<")
    except Exception as e:
        # Log the full traceback and re-raise, so a failed stage stops with an
        # error instead of finishing as if it had succeeded.
        logger.exception(e)
        raise

2024-06-06 16:43:10,142 - credit-scorecard-logger - INFO - >>>>>> Model Training Stage Started <<<<<<
2024-06-06 16:43:10,142 - credit-scorecard-logger - INFO - yaml file: config.yaml loaded successfully
2024-06-06 16:43:10,154 - credit-scorecard-logger - INFO - yaml file: params.yaml loaded successfully
2024-06-06 16:43:10,154 - credit-scorecard-logger - INFO - yaml file: schema.yaml loaded successfully
2024-06-06 16:43:10,154 - credit-scorecard-logger - INFO - Created directory at: artifacts
2024-06-06 16:43:10,154 - credit-scorecard-logger - INFO - Created directory at: artifacts/model_training
2024-06-06 16:43:10,154 - credit-scorecard-logger - INFO - Train model
2024-06-06 16:43:11,394 - credit-scorecard-logger - INFO - Initialize MLFlow Tracking ...
2024-06-06 16:43:11,808 - credit-scorecard-logger - INFO - Start Tracking ...
2024-06-06 16:43:12,527 - credit-scorecard-logger - INFO - Log Params
2024-06-06 16:43:13,687 - credit-scorecard-logger - INFO - Log Metrics
2024-06-06 16:4

Successfully registered model 'WeightOfEvidence+LogisticRegression'.
2024/06/06 16:43:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: WeightOfEvidence+LogisticRegression, version 1


2024-06-06 16:43:44,217 - credit-scorecard-logger - INFO - Log Artifacts


Created version '1' of model 'WeightOfEvidence+LogisticRegression'.


2024-06-06 16:43:46,873 - credit-scorecard-logger - INFO - >>>>>> Model Training Stage Completed <<<<<<
