# Model Evaluation Pipeline Development
- Author: Marcellinus Aditya Witarsah
- Date: 05 June 2024

In [1]:
# Imports
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import logging
import time
import pickle
import os
from pathlib import Path
from abc import ABC
from abc import abstractmethod
from scipy import stats
from typing import Tuple
from typing import Union
from dataclasses import dataclass
from src.utils.common import logger
from src.utils.common import read_yaml, create_directories
from src.constants import CONFIG_FILE_PATH, SCHEMA_FILE_PATH, PARAMS_FILE_PATH
from sklearn.linear_model import LogisticRegression
from optbinning import Scorecard
from optbinning import BinningProcess

In [2]:
# Move the working directory one level up. Run this cell once only per kernel
# session: every additional execution climbs one more directory.
# NOTE(review): presumably this makes relative paths such as `artifacts/`
# resolve from the project root -- confirm against the repo layout.
os.chdir("..")

# Configuration

In [4]:
from dotenv import load_dotenv, find_dotenv
# Locate a `.env` file and load its variables into the process environment;
# the MLflow settings are read further down with `os.getenv`.
load_dotenv(find_dotenv())

# src/entities/config_entity.py
@dataclass(frozen=True)
class ModelEvaluationConfig:
    """
    Data class for storing model evaluation configuration.

    Instances are immutable (`frozen=True`).

    Attributes:
        root_dir (Path): Root directory for model evaluation artifacts.
        test_data_path (Path): Path to the test data.
        model_path (Path): Path to the trained model to evaluate.
        metric_file_name (Path): Path of the file the metrics are written to.
        target_column (str): The name of the target column.
        mlflow_uri (str | None): MLflow tracking URI, `None` when not set.
        mlflow_username (str | None): MLflow tracking username, `None` when not set.
        mlflow_password (str | None): MLflow tracking password, `None` when not set.
    """
    root_dir: Path
    test_data_path: Path
    model_path: Path
    metric_file_name: Path
    target_column: str
    # The MLflow values come from `os.getenv`, which returns None when the
    # variable is missing, so None is a legitimate value for these fields.
    mlflow_uri: Union[str, None]
    mlflow_username: Union[str, None]
    mlflow_password: Union[str, None]

# src/config/configuration_manager.py
class ConfigurationManager:
    """
    Read the project's YAML files and build per-stage configuration objects.

    Attributes:
        config: Parsed configuration file content (accessed by attribute,
            e.g. `config.artifacts_root`).
        params: Parsed parameters file content.
        schema: Parsed schema file content.
    """
    def __init__(
        self,
        config_filepath: str = CONFIG_FILE_PATH,
        params_filepath: str = PARAMS_FILE_PATH,
        schema_filepath: str = SCHEMA_FILE_PATH
    ):
        """
        Initialize the ConfigurationManager with file paths.

        Loads the three YAML files and creates the artifacts root directory.

        Args:
            config_filepath (str): File path to the configuration YAML file.
            params_filepath (str): File path to the parameters YAML file.
            schema_filepath (str): File path to the schema YAML file.
        """
        self.config = read_yaml(Path(config_filepath))
        self.params = read_yaml(Path(params_filepath))
        self.schema = read_yaml(Path(schema_filepath))
        create_directories([self.config.artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """
        Get configuration for model evaluation.

        Creates the evaluation root directory as a side effect. The MLflow
        settings are read from environment variables and are `None` when the
        corresponding variable is not set.

        Returns:
            ModelEvaluationConfig: Configuration for model evaluation.
        """
        config = self.config.model_evaluation
        target_schema = self.schema.TARGET_COLUMN

        create_directories([config.root_dir])

        # MLflow's variable is MLFLOW_TRACKING_URI. The misspelled name that
        # this code used to read is kept as a fallback so that existing
        # `.env` files written with the typo keep working.
        mlflow_uri = (
            os.getenv("MLFLOW_TRACKING_URI")
            or os.getenv("MLFLOW_TRAKING_URI")
        )

        return ModelEvaluationConfig(
            root_dir=config.root_dir,
            test_data_path=config.test_data_path,
            model_path=config.model_path,
            metric_file_name=config.metric_file_name,
            target_column=target_schema.name,
            mlflow_uri=mlflow_uri,
            mlflow_username=os.getenv("MLFLOW_TRACKING_USERNAME"),
            mlflow_password=os.getenv("MLFLOW_TRACKING_PASSWORD")
        )


In [5]:
# Smoke-test the manager: constructing it loads the three YAML files and
# creates the artifacts root directory (see the log output below).
configuration_manager = ConfigurationManager()
# configuration_manager.get_model_evaluation_config()

2024-06-06 09:21:53,421 - credit-scorecard-logger - INFO - yaml file: config.yaml loaded successfully
2024-06-06 09:21:53,426 - credit-scorecard-logger - INFO - yaml file: params.yaml loaded successfully
2024-06-06 09:21:53,428 - credit-scorecard-logger - INFO - yaml file: schema.yaml loaded successfully
2024-06-06 09:21:53,428 - credit-scorecard-logger - INFO - Created directory at: artifacts


# Model Evaluation

In [16]:
from sklearn.model_selection import train_test_split
import pandas as pd
from pathlib import Path
import logging
from typing import Tuple
import joblib
from sklearn.metrics import roc_auc_score, roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
import mlflow
from urllib.parse import urlparse
import json

# src/models/model_evaluation.py
class ModelEvaluation:
    """
    Evaluate a persisted model on the test set and report the metrics.

    The metrics (ROC AUC, PR AUC, Gini, KS) are written to a JSON file and
    logged to MLflow together with the model.

    Attributes:
        config (ModelEvaluationConfig): Paths, target column and MLflow settings.
    """
    def __init__(self, config: "ModelEvaluationConfig"):
        """
        Instantiate `ModelEvaluation` class.

        Args:
            config (ModelEvaluationConfig): Configuration for model evaluation.
        """
        self.config = config

    def roc_auc(self, y_true: Union[list, np.ndarray], y_pred_proba: Union[list, np.ndarray]) -> float:
        """
        Calculate ROC AUC (Area Under the Receiver Operating Characteristic Curve).

        Args:
            y_true (Union[list, np.ndarray]): True labels.
            y_pred_proba (Union[list, np.ndarray]): Prediction probability of target class of `1`
        Returns:
            float: ROC AUC score.
        """
        return roc_auc_score(y_true, y_pred_proba)

    def pr_auc(self, y_true: Union[list, np.ndarray], y_pred_proba: Union[list, np.ndarray]) -> float:
        """
        Calculate PR AUC (Area Under the Precision Recall Curve).

        Args:
            y_true (Union[list, np.ndarray]): True labels.
            y_pred_proba (Union[list, np.ndarray]): Prediction probability of target class of `1`
        Returns:
            float: PR AUC score.
        """
        precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
        return auc(recall, precision)

    def gini(self, y_true: Union[list, np.ndarray], y_pred_proba: Union[list, np.ndarray]) -> float:
        """
        Calculate Gini coefficient, defined here as `2 * ROC AUC - 1`.

        Args:
            y_true (Union[list, np.ndarray]): True labels.
            y_pred_proba (Union[list, np.ndarray]): Prediction probability of target class of `1`
        Returns:
            float: Gini coefficient.
        """
        # Only the AUC is needed; the ROC curve itself is not computed.
        return 2 * self.roc_auc(y_true, y_pred_proba) - 1

    def ks(self, y_true: Union[list, np.ndarray], y_pred_proba: Union[list, np.ndarray]) -> float:
        """
        Calculate Kolmogorov-Smirnov (KS) statistic.

        Compares the predicted-probability distribution of the samples
        labelled `0` with that of the samples labelled `1`.

        Args:
            y_true (Union[list, np.ndarray]): True labels.
            y_pred_proba (Union[list, np.ndarray]): Prediction probability of target class of `1`
        Returns:
            float: KS statistic.
        """
        # Convert to arrays first: with plain lists `y_true == 0` is a single
        # bool, which would index one element instead of masking the samples.
        labels = np.asarray(y_true)
        probabilities = np.asarray(y_pred_proba)
        proba_not_default = probabilities[labels == 0]
        proba_default = probabilities[labels == 1]
        return float(stats.ks_2samp(proba_not_default, proba_default).statistic)

    def get_evaluation_metrics(self, y_true: Union[pd.Series, np.ndarray], y_pred_proba: Union[pd.Series, np.ndarray]) -> Tuple[float, float, float, float]:
        """
        Calculate all evaluation metrics at once.

        Args:
            y_true (Union[pd.Series, np.ndarray]): True labels.
            y_pred_proba (Union[pd.Series, np.ndarray]): Prediction probability of target class of `1`
        Returns:
            Tuple[float, float, float, float]: `(roc_auc, pr_auc, gini, ks)`.
        """
        roc_auc = self.roc_auc(y_true, y_pred_proba)
        pr_auc = self.pr_auc(y_true, y_pred_proba)
        gini = self.gini(y_true, y_pred_proba)
        ks = self.ks(y_true, y_pred_proba)
        return (roc_auc, pr_auc, gini, ks)

    def evaluate(self):
        """
        Evaluate the model on the test data and record the results.

        Loads the test CSV and the persisted model, scores the test set,
        writes the metrics as JSON to `config.metric_file_name`, and logs the
        metrics and the model to MLflow.
        """
        # Export only the settings that are actually present: assigning None
        # to `os.environ` raises TypeError, and a local file store needs no
        # credentials. MLflow reads MLFLOW_TRACKING_URI, not the misspelled
        # MLFLOW_TRAKING_URI that was exported here before.
        mlflow_env = {
            "MLFLOW_TRACKING_URI": self.config.mlflow_uri,
            "MLFLOW_TRACKING_USERNAME": self.config.mlflow_username,
            "MLFLOW_TRACKING_PASSWORD": self.config.mlflow_password,
        }
        for name, value in mlflow_env.items():
            if value is not None:
                os.environ[name] = value

        test = pd.read_csv(self.config.test_data_path)
        model = joblib.load(self.config.model_path)

        X_test = test.drop(columns=[self.config.target_column])
        y_test = test[self.config.target_column]

        if self.config.mlflow_uri is not None:
            mlflow.set_tracking_uri(self.config.mlflow_uri)
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        with mlflow.start_run():
            # The last column is taken as the probability of the positive class.
            y_pred_proba = model.predict_proba(X_test)[:, -1]
            (roc_auc, pr_auc, gini, ks) = self.get_evaluation_metrics(y_test, y_pred_proba)
            # Plain floats keep the JSON dump independent of numpy scalar types.
            scores = {
                'roc_auc': float(roc_auc),
                'pr_auc': float(pr_auc),
                'gini': float(gini),
                'ks': float(ks)
            }
            with open(self.config.metric_file_name, "w") as f:
                json.dump(scores, f)
            mlflow.log_metrics(scores)

            # The model is registered under a name only when the tracking
            # store is not a local file store.
            if tracking_url_type_store != 'file':
                mlflow.sklearn.log_model(model, "model", registered_model_name="WeightOfEvidence+LogisticRegression")
            else:
                mlflow.sklearn.log_model(model, "model")

In [17]:
try:
    configuration_manager = ConfigurationManager()
    model_evaluation = ModelEvaluation(
        config=configuration_manager.get_model_evaluation_config()
    )
    model_evaluation.evaluate()
except Exception as e:
    # `logger.exception` records the traceback along with the message, so a
    # failure can be traced to the call that raised it; the bare message
    # alone (e.g. INVALID_PARAMETER_VALUE) does not say where it came from.
    logger.exception(e)

2024-06-06 09:27:26,987 - credit-scorecard-logger - INFO - yaml file: config.yaml loaded successfully
2024-06-06 09:27:27,005 - credit-scorecard-logger - INFO - yaml file: params.yaml loaded successfully
2024-06-06 09:27:27,005 - credit-scorecard-logger - INFO - yaml file: schema.yaml loaded successfully
2024-06-06 09:27:27,005 - credit-scorecard-logger - INFO - Created directory at: artifacts
2024-06-06 09:27:27,005 - credit-scorecard-logger - INFO - Created directory at: artifacts/model_evaluation
2024-06-06 09:27:27,638 - credit-scorecard-logger - ERROR - INVALID_PARAMETER_VALUE: Response: {'error_code': 'INVALID_PARAMETER_VALUE'}


# Testing
Restart the kernel, then run the cells below.

In [None]:
import os
# Move the working directory one level up; run once per kernel session.
# NOTE(review): presumably needed so `src` and the relative config paths
# resolve from the project root -- confirm against the repo layout.
os.chdir("..")

In [None]:
from src.utils.common import logger
from src.config.configuration_manager import ConfigurationManager
from src.models.model_training import ModelTraining

class ModelTrainingPipeline:
    """
    Pipeline stage that wires the configuration into `ModelTraining` and runs it.
    """
    # NOTE(review): this notebook develops the evaluation stage, yet this cell
    # drives the training stage -- confirm whether that is intended.

    def __init__(self):
        """
        Create the `ConfigurationManager` used by `run`.
        """
        self.configuration_manager = ConfigurationManager()

    def run(self):
        """
        Fetch the training configuration, then train the model.
        """
        training_config = self.configuration_manager.get_model_training_config()
        trainer = ModelTraining(config=training_config)
        trainer.train()

if __name__ == "__main__":
    STAGE_NAME = "Model Training Stage"
    try:
        logger.info(f">>>>>> {STAGE_NAME} Started <<<<<<")
        model_training_pipeline = ModelTrainingPipeline()
        model_training_pipeline.run()
        logger.info(f">>>>>> {STAGE_NAME} Completed <<<<<<")
    except Exception as e:
        # Log with the traceback, then re-raise so that a failed stage ends
        # with a non-zero exit status instead of looking like a success.
        logger.exception(e)
        raise