# Model Training Pipeline Development
- Author: Marcellinus Aditya Witarsah
- Date: 05 June 2024

In [17]:
# Imports
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import logging
import time
import pickle
import os
from pathlib import Path
from abc import ABC
from abc import abstractmethod
from scipy import stats
from typing import Tuple
from typing import Union
from dataclasses import dataclass
from src.utils.common import logger
from src.utils.common import read_yaml, create_directories
from src.constants import CONFIG_FILE_PATH, SCHEMA_FILE_PATH, PARAMS_FILE_PATH
from sklearn.linear_model import LogisticRegression
from optbinning import Scorecard
from optbinning import BinningProcess

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Run once only: each execution moves the working directory up one level,
# so re-running this cell would move it up again.
os.chdir("..")

# Configuration

In [20]:
# src/entities/config_entity.py
@dataclass(frozen=True)
class ModelTrainingConfig:
    """
    Immutable bundle of settings used by the model training stage.

    Instances are frozen: fields cannot be reassigned after construction.

    Attributes:
        root_dir (Path): Directory that holds the model training artifacts.
        train_data_path (Path): Location of the training data file.
        model_path (Path): Destination of the trained model file.
        target_column (str): Name of the column holding the target.
        BinningProcess (dict): Settings for the binning process.
        LogisticRegression (dict): Settings for the logistic regression.
        Scorecard (dict): Settings for the scorecard.
    """
    # --- filesystem locations ---
    root_dir: Path
    train_data_path: Path
    model_path: Path

    # --- data schema ---
    target_column: str

    # --- per-component settings (named after the component they configure) ---
    BinningProcess: dict
    LogisticRegression: dict
    Scorecard: dict

# src/config/configuration_manager.py
class ConfigurationManager:
    """
    Load the pipeline's YAML files and build stage-specific configuration.

    The three YAML files (configuration, parameters, schema) are read once at
    construction time and kept on the instance for the getter methods.

    Attributes:
        config (dict): Parsed configuration file content.
        params (dict): Parsed parameters file content.
        schema (dict): Parsed schema file content.
    """
    def __init__(
        self,
        config_filepath: str = CONFIG_FILE_PATH,
        params_filepath: str = PARAMS_FILE_PATH,
        schema_filepath: str = SCHEMA_FILE_PATH
    ):
        """
        Read the YAML files and create the artifacts root directory.

        Args:
            config_filepath (str): File path to the configuration YAML file.
            params_filepath (str): File path to the parameters YAML file.
            schema_filepath (str): File path to the schema YAML file.
        """
        # Attribute name -> file to load; read in config, params, schema order.
        yaml_sources = (
            ("config", config_filepath),
            ("params", params_filepath),
            ("schema", schema_filepath),
        )
        for attribute_name, yaml_path in yaml_sources:
            setattr(self, attribute_name, read_yaml(Path(yaml_path)))

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_model_training_config(self) -> ModelTrainingConfig:
        """
        Build the configuration object for the model training stage.

        Creates the stage's root directory as a side effect.

        Returns:
            ModelTrainingConfig: Configuration for model training.
        """
        stage_section = self.config.model_training
        all_params = self.params

        create_directories([stage_section.root_dir])

        # Paths come from the config file; the target column and the
        # per-component settings come from the parameters file.
        return ModelTrainingConfig(
            root_dir=stage_section.root_dir,
            train_data_path=stage_section.train_data_path,
            model_path=stage_section.model_path,
            target_column=all_params.data_preprocessing.split_data.target_column,
            BinningProcess=all_params.BinningProcess,
            LogisticRegression=all_params.LogisticRegression,
            Scorecard=all_params.Scorecard,
        )


In [21]:
# Smoke check: build the manager from the default YAML paths and show the
# resulting model training configuration as the cell output.
configuration_manager = ConfigurationManager()
configuration_manager.get_model_training_config()

2024-06-05 15:58:54,109 - credit-scorecard-logger - INFO - yaml file: config.yaml loaded successfully
2024-06-05 15:58:54,113 - credit-scorecard-logger - INFO - yaml file: params.yaml loaded successfully
2024-06-05 15:58:54,116 - credit-scorecard-logger - INFO - yaml file: schema.yaml loaded successfully
2024-06-05 15:58:54,117 - credit-scorecard-logger - INFO - Created directory at: artifacts
2024-06-05 15:58:54,119 - credit-scorecard-logger - INFO - Created directory at: artifacts/model_training


ModelTrainingConfig(root_dir='artifacts/model_training', train_data_path='artifacts/data_preprocessing/train.csv', model_path='artifacts/model_training/model.pkl', target_column='loan_status', BinningProcess=ConfigBox({'categorical_variables': ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'], 'selection_criteria': {'iv': {'min': 0.02, 'max': 1}}}), LogisticRegression=ConfigBox({'random_state': 42}), Scorecard=ConfigBox({'scaling_method': 'pdo_odds', 'scaling_method_params': {'pdo': 20, 'odds': 1, 'scorecard_points': 500}, 'intercept_based': True}))

# Model Training

In [22]:
from sklearn.model_selection import train_test_split
import pandas as pd
from pathlib import Path
import logging
from typing import Tuple

# src/models/model_training.py
class ModelTraining:
    """
    Fit a scorecard model on the training data and save it.
    """

    def __init__(self, config: ModelTrainingConfig):
        """
        Store the configuration used by `train`.

        Args:
            config (ModelTrainingConfig): Configuration for model training.
        """
        self.config = config

    def train(self) -> None:
        """
        Read the training CSV, fit the scorecard and save it to `model_path`.
        """
        logger.info("Train model")
        settings = self.config
        frame = pd.read_csv(settings.train_data_path)

        # Split into predictors and target
        features = frame.drop(columns=[settings.target_column])
        target = frame[settings.target_column]

        # Scorecard = binning over every predictor column + logistic regression,
        # each configured from its own settings section.
        model = Scorecard(
            binning_process=BinningProcess(
                features.columns.values,
                **settings.BinningProcess
            ),
            estimator=LogisticRegression(**settings.LogisticRegression),
            **settings.Scorecard
        )

        # Fit, then save
        model.fit(features, target)
        model.save(settings.model_path)

In [24]:
# Run the model training stage end to end from the notebook.
try:
    configuration_manager = ConfigurationManager()
    model_training = ModelTraining(
        config=configuration_manager.get_model_training_config()
    )
    model_training.train()
except Exception as e:
    # Log with the full traceback, then re-raise so a failed run is not
    # mistaken for a successful one.
    logger.exception(e)
    raise

2024-06-05 16:00:31,292 - credit-scorecard-logger - INFO - yaml file: config.yaml loaded successfully
2024-06-05 16:00:31,296 - credit-scorecard-logger - INFO - yaml file: params.yaml loaded successfully
2024-06-05 16:00:31,300 - credit-scorecard-logger - INFO - yaml file: schema.yaml loaded successfully
2024-06-05 16:00:31,301 - credit-scorecard-logger - INFO - Created directory at: artifacts
2024-06-05 16:00:31,302 - credit-scorecard-logger - INFO - Created directory at: artifacts/model_training
2024-06-05 16:00:31,303 - credit-scorecard-logger - INFO - Train model


# Testing
Restart the kernel, then run the cells below.

In [3]:
import os
# Move the working directory up one level; run once per kernel session,
# since each execution moves up again.
os.chdir("..")

In [5]:
from src.utils.common import logger
from src.config.configuration_manager import ConfigurationManager
from src.models.model_training import ModelTraining

class ModelTrainingPipeline:
    """
    Pipeline stage that wires configuration into model training.
    """

    def __init__(self):
        """
        Create the configuration manager used by `run`.
        """
        self.configuration_manager = ConfigurationManager()

    def run(self):
        """
        Build the model training configuration and train the model.
        """
        training_config = self.configuration_manager.get_model_training_config()
        trainer = ModelTraining(config=training_config)
        trainer.train()

# Entry point: run the model training stage and log its start and completion.
if __name__ == "__main__":
    STAGE_NAME = "Model Training Stage"
    try:
        logger.info(f">>>>>> {STAGE_NAME} Started <<<<<<")
        model_training_pipeline = ModelTrainingPipeline()
        model_training_pipeline.run()
        logger.info(f">>>>>> {STAGE_NAME} Completed <<<<<<")
    except Exception as e:
        # Log with the full traceback and re-raise: swallowing the error here
        # would let a failed stage exit as if it had succeeded.
        logger.exception(e)
        raise

2024-06-05 16:13:33,295 - credit-scorecard-logger - INFO - >>>>>> Model Training Stage Started <<<<<<
2024-06-05 16:13:33,301 - credit-scorecard-logger - INFO - yaml file: config.yaml loaded successfully
2024-06-05 16:13:33,308 - credit-scorecard-logger - INFO - yaml file: params.yaml loaded successfully
2024-06-05 16:13:33,312 - credit-scorecard-logger - INFO - yaml file: schema.yaml loaded successfully
2024-06-05 16:13:33,314 - credit-scorecard-logger - INFO - Created directory at: artifacts
2024-06-05 16:13:33,315 - credit-scorecard-logger - INFO - Created directory at: artifacts/model_training
2024-06-05 16:13:33,317 - credit-scorecard-logger - INFO - Train model
2024-06-05 16:13:34,406 - credit-scorecard-logger - INFO - >>>>>> Model Training Stage Completed <<<<<<
