In [1]:
import os

In [2]:
%pwd

'c:\\Users\\nikhil\\OneDrive\\Desktop\\ML Projects\\ipp\\research'

In [3]:
# Step from the notebook's own folder up to its parent directory so that
# relative paths used by later cells are resolved from there.
# NOTE: this is a relative move, so re-running the cell without restarting
# the kernel climbs one more level each time.
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\nikhil\\OneDrive\\Desktop\\ML Projects\\ipp'

In [5]:
from dataclasses import dataclass
from pathlib import Path

In [6]:
@dataclass(frozen=True)
class ModelTrainingConfig:
    """Immutable bundle of filesystem locations used by the model-training stage.

    Instances are frozen: assigning to a field after construction raises.
    """

    # Working directory of the training stage.
    root_dir: Path
    # Destination the selected model object is written to.
    model_path: Path
    # Array file with the prepared training data.
    prepared_train_data: Path
    # Array file with the prepared test data.
    prepared_test_data: Path
    # Location of the preprocessor object (not read by the training code shown here).
    preprocessor_obj_file_path: Path

In [7]:
from insurancePP.constants import *
from insurancePP.utils.common import read_yaml, create_directories, save_object, load_object

In [8]:
class ConfigurationManager:
    """Reads the project's YAML settings and turns them into typed config objects.

    On construction both YAML files are loaded with ``read_yaml`` and the
    directory named by ``artifacts_root`` in the main config is created via
    ``create_directories``.
    """

    def __init__(self, config_filepath = CONFIG_FILE_PATH, params_filepath = PARAMS_FILE_PATH):
        """Load the configuration and parameter files.

        Args:
            config_filepath: location of the main YAML config.
            params_filepath: location of the YAML parameters file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_training_configuration(self) -> ModelTrainingConfig:
        """Build the settings object for the model-training stage.

        Creates the stage's ``root_dir`` and returns a ``ModelTrainingConfig``
        populated from the ``model_trainer`` section of the main config.

        Returns:
            ModelTrainingConfig whose fields are ``pathlib.Path`` objects.
        """
        section = self.config.model_trainer
        create_directories([section.root_dir])

        # The dataclass declares every field as Path, so coerce the raw YAML
        # values here instead of leaving each consumer to wrap them again.
        return ModelTrainingConfig(
            root_dir=Path(section.root_dir),
            model_path=Path(section.model_path),
            prepared_train_data=Path(section.prepared_train_data),
            prepared_test_data=Path(section.prepared_test_data),
            preprocessor_obj_file_path=Path(section.preprocessor_obj_file_path),
        )
        

In [9]:
import os
import sys
import pandas as pd
import numpy as np
from numpy import load

from insurancePP.logging import logger

from insurancePP.constants import *
from insurancePP.utils.common import read_yaml, create_directories, save_object, load_object, evaluate_models

from dataclasses import dataclass
from catboost import CatBoostRegressor
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [37]:
class ModelTraining:
    """Fits a set of candidate regressors, picks the best scorer and saves it.

    The paths of the prepared arrays and of the model output file come from
    the ``ModelTrainingConfig`` given at construction time.
    """

    def __init__(self, config : ModelTrainingConfig):
        """Keep the stage configuration for later use."""
        self.config = config

    @staticmethod
    def _candidate_models() -> dict:
        """Return fresh, unfitted estimators keyed by their report name."""
        return {
            "Random Forest": RandomForestRegressor(),
            "Decision Tree": DecisionTreeRegressor(),
            "Gradient Boosting": GradientBoostingRegressor(),
            "Linear Regression": LinearRegression(),
            "XGBRegressor": XGBRegressor(),
            "CatBoosting Regressor": CatBoostRegressor(verbose=False),
            "AdaBoost Regressor": AdaBoostRegressor(),
        }

    @staticmethod
    def _search_spaces() -> dict:
        """Return the hyper-parameter grid for each candidate, keyed like the models."""
        return {
            "Decision Tree": {
                'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
            },
            "Random Forest":{
                'n_estimators': [8,16,32,64,128,256]
            },
            "Gradient Boosting":{
                # 'learning_rate':[.1,.01,.05,.001],
                'n_estimators': [8,16,32,64,128,256]
            },
            "Linear Regression":{},
            "XGBRegressor":{
                'learning_rate':[.1,.01,.05,.001],
                'n_estimators': [8,16,32,64,128,256]
            },
            "CatBoosting Regressor":{
                'depth': [6,8,10],
                'learning_rate': [0.01, 0.05, 0.1],
                'iterations': [30, 50, 100]
            },
            "AdaBoost Regressor":{
                'learning_rate':[.1,.01,0.5,.001],
                'n_estimators': [8,16,32,64,128,256]
            }
        }

    def initiate_model_trainer(self):
        """Train all candidates, save the best one and report its test score.

        Steps:
            1. Load the prepared train/test arrays with ``numpy.load``.
            2. Use the last column of each array as the target and the
               remaining columns as features.
            3. Hand the candidates and their grids to ``evaluate_models`` and
               log the resulting score table, best first.
            4. Save the top-scoring estimator to ``config.model_path``.
            5. Score that estimator on the test split, log and return the R2.

        Returns:
            float: R2 of the selected model on the test split.

        Raises:
            Whatever the loaders, ``evaluate_models``, ``save_object`` or the
            estimator raise; errors are not intercepted here.
        """
        logger.info("loading the training and testing data")
        train_arr = load(self.config.prepared_train_data)
        test_arr = load(self.config.prepared_test_data)

        logger.info("spliting the training and testing data")
        X_train, y_train = train_arr[:, :-1], train_arr[:, -1]
        X_test, y_test = test_arr[:, :-1], test_arr[:, -1]

        models = self._candidate_models()
        params = self._search_spaces()

        model_report : dict = evaluate_models(X_train, y_train, X_test, y_test, models, params)

        performance_df = pd.DataFrame(
            list(model_report.items()), columns=['Model Name', 'R2_Score']
        ).sort_values(by=["R2_Score"], ascending=False)
        logger.info(f'Model Report: \n {performance_df}')

        # First name with the highest score wins, matching the previous
        # keys/values index lookup on ties.
        best_model_name = max(model_report, key=model_report.get)
        # NOTE(review): assumes evaluate_models fits the estimators in `models`
        # in place, otherwise predict() below would fail — confirm in utils.common.
        best_model = models[best_model_name]

        save_object(
            file_path = Path(self.config.model_path),
            obj = best_model
        )

        predictions = best_model.predict(X_test)
        r2 = r2_score(y_test, predictions)
        # Previously this value was computed and silently dropped.
        logger.info(f"Best model: {best_model_name} | R2 on test data: {r2}")
        return r2

In [38]:
# Run the model-training stage end to end: load the configuration, build the
# trainer from it and start training.
# No try/except wrapper: a handler that only re-raises adds nothing, and any
# failure already surfaces here with its full traceback.
config = ConfigurationManager()
data_training_config = config.get_model_training_configuration()
model_trainer = ModelTraining(config = data_training_config)
model_trainer.initiate_model_trainer()

[2024-02-25 21:29:34,497 : INFO : common : yaml file: config\config.yaml loaded successfully]
[2024-02-25 21:29:34,501 : INFO : common : yaml file: params.yaml loaded successfully]
[2024-02-25 21:29:34,504 : INFO : common : directory artifacts created]
[2024-02-25 21:29:34,506 : INFO : common : directory artifacts/model_trainer created]
[2024-02-25 21:29:34,508 : INFO : 177425094 : loading the training and testing data]
[2024-02-25 21:29:34,513 : INFO : 177425094 : spliting the training and testing data]


[2024-02-25 21:30:42,646 : INFO : 177425094 : Model Report: 
               Model Name  R2_Score
5  CatBoosting Regressor  0.884958
2      Gradient Boosting  0.880510
6     AdaBoost Regressor  0.869562
4           XGBRegressor  0.868819
0          Random Forest  0.864245
3      Linear Regression  0.778228
1          Decision Tree  0.756036]
[2024-02-25 21:30:42,663 : INFO : common : Binary Object is stored]
