In [1]:
import os

In [2]:
%pwd

'c:\\Project\\end_to_end_project\\Flight_Fare_prediction\\research'

In [3]:
os.chdir('../')

In [4]:
%pwd

'c:\\Project\\end_to_end_project\\Flight_Fare_prediction'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataModelConfig:
    root_dir: Path
    data_path: Path
    model_path: Path


In [6]:
from src.mlProject.constants import *
from src.mlProject.utils.common import read_yaml, create_directories, evaluate_models

In [7]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    
    def get_data_model_config(self) -> DataModelConfig:

        print('Done')
        config = self.config.data_model

        create_directories([config.root_dir])

        data_model_config = DataModelConfig(
            root_dir=config.root_dir,
            data_path = config.data_path,
            model_path = config.model_path,
        )
        return data_model_config

In [8]:
import sys
from sklearn.ensemble import (
    RandomForestRegressor,
)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from src.mlProject import logger
from sklearn.model_selection import train_test_split
import pandas as pd
import json
import pickle

In [9]:
class ModelTrainer:
    def __init__(self, data_model_config: DataModelConfig):
        self.model_trainer_config = data_model_config  # Use the passed DataModelConfig

    def initiate_model_trainer(self):
        try:
            logger.info("Split training and test input data")

            transformed_data = pd.read_csv(self.model_trainer_config.data_path)

            print('Done')

            train, test = train_test_split(transformed_data, test_size=0.2, random_state=42)

            X_train = train.iloc[:, :-1]
            y_train = train.iloc[:, -1]

            X_test = test.iloc[:, :-1]
            y_test = test.iloc[:, -1]
            
            train_path = os.path.join(self.model_trainer_config.root_dir, "training_data.csv")

            train.to_csv(train_path, index=False)

            test_path = os.path.join(self.model_trainer_config.root_dir, "testing_data.csv")

            test.to_csv(test_path, index=False)


            models = {
                "Random Forest": RandomForestRegressor(),
                "Linear Regression": LinearRegression(),
                "XGBRegressor": XGBRegressor(),
            }

            result = {}

            model_report = evaluate_models(X_train, y_train, X_test, y_test, models)
            # To get the best model score from dict
            best_model_score = max(sorted(model_report.values()))

            # To get the best model name from dict
            best_model_name = list(model_report.keys())[
                list(model_report.values()).index(best_model_score)
            ]
            print(model_report)

            result["model_report"] = model_report
            result["best_model_score"] = best_model_score
            result["best_model_name"] = best_model_name

            if best_model_score < 0.6:
                e = 'No best model found'
                raise e
            logger.info(f"Best found model on both training and testing dataset")

            print(self.model_trainer_config.model_path)

            
            os.makedirs(self.model_trainer_config.model_path, exist_ok=True)

            # Save the evaluation report as a JSON file in the same directory
            report_file_path = os.path.join(self.model_trainer_config.model_path, "report.json")
            with open(report_file_path, "w") as file:
                json.dump(result, file, indent=4)

            logger.info(f"Report saved at {report_file_path}")

        except Exception as e:
            raise e

In [10]:

try:
    config = ConfigurationManager()
    data_model_config = config.get_data_model_config()
    training = ModelTrainer(data_model_config)  
    training.initiate_model_trainer()

except Exception as e:
    raise e


[2024-02-02 16:45:52,125: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-02 16:45:52,135: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-02 16:45:52,146: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-02-02 16:45:52,151: INFO: common: created directory at: artifacts]
Done
[2024-02-02 16:45:52,159: INFO: common: created directory at: artifacts/data_cleaning]
DOne
[2024-02-02 16:45:52,163: INFO: 1421504067: Split training and test input data]


Done
RandomForestRegressor()
LinearRegression()
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)
{'Random Forest': 0.8098554736094523, 'Linear Regression': 0.5708953540843202, 'XGBRegressor': 0.8322409007304338}
[2024-02-02 16:46:05,084: INFO: 1421504067: Best found model on both training and testing 