In [1]:
import os

In [2]:
%pwd

'c:\\Project\\end_to_end_project\\Flight_Fare_prediction\\research'

In [3]:
# Step up to the parent directory so that the `src.*` imports and the relative
# config/artifact paths used by the cells below resolve.
# Guarded so the cell is safe to re-run: an unconditional chdir would climb one
# more level on every execution. The presence of `src` in the current directory
# is used as the "already there" marker.
if not os.path.isdir('src'):
    os.chdir('../')

In [4]:
%pwd

'c:\\Project\\end_to_end_project\\Flight_Fare_prediction'

In [54]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTuningConfig:
    """Immutable bundle of settings for the tuning and training steps below.

    Instances are built by ``ConfigurationManager.get_model_train_config`` from
    the ``evaluate_model`` section of the project config plus the loaded
    params file.
    """
    # NOTE(review): the path fields are annotated ``Path`` but are filled
    # directly from the loaded config without conversion, so they may be plain
    # strings at runtime -- confirm against ``read_yaml``.

    # Output directory; created before use and used as the parent of
    # params_report.json, trained_model.pkl and results.json.
    root_dir: Path
    # CSV read with pandas; the last column is taken as the target.
    train_data_path: Path
    # CSV read with pandas; the last column is taken as the target.
    test_data_path: Path
    # Populated from the config; not read by the code in this notebook.
    params_report: Path
    # Populated from the config; not read by the code in this notebook.
    results_report: Path
    # JSON file whose 'best_model_name' entry selects the estimator to use.
    model_report: Path
    # The whole loaded params file; unpacked into the estimator's set_params().
    all_params: dict


In [6]:
from src.mlProject.constants import *
from src.mlProject.utils.common import read_yaml, create_directories

In [55]:
class ConfigurationManager:
    """Loads the project's YAML files and hands out stage-specific configs.

    On construction the config, params and schema files are read with
    ``read_yaml`` and the directory named by ``artifacts_root`` in the config
    is created via ``create_directories``.
    """

    def __init__(self,
                 config_filepath=CONFIG_FILE_PATH,
                 params_filepath=PARAMS_FILE_PATH,
                 schema_filepath=SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main config YAML.
            params_filepath: Location of the params YAML.
            schema_filepath: Location of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_dir = self.config.artifacts_root
        create_directories([artifacts_dir])

    def get_model_train_config(self) -> DataTuningConfig:
        """Build the ``DataTuningConfig`` from the ``evaluate_model`` section.

        Creates the section's ``root_dir`` as a side effect.

        Returns:
            DataTuningConfig: Paths copied from the config section, with
            ``all_params`` set to the entire loaded params file.
        """
        section = self.config.evaluate_model
        create_directories([section.root_dir])

        return DataTuningConfig(
            root_dir=section.root_dir,
            train_data_path=section.train_data_path,
            test_data_path=section.test_data_path,
            params_report=section.params_report,
            results_report=section.results_report,
            model_report=section.model_report,
            all_params=self.params,
        )

In [8]:
import sys
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from src.mlProject import logger
import pandas as pd
import json
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

In [56]:
class ModelTuningEvaluate:
    """Grid-searches hyperparameters for the model named in the model report.

    The model report is a JSON file whose ``'best_model_name'`` entry must be
    one of the keys of ``_PARAM_GRIDS``.
    """

    # Search space per supported model name. The grid has to match the
    # estimator: GridSearchCV rejects parameter names the estimator does not
    # accept, so a single shared grid cannot serve all three models.
    # The XGBRegressor grid is the one originally used here; other XGBoost
    # knobs that were tried and disabled: learning_rate, n_estimators,
    # max_depth, min_child_weight, subsample, colsample_bytree, gamma,
    # reg_alpha.
    # NOTE(review): the Random Forest and Ridge grids are starting points --
    # adjust to taste.
    _PARAM_GRIDS = {
        'Random Forest': {'n_estimators': [100, 200, 300]},
        'Ridge Regression': {'alpha': [0.1, 0.5, 1.0, 10.0]},
        'XGBRegressor': {'reg_lambda': [0, 0.1, 0.5, 1.0]},
    }

    def __init__(self, config: "DataTuningConfig"):
        """Store the configuration used by ``grid_search_cv``."""
        self.config = config

    def _build_estimator(self, model_name):
        """Return a fresh, default-configured estimator for ``model_name``.

        Raises:
            ValueError: If ``model_name`` is not a supported model.
        """
        if model_name == 'Random Forest':
            return RandomForestRegressor()
        if model_name == 'Ridge Regression':
            return Ridge()
        if model_name == 'XGBRegressor':
            return XGBRegressor()
        raise ValueError(
            f"Unsupported best_model_name {model_name!r}; "
            f"expected one of {sorted(self._PARAM_GRIDS)}"
        )

    def grid_search_cv(self):
        """Run a 5-fold grid search and persist the winning parameters.

        The winning parameters are merged into ``params_report.json`` inside
        ``config.root_dir`` (existing keys in that file are kept unless
        overwritten).

        Returns:
            tuple: ``(X_train, y_train, X_test, y_test, best_params)`` where
            features are all columns but the last of each CSV, targets are the
            last column, and ``best_params`` is GridSearchCV's ``best_params_``.

        Raises:
            ValueError: If the model report names an unsupported model.
        """
        # Resolve the estimator first so a bad model report fails before any
        # data is loaded or any fitting starts.
        with open(self.config.model_report, 'r') as file:
            model_report = json.load(file)
        model_name = model_report['best_model_name']
        best_model = self._build_estimator(model_name)

        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)

        # Last column is the target, everything before it is a feature.
        X_train = train_data.iloc[:, :-1]
        y_train = train_data.iloc[:, -1]
        X_test = test_data.iloc[:, :-1]
        y_test = test_data.iloc[:, -1]

        gs = GridSearchCV(best_model, self._PARAM_GRIDS[model_name], cv=5)
        gs.fit(X_train, y_train)

        best_params = gs.best_params_
        print(best_params)

        # NOTE(review): config.params_report is not used here; the file name
        # is fixed relative to root_dir -- confirm which one is intended.
        param_file_path = os.path.join(self.config.root_dir, "params_report.json")

        # Merge into whatever an earlier run left behind.
        if os.path.exists(param_file_path):
            with open(param_file_path, 'r') as file:
                existing_data = json.load(file)
        else:
            existing_data = {}
        existing_data.update(best_params)

        with open(param_file_path, 'w') as file:
            json.dump(existing_data, file)

        return X_train, y_train, X_test, y_test, best_params
        

    

    

In [60]:
class Training:
    """Fits the model named in the model report and records its metrics."""

    # Names accepted in the model report's 'best_model_name' entry.
    _SUPPORTED_MODELS = ('Random Forest', 'Ridge Regression', 'XGBRegressor')

    def __init__(self, config):
        """Store the configuration (root_dir, model_report, all_params are read)."""
        self.config = config

    def _build_estimator(self, model_name):
        """Return a fresh, default-configured estimator for ``model_name``.

        Raises:
            ValueError: If ``model_name`` is not a supported model.
        """
        if model_name == 'Random Forest':
            return RandomForestRegressor()
        if model_name == 'Ridge Regression':
            return Ridge()
        if model_name == 'XGBRegressor':
            return XGBRegressor()
        raise ValueError(
            f"Unsupported best_model_name {model_name!r}; "
            f"expected one of {list(self._SUPPORTED_MODELS)}"
        )

    def train_model(self, X_train, y_train, X_test, y_test):
        """Train, pickle and score the selected model.

        Writes ``trained_model.pkl`` and merges the metrics into
        ``results.json``, both inside ``config.root_dir``.

        Args:
            X_train: Training features.
            y_train: Training target.
            X_test: Test features.
            y_test: Test target.

        Raises:
            ValueError: If the model report names an unsupported model.
            Exception: Any other failure is reported on stdout and re-raised,
                so callers never continue with a missing or stale model.
        """
        try:
            model_filepath = os.path.join(self.config.root_dir, 'trained_model.pkl')
            # NOTE(review): config.results_report is not used here; the file
            # name is fixed relative to root_dir -- confirm which is intended.
            results_path = os.path.join(self.config.root_dir, "results.json")

            with open(self.config.model_report, 'r') as file:
                model_report = json.load(file)
            best_model = self._build_estimator(model_report['best_model_name'])

            # all_params must contain only parameters the chosen estimator
            # accepts; set_params rejects unknown names.
            best_model.set_params(**self.config.all_params)

            best_model.fit(X_train, y_train)
            print('Model Trained')

            # Save the trained model
            with open(model_filepath, 'wb') as file:
                pickle.dump(best_model, file)

            # Load existing results if available
            if os.path.exists(results_path):
                with open(results_path, 'r') as file:
                    existing_results = json.load(file)
            else:
                existing_results = {}

            y_train_pred = best_model.predict(X_train)
            y_test_pred = best_model.predict(X_test)

            new_results = {
                "train_data_r2_score": r2_score(y_train, y_train_pred),
                "test_data_r2_score": r2_score(y_test, y_test_pred),
                "test_data_mae": mean_absolute_error(y_test, y_test_pred),
                "test_data_mse": mean_squared_error(y_test, y_test_pred),
            }

            # Update existing results with new results
            existing_results.update(new_results)

            # Write updated results back to the file
            with open(results_path, 'w') as file:
                json.dump(existing_results, file)

            print(new_results)

        except Exception as e:
            # Keep the human-readable message, but let the failure propagate
            # instead of silently returning None.
            print(f"An error occurred during training: {e}")
            raise

In [58]:
# Tuning step: load the configuration, grid-search the selected model and keep
# the train/test split in the notebook namespace for the training cell.
# Any exception propagates unchanged.
config = ConfigurationManager()
data_evaluation_config = config.get_model_train_config()
data_validation = ModelTuningEvaluate(config=data_evaluation_config)
X_train, y_train, X_test, y_test, best_params = data_validation.grid_search_cv()
print(best_params)

[2024-02-02 09:28:14,043: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-02 09:28:14,048: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-02 09:28:14,055: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-02-02 09:28:14,058: INFO: common: created directory at: artifacts]
[2024-02-02 09:28:14,060: INFO: common: created directory at: artifacts/model]


{'reg_lambda': 1.0}
{'reg_lambda': 1.0}


In [61]:
# Training step: relies on X_train, y_train, X_test and y_test already being
# present in the notebook namespace. Any exception propagates unchanged.
config = ConfigurationManager()
data_evaluation_config = config.get_model_train_config()

# Build the trainer from the stage configuration, then fit and score.
training_instance = Training(data_evaluation_config)
training_instance.train_model(X_train, y_train, X_test, y_test)


[2024-02-02 09:29:33,540: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-02 09:29:33,548: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-02 09:29:33,557: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-02-02 09:29:33,560: INFO: common: created directory at: artifacts]
[2024-02-02 09:29:33,563: INFO: common: created directory at: artifacts/model]


Model Trained
{'train_data_r2_score': 0.890169521320519, 'test_data_r2_score': 0.8254706996903993, 'test_data_mae': 1232.20095800408, 'test_data_mse': 3638984.1474980656}
