In [1]:
import os

In [2]:
%pwd

'c:\\Project\\end_to_end_project\\Flight_Fare_prediction\\research'

In [3]:
# Move from the notebook's `research/` folder up to the project root so the
# relative paths used later (config/, artifacts/, src/) resolve.
# Guarded so that re-running this cell does not climb one level higher each
# time: the move only happens while we are still inside `research`.
if os.path.basename(os.getcwd()) == "research":
    os.chdir('../')

In [4]:
%pwd

'c:\\Project\\end_to_end_project\\Flight_Fare_prediction'

In [5]:
# Point the MLflow client at the DagsHub-hosted tracking server.
os.environ["MLFLOW_TRACKING_URI"]="https://dagshub.com/manikandan-1928/Flight_Fare_prediction.mlflow"
os.environ["MLFLOW_TRACKING_USERNAME"]="manikandan-1928"

# SECURITY: the access token must never be written into the notebook.
# Supply MLFLOW_TRACKING_PASSWORD from the shell / a .env file / a secrets
# manager before starting the kernel. The token that used to be hardcoded
# here has been exposed and should be revoked and regenerated on DagsHub.
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    import warnings

    # Only warn (do not raise): the tuning cells below do not need MLflow;
    # only the final MLflow logging step requires the credential.
    warnings.warn(
        "MLFLOW_TRACKING_PASSWORD is not set; logging to the remote MLflow "
        "server will fail until it is provided via the environment.",
        stacklevel=2,
    )

In [6]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataTuningConfig:
    """Immutable settings bundle for the tuning / training / evaluation stage.

    Instances are built by ``ConfigurationManager.get_model_train_config``.
    NOTE(review): the fields are annotated as ``Path`` but are filled straight
    from the parsed YAML without conversion, so they are presumably plain
    strings at runtime -- confirm against ``read_yaml``.
    """

    # Stage output directory; created before use and used as the parent of
    # the params/results JSON files and the trained model file.
    root_dir: Path
    # CSV read with pandas; last column is used as the target.
    train_data_path: Path
    # CSV read with pandas; last column is used as the target.
    test_data_path: Path
    # Taken from the config file; not read by the code in this notebook.
    params_report: Path
    # Taken from the config file; not read by the code in this notebook.
    results_report: Path
    # JSON file that must contain a "best_model_name" key.
    model_report: Path
    # Contents of the params file; passed to `set_params` and `mlflow.log_params`.
    all_params: dict
    # URI handed to `mlflow.set_registry_uri`.
    mlflow_uri: str



In [7]:
from src.mlProject.constants import *
from src.mlProject.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    """Loads the project's YAML files and builds stage configuration objects.

    On construction the config, params and schema files are parsed with
    ``read_yaml`` and the artifacts root directory is created.
    """

    # Fallback used when MLFLOW_TRACKING_URI is not present in the environment.
    _DEFAULT_MLFLOW_URI = 'https://dagshub.com/manikandan-1928/Flight_Fare_prediction.mlflow'

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Parse the three YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: path of the main config YAML.
            params_filepath: path of the model-parameters YAML.
            schema_filepath: path of the data-schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_train_config(self) -> DataTuningConfig:
        """Build the ``DataTuningConfig`` from the ``evaluate_model`` section.

        The stage's ``root_dir`` is created as a side effect.

        Returns:
            DataTuningConfig: paths from the config file, the full params
            mapping, and the MLflow URI.
        """
        config = self.config.evaluate_model

        create_directories([config.root_dir])

        # Honour the tracking URI configured in the environment so the server
        # address is defined in one place; fall back to the project default
        # when the variable is missing or empty.
        mlflow_uri = os.environ.get("MLFLOW_TRACKING_URI") or self._DEFAULT_MLFLOW_URI

        return DataTuningConfig(
            root_dir=config.root_dir,
            train_data_path=config.train_data_path,
            test_data_path=config.test_data_path,
            params_report=config.params_report,
            results_report=config.results_report,
            model_report=config.model_report,
            all_params=self.params,
            mlflow_uri=mlflow_uri,
        )

In [9]:
import sys
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from src.mlProject import logger
import pandas as pd
import json
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import mlflow
from urllib.parse import urlparse


In [10]:
class ModelTuningEvaluate:
    """Grid-searches XGBRegressor hyperparameters and records the best ones."""

    def __init__(self, config: DataTuningConfig):
        """Store the stage configuration (paths and parameters)."""
        self.config = config

    def grid_search_cv(self):
        """Run a 5-fold ``GridSearchCV`` on the training CSV and persist the result.

        The last column of the training file is used as the target and all
        preceding columns as features. The best parameter combination is
        printed and merged into ``<root_dir>/params_report.json`` (keys already
        present in that file are kept unless overwritten by the new values).

        Any exception raised while reading data, fitting or writing the report
        propagates unchanged to the caller.
        """
        logger.info("Splitting the data to train and test sets")

        # Only the training split is needed: the search is scored by
        # cross-validation, so the test CSV is deliberately not loaded here.
        train_data = pd.read_csv(self.config.train_data_path)
        X_train = train_data.iloc[:, :-1]
        y_train = train_data.iloc[:, -1]

        estimator = XGBRegressor()
        logger.info("Setting the values for hyperparameter tuning")

        # Parameters are tuned one group at a time; the commented entries are
        # the other candidate grids.
        param_grid = {
            #'learning_rate': [0.001, 0.01, 0.1],
            #'n_estimators': [100, 200, 300, 400],
            #'max_depth': [3, 6, 9, 12],
            #'min_child_weight': [1, 3, 5],
            #'subsample': [0.8, 0.9, 1.0],
            #'colsample_bytree': [0.8, 0.9, 1.0],
            #'gamma': [0, 0.1, 0.2],
            #'reg_alpha': [0, 0.1, 0.5, 1.0],
            'reg_lambda': [0, 0.1, 0.5, 1.0],
        }

        logger.info("Applying GridSearchCV method")

        gs = GridSearchCV(estimator, param_grid, cv=5)
        gs.fit(X_train, y_train)

        best_params = gs.best_params_
        print(best_params)

        # NOTE(review): the file name is hardcoded rather than taken from
        # `self.config.params_report` -- confirm they are meant to be the same.
        param_file_path = os.path.join(self.config.root_dir, "params_report.json")

        # Merge with whatever earlier tuning rounds already stored.
        if os.path.exists(param_file_path):
            with open(param_file_path, 'r') as file:
                existing_data = json.load(file)
        else:
            existing_data = {}

        existing_data.update(best_params)

        with open(param_file_path, 'w') as file:
            json.dump(existing_data, file)

        logger.info("Best params are saved")
        

In [16]:
class Training:
    """Trains the selected regressor, evaluates it and logs the run to MLflow."""

    def __init__(self, config):
        """Store the stage configuration (paths, parameters, MLflow URI)."""
        self.config = config

    def train_model(self):
        """Fit the model named in the model report and evaluate it.

        The last column of the train/test CSVs is the target; the remaining
        columns are the features. The estimator is picked from the
        ``"best_model_name"`` entry of the model-report JSON, configured with
        ``config.all_params``, fitted, and pickled to
        ``<root_dir>/trained_model.h5`` and ``model.h5`` in the current
        directory (despite the extension these are pickle files, not HDF5).
        Metrics are merged into ``<root_dir>/results.json``.

        Returns:
            tuple: ``(new_results, best_model)`` where ``new_results`` maps
            metric names to floats and ``best_model`` is the fitted estimator.

        Raises:
            ValueError: if ``best_model_name`` is not one of the supported
                names ('Random Forest', 'Linear Regression', 'XGBRegressor').
        """
        train_data = pd.read_csv(self.config.train_data_path)
        test_data = pd.read_csv(self.config.test_data_path)

        X_train = train_data.iloc[:, :-1]
        y_train = train_data.iloc[:, -1]

        X_test = test_data.iloc[:, :-1]
        y_test = test_data.iloc[:, -1]

        model_filepath = os.path.join(self.config.root_dir, 'trained_model.h5')

        # The report written by the model-selection stage names the winner.
        with open(self.config.model_report, 'r') as file:
            data = json.load(file)

        best_model_name = data["best_model_name"]
        print(best_model_name)

        if best_model_name == 'Random Forest':
            best_model = RandomForestRegressor()
        elif best_model_name == 'Linear Regression':
            best_model = LinearRegression()
        elif best_model_name == 'XGBRegressor':
            best_model = XGBRegressor()
        else:
            # Fail here with a clear message instead of hitting an
            # UnboundLocalError on `best_model` further down.
            raise ValueError(
                f"Unsupported best_model_name {best_model_name!r} in "
                f"{self.config.model_report}; expected 'Random Forest', "
                "'Linear Regression' or 'XGBRegressor'"
            )

        logger.info("Best model is chosen")

        logger.info("Setting the best params to the chosen model")

        best_model.set_params(**self.config.all_params)
        logger.info("Model trainig is trained with the best parameters")

        best_model.fit(X_train, y_train)
        print('Model Trained')

        # Persist the fitted model: once inside the stage directory and once
        # in the working directory.
        with open(model_filepath, 'wb') as file:
            pickle.dump(best_model, file)

        with open('model.h5', 'wb') as file:
            pickle.dump(best_model, file)

        logger.info("Training completed and model saved")

        # NOTE(review): file name is hardcoded rather than taken from
        # `self.config.results_report` -- confirm they are meant to match.
        results_path = os.path.join(self.config.root_dir, "results.json")
        if os.path.exists(results_path):
            with open(results_path, 'r') as file:
                existing_results = json.load(file)
        else:
            existing_results = {}

        logger.info("Model prediction for evaluation")

        y_train_pred = best_model.predict(X_train)
        y_test_pred = best_model.predict(X_test)
        logger.info("Metrics calculation beginning")

        # Cast to built-in floats so the values serialise cleanly to JSON and
        # MLflow regardless of the numeric type the metric functions return.
        new_results = {
            "train_data_r2_score": float(r2_score(y_train, y_train_pred)),
            "test_data_r2_score": float(r2_score(y_test, y_test_pred)),
            "test_data_mae": float(mean_absolute_error(y_test, y_test_pred)),
            "test_data_mse": float(mean_squared_error(y_test, y_test_pred)),
        }

        # Keep metrics from earlier runs that this run does not overwrite.
        existing_results.update(new_results)

        with open(results_path, 'w') as file:
            json.dump(existing_results, file)

        logger.info("Evaluation finished and saved results")

        return new_results, best_model

    def log_into_mlflow(self, results, best_model):
        """Log parameters, metrics and the fitted model to MLflow.

        Args:
            results: mapping of metric name to numeric value.
            best_model: the fitted estimator returned by ``train_model``.

        The model is logged with the XGBoost flavour when it is an
        ``XGBRegressor`` and with the scikit-learn flavour otherwise. When the
        tracking store is not a local file store, the model is also registered
        under its class name (``"XGBRegressor"`` for the XGBoost model).
        """
        mlflow.set_registry_uri(self.config.mlflow_uri)
        # NOTE(review): only the registry URI is set from config; the tracking
        # URI comes from the MLflow client's own configuration (e.g. the
        # MLFLOW_TRACKING_URI environment variable).
        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

        # Pick the MLflow flavour that matches the estimator; logging a
        # scikit-learn model through the xgboost flavour would be wrong.
        if isinstance(best_model, XGBRegressor):
            log_model = mlflow.xgboost.log_model
        else:
            log_model = mlflow.sklearn.log_model
        registered_name = type(best_model).__name__

        with mlflow.start_run():
            mlflow.log_params(self.config.all_params)
            mlflow.log_metrics(results)

            if tracking_url_type_store != "file":
                # The Model Registry is not available with a file store.
                log_model(best_model, "model", registered_model_name=registered_name)
            else:
                log_model(best_model, "model")

                
    

In [18]:
# Hyperparameter tuning run: build the stage config and execute the grid search.
# The former `try/except Exception as e: raise e` wrapper only re-raised the
# same exception, so it is dropped; errors surface exactly as before.
config = ConfigurationManager()
data_evaluation_config = config.get_model_train_config()
data_validation = ModelTuningEvaluate(config=data_evaluation_config)
data_validation.grid_search_cv()

[2024-02-02 18:39:22,833: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-02 18:39:22,844: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-02 18:39:22,858: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-02-02 18:39:22,863: INFO: common: created directory at: artifacts]
[2024-02-02 18:39:22,870: INFO: common: created directory at: artifacts/model]


{'reg_lambda': 1.0}


In [18]:
# Training run: fit the chosen model, evaluate it, then log the run to MLflow.
# The former `try/except Exception as e: raise e` wrapper only re-raised the
# same exception, so it is dropped; errors surface exactly as before.
config = ConfigurationManager()
data_evaluation_config = config.get_model_train_config()

# Create an instance of the Training class
training_instance = Training(data_evaluation_config)

# Train and evaluate, then push params, metrics and the model to MLflow
results, model = training_instance.train_model()
training_instance.log_into_mlflow(results, model)


[2024-02-03 23:48:58,115: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-02-03 23:48:58,123: INFO: common: yaml file: params.yaml loaded successfully]
[2024-02-03 23:48:58,131: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-02-03 23:48:58,139: INFO: common: created directory at: artifacts]
[2024-02-03 23:48:58,155: INFO: common: created directory at: artifacts/model]


XGBRegressor
[2024-02-03 23:48:58,904: INFO: 270427470: Best model is chosen]
[2024-02-03 23:48:58,912: INFO: 270427470: Setting the best params to the chosen model]
[2024-02-03 23:48:58,912: INFO: 270427470: Model trainig is trained with the best parameters]
Model Trained
[2024-02-03 23:49:01,275: INFO: 270427470: Training completed and model saved]
[2024-02-03 23:49:01,275: INFO: 270427470: Model prediction for evaluation]
[2024-02-03 23:49:01,396: INFO: 270427470: Metrics calculation beginning]
[2024-02-03 23:49:01,412: INFO: 270427470: Evaluation finished and saved results]


Registered model 'XGBRegressor' already exists. Creating a new version of this model...
2024/02/03 23:49:28 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation.                     Model name: XGBRegressor, version 13
Created version '13' of model 'XGBRegressor'.
