In [1]:
import os

In [2]:
# Show the kernel's current working directory before any chdir (IPython line magic).
%pwd

'e:\\projects\\Delivery-time-prediction-for-food-devlivery-industry\\research'

In [3]:
# Step up from research/ to the project root so the relative paths used by the
# config (e.g. the artifacts directory) resolve from there.
# Guarded on the directory name so that re-running this cell is a no-op instead of
# climbing one more level on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
# Confirm the working directory after the chdir above.
%pwd

'e:\\projects\\Delivery-time-prediction-for-food-devlivery-industry'

In [91]:
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable bundle of settings consumed by the model-evaluation stage.

    The field order below is part of the interface (positional construction).
    NOTE(review): fields are annotated as Path, but the values are passed through
    from the parsed YAML unchanged — confirm whether they arrive as str.
    """

    # Working directory of the evaluation stage (created by the configuration manager).
    root_dir: Path
    # Directory holding the transformed train/test CSV files.
    data_input_dir: Path
    # Location of the persisted model that is loaded with joblib.
    model_path: Path
    # JSON file the evaluation stage writes its run information to.
    metric_file: Path
    # Model hyper-parameter section taken from the params file.
    all_params: dict
    # Preprocessing artifacts location that gets logged to MLflow.
    prepro_dir: Path


In [92]:
from pathlib import Path

# Raw strings keep the Windows backslashes literal. In a normal string literal
# "\p", "\D", "\c" and "\s" are invalid escape sequences, which Python reports
# with a warning today and plans to reject in a future version.
# NOTE(review): these are machine-specific absolute paths — consider deriving them
# from the project root so the notebook runs on other machines.
CONFIG_FILE_PATH = Path(r"E:\projects\Delivery-time-prediction-for-food-devlivery-industry\config\config.yaml")
PARAMS_FILE_PATH = Path(r"E:\projects\Delivery-time-prediction-for-food-devlivery-industry\params.yaml")
SCHEMA_FILE_PATH = Path(r"E:\projects\Delivery-time-prediction-for-food-devlivery-industry\schema.yaml")

In [93]:
from Deliveryprediction.constants import *
from Deliveryprediction.utils.common import read_yaml, create_directories


In [94]:
class ConfigurationManager:
    """Loads the project's YAML files and builds stage-specific config objects."""

    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH,
        schema_filepath=SCHEMA_FILE_PATH,
    ):
        """Parse the three YAML files and create the top-level artifacts directory.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the model-parameters YAML.
            schema_filepath: Path of the schema YAML (loaded and stored on the instance).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # The artifacts root is created up front, before any stage config is requested.
        create_directories([self.config.artifacts_root])

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Build the evaluation-stage config and create that stage's root directory.

        Returns:
            A ModelEvaluationConfig filled from the ``model_evaluation`` section of
            the config file plus the ``LightGBM`` section of the params file.
        """
        eval_section = self.config.model_evaluation
        lightgbm_params = self.params.LightGBM

        create_directories([eval_section.root_dir])

        return ModelEvaluationConfig(
            root_dir=eval_section.root_dir,
            data_input_dir=eval_section.data_input_dir,
            model_path=eval_section.model_path,
            metric_file=eval_section.metric_file,
            all_params=lightgbm_params,
            prepro_dir=eval_section.prepro_dir,
        )

In [98]:
import pandas as pd
import joblib
from Deliveryprediction import logger
import mlflow
import dagshub
from pathlib import Path
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
import json

class ModelEvaluation:
    """Scores a persisted regressor on the transformed train/test CSVs and records
    the metrics, datasets, model and artifacts in an MLflow run tracked on DagsHub.
    """

    # Name of the label column in the transformed CSV files.
    TARGET_COLUMN = "time_taken"
    # Upper bound on the number of training rows used to infer the MLflow signature.
    SIGNATURE_SAMPLE_SIZE = 20

    def __init__(self, logger, repo_owner, repo_name, experiment_name, config: "ModelEvaluationConfig"):
        """Resolve all paths from ``config`` and initialize DagsHub/MLflow tracking.

        Args:
            logger: Logger-like object exposing ``info`` and ``error``.
            repo_owner: DagsHub repository owner.
            repo_name: DagsHub repository name.
            experiment_name: MLflow experiment that runs are recorded under.
            config: Evaluation-stage settings (directories, model path, metric file).
        """
        self.config = config
        self.logger = logger
        # Single source of truth for the label column: the class constant.
        self.target_column = self.TARGET_COLUMN
        self.root_dir = Path(self.config.root_dir)
        self.root_path = Path(self.config.data_input_dir)
        self.prepro_dir = Path(self.config.prepro_dir)
        self.train_data_path = self.root_path / "train_trans.csv"
        self.test_data_path = self.root_path / "test_trans.csv"
        self.model_path = Path(self.config.model_path)
        self.metric_path = Path(self.config.metric_file)

        # Initialize Dagshub and MLflow
        dagshub.init(repo_owner=repo_owner, repo_name=repo_name, mlflow=True)
        mlflow.set_tracking_uri(f"https://dagshub.com/{repo_owner}/{repo_name}.mlflow")
        mlflow.set_experiment(experiment_name)

    def load_data(self, data_path: Path) -> pd.DataFrame:
        """Read a CSV file into a DataFrame.

        Returns:
            The loaded DataFrame, or ``None`` (after logging an error) when the
            file does not exist.
        """
        try:
            df = pd.read_csv(data_path)
            self.logger.info(f"Data loaded successfully from {data_path}")
            return df
        except FileNotFoundError:
            self.logger.error(f"File not found: {data_path}")
            return None

    def split_data(self, data: pd.DataFrame):
        """Split a frame into features and target.

        Returns:
            ``(X, y)`` where ``X`` is ``data`` without the target column and ``y``
            is the target column.
        """
        X = data.drop(columns=[self.target_column])
        y = data[self.target_column]
        return X, y

    def load_model(self, model_path: Path):
        """Load a joblib-persisted model.

        Returns:
            The deserialized model, or ``None`` (after logging an error) when the
            file does not exist.
        """
        try:
            model = joblib.load(model_path)
            self.logger.info("Model loaded successfully")
            return model
        except FileNotFoundError:
            self.logger.error(f"Model file not found: {model_path}")
            return None

    def save_model_info(self, metric_path: Path, run_id, artifact_path, model_name):
        """Write the run id, artifact location and model name to a JSON file.

        The parent directory of ``metric_path`` is created when missing so the
        write does not fail on a fresh checkout.
        """
        info_dict = {"run_id": run_id, "artifact_path": artifact_path, "model_name": model_name}
        metric_path = Path(metric_path)
        metric_path.parent.mkdir(parents=True, exist_ok=True)
        with open(metric_path, "w") as f:
            json.dump(info_dict, f, indent=4)
        self.logger.info("Model information saved")

    def evaluate_model(self, model, X_train, y_train, X_test, y_test):
        """Compute train/test MAE and R2 plus a 5-fold cross-validated MAE.

        Returns:
            ``(train_mae, test_mae, train_r2, test_r2, mean_cv_score, cv_scores)``.
            ``cv_scores`` are the raw negated-MAE scores from ``cross_val_score``;
            ``mean_cv_score`` is their mean with the sign flipped back.
        """
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        train_mae = mean_absolute_error(y_train, y_train_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)

        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_mean_absolute_error", n_jobs=-1)
        # The scorer returns negated MAE, so negate the mean to report a positive error.
        mean_cv_score = -cv_scores.mean()

        self.logger.info("Model evaluation completed")
        return train_mae, test_mae, train_r2, test_r2, mean_cv_score, cv_scores

    def log_metrics_to_mlflow(self, model, train_mae, test_mae, train_r2, test_r2, mean_cv_score, cv_scores, X_train, train_data, test_data, root_path):
        """Record parameters, metrics, datasets, the model and artifacts in one MLflow run.

        Args:
            root_path: Unused; kept so existing callers keep working.

        Returns:
            ``(run_id, artifact_uri)`` of the run that was created.
        """
        with mlflow.start_run() as run:
            mlflow.set_tag("model", "Food Delivery Time Regressor")
            mlflow.log_params(model.get_params())
            mlflow.log_metric("train_mae", train_mae)
            mlflow.log_metric("test_mae", test_mae)
            mlflow.log_metric("train_r2", train_r2)
            mlflow.log_metric("test_r2", test_r2)
            mlflow.log_metric("mean_cv_score", mean_cv_score)
            # Per-fold scores are negated MAE; flip the sign before logging.
            mlflow.log_metrics({f"CV {num}": -score for num, score in enumerate(cv_scores)})

            train_data_input = mlflow.data.from_pandas(train_data, targets=self.target_column)
            test_data_input = mlflow.data.from_pandas(test_data, targets=self.target_column)
            mlflow.log_input(dataset=train_data_input, context="training")
            mlflow.log_input(dataset=test_data_input, context="validation")

            # Sample once (not twice) and never ask for more rows than exist:
            # DataFrame.sample raises when n exceeds the population.
            sample_rows = min(self.SIGNATURE_SAMPLE_SIZE, len(X_train))
            signature_input = X_train.sample(sample_rows, random_state=42)
            model_signature = mlflow.models.infer_signature(signature_input, model.predict(signature_input))
            mlflow.sklearn.log_model(model, "delivery_time_pred_model", signature=model_signature)

            mlflow.log_artifact(self.root_dir / "models" / "stacking_regressor.joblib")
            mlflow.log_artifact(self.root_dir / "models" / "power_transformer.joblib")
            mlflow.log_artifact(self.prepro_dir)

            artifact_uri = mlflow.get_artifact_uri()
            self.logger.info("MLflow logging complete")
            return run.info.run_id, artifact_uri

    def run(self):
        """Run the whole evaluation: load, score, log to MLflow, write the run-info JSON.

        Raises:
            FileNotFoundError: If the train/test CSV or the model file is missing.
                The loaders only log and return ``None``, which would otherwise
                surface later as an unrelated ``AttributeError``.
        """
        train_data = self.load_data(self.train_data_path)
        test_data = self.load_data(self.test_data_path)
        missing = [
            str(path)
            for path, frame in ((self.train_data_path, train_data), (self.test_data_path, test_data))
            if frame is None
        ]
        if missing:
            raise FileNotFoundError(f"Cannot evaluate: missing data file(s): {', '.join(missing)}")

        X_train, y_train = self.split_data(train_data)
        X_test, y_test = self.split_data(test_data)

        model = self.load_model(self.model_path)
        if model is None:
            raise FileNotFoundError(f"Cannot evaluate: model file not found: {self.model_path}")

        train_mae, test_mae, train_r2, test_r2, mean_cv_score, cv_scores = self.evaluate_model(model, X_train, y_train, X_test, y_test)

        run_id, artifact_uri = self.log_metrics_to_mlflow(model, train_mae, test_mae, train_r2, test_r2, mean_cv_score, cv_scores, X_train, train_data, test_data, self.root_path)

        save_json_path = self.metric_path
        self.save_model_info(save_json_path, run_id, artifact_uri, "delivery_time_pred_model")


In [99]:
# Build the evaluation settings from the YAML files, then wire up the evaluator.
# Constructing ModelEvaluation also initializes DagsHub/MLflow tracking.
config = ConfigurationManager()
model_config = config.get_model_evaluation_config()
model_final = ModelEvaluation(
    logger=logger,
    repo_owner='onkar-git',
    repo_name='Delivery-time-prediction-for-food-delivery-industry',
    experiment_name='delivery_prediction_experiemnt_1',
    config=model_config,
)

[2025-02-18 16:54:16,572: INFO: common: yaml file: E:\projects\Delivery-time-prediction-for-food-devlivery-industry\config\config.yaml loaded successfully]
[2025-02-18 16:54:16,578: INFO: common: yaml file: E:\projects\Delivery-time-prediction-for-food-devlivery-industry\params.yaml loaded successfully]
[2025-02-18 16:54:16,586: INFO: common: yaml file: E:\projects\Delivery-time-prediction-for-food-devlivery-industry\schema.yaml loaded successfully]
[2025-02-18 16:54:16,589: INFO: common: created directory at: artifacts]
[2025-02-18 16:54:16,591: INFO: common: created directory at: artifacts/model_trainer/]
[2025-02-18 16:54:17,621: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/repos/onkar-git/Delivery-time-prediction-for-food-delivery-industry "HTTP/1.1 200 OK"]


[2025-02-18 16:54:17,629: INFO: helpers: Initialized MLflow to track repo "onkar-git/Delivery-time-prediction-for-food-delivery-industry"]


[2025-02-18 16:54:17,635: INFO: helpers: Repository onkar-git/Delivery-time-prediction-for-food-delivery-industry initialized!]


In [100]:
# Execute the full evaluation: load data and model, score, log to MLflow,
# then write the run-information JSON.
model_final.run()

[2025-02-18 16:54:19,698: INFO: 1188005277: Data loaded successfully from artifacts\data_trans\train_trans.csv]
[2025-02-18 16:54:19,868: INFO: 1188005277: Data loaded successfully from artifacts\data_trans\test_trans.csv]
[2025-02-18 16:54:20,947: INFO: 1188005277: Model loaded successfully]


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 479 out of 479 | elapsed:    0.9s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 479 out of 479 | elapsed:    0.3s finished


[2025-02-18 16:56:03,887: INFO: 1188005277: Model evaluation completed]


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 479 out of 479 | elapsed:    0.1s finished


[2025-02-18 16:57:45,249: INFO: 1188005277: MLflow logging complete]
🏃 View run adventurous-stag-295 at: https://dagshub.com/onkar-git/Delivery-time-prediction-for-food-delivery-industry.mlflow/#/experiments/0/runs/2e0b2791d67d42a0ab0a5ed536fc3bda
🧪 View experiment at: https://dagshub.com/onkar-git/Delivery-time-prediction-for-food-delivery-industry.mlflow/#/experiments/0
[2025-02-18 16:57:45,940: INFO: 1188005277: Model information saved]
