In [1]:
import os

In [2]:
%pwd

'e:\\projects\\Delivery-time-prediction-for-food-devlivery-industry\\research'

In [4]:
# Move from research/ up to the project root so that relative paths such as
# config/ and artifacts/ resolve. The guard keeps the cell idempotent: running it
# again (or starting the kernel at the project root) no longer climbs one level
# higher each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [21]:
%pwd

'e:\\projects\\Delivery-time-prediction-for-food-devlivery-industry'

In [30]:
from pathlib import Path
from dataclasses import dataclass

@dataclass(frozen=True)
class DataTransformerConfig:
    """Immutable settings for the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_trans_config`` from the
    ``data_transformation`` section of the loaded configuration.
    """

    # Directory created for this stage by ConfigurationManager.get_data_trans_config.
    root_dir: Path
    # Directory from which DataTransformation reads train.csv and test.csv.
    data_input_dir: Path
    # Directory where DataTransformation writes the transformed CSVs and models/.
    data_tran_dir: Path

In [31]:
from pathlib import Path

# YAML files read by ConfigurationManager, expressed relative to the project root
# (the working directory once the notebook has moved up out of research/).
# Relative, forward-slash paths keep the notebook runnable on any machine and avoid
# backslash sequences such as "\p" or "\c", which are invalid string escapes.
CONFIG_FILE_PATH = Path("config/config.yaml")
PARAMS_FILE_PATH = Path("params.yaml")
SCHEMA_FILE_PATH = Path("schema.yaml")

In [32]:
from Deliveryprediction.constants import *
from Deliveryprediction.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Load the project's YAML files and build stage-specific configuration objects.

    Args:
        config_filepath: YAML file providing ``artifacts_root`` and the
            ``data_transformation`` section.
        params_filepath: YAML file stored on ``self.params`` (not used further here).
        schema_filepath: YAML file stored on ``self.schema`` (not used further here).
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        # Ensure the top-level artifacts directory exists before any stage writes to it.
        create_directories([self.config.artifacts_root])

    def get_data_trans_config(self) -> DataTransformerConfig:
        """Create the stage directory and return the data-transformation settings.

        Returns:
            A ``DataTransformerConfig`` whose fields are ``Path`` objects, matching the
            dataclass annotations (the configuration file supplies plain values).
        """
        section = self.config.data_transformation

        create_directories([section.root_dir])

        return DataTransformerConfig(
            root_dir=Path(section.root_dir),
            data_input_dir=Path(section.data_input_dir),
            data_tran_dir=Path(section.data_tran_dir),
        )


In [33]:
import pandas as pd
import logging
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    OneHotEncoder, 
    MinMaxScaler, 
    OrdinalEncoder)
import joblib
from sklearn import set_config

# Global scikit-learn setting: make transformers return pandas DataFrames
# from transform / fit_transform for the rest of this kernel session.
set_config(transform_output='pandas')


In [57]:
from pathlib import Path
import pandas as pd
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
import logging

logger = logging.getLogger(__name__)


class DataTransformation:
    """Preprocess the train/test CSVs and persist the results and the fitted transformer.

    The stage reads ``train.csv`` and ``test.csv`` from ``config.data_input_dir``, drops
    rows with missing values, fits a ``ColumnTransformer`` on the training features
    only, transforms both splits, and writes ``train_trans.csv``, ``test_trans.csv`` and
    ``models/preprocessor.joblib`` under ``config.data_tran_dir``.
    """

    def __init__(self, config: "DataTransformerConfig"):
        """Resolve input/output paths, create output directories, build the preprocessor.

        Args:
            config: Object exposing ``data_input_dir`` and ``data_tran_dir``.
        """
        self.config = config

        # Input locations.
        self.root_path = Path(self.config.data_input_dir)
        self.train_data_path = self.root_path / "train.csv"
        self.test_data_path = self.root_path / "test.csv"

        # Output locations (created eagerly so later writes cannot fail on a missing dir).
        self.save_data_dir = Path(self.config.data_tran_dir)
        self.save_data_dir.mkdir(exist_ok=True, parents=True)
        self.train_trans_filename = "train_trans.csv"
        self.test_trans_filename = "test_trans.csv"
        self.save_train_trans_path = self.save_data_dir / self.train_trans_filename
        self.save_test_trans_path = self.save_data_dir / self.test_trans_filename
        self.transformer_save_dir = self.save_data_dir / "models"
        self.transformer_save_dir.mkdir(exist_ok=True)

        # Column groups, one per preprocessing strategy.
        self.num_cols = ["age", "ratings", "pickup_time_minutes", "distance"]
        self.nominal_cat_cols = ['weather', 'type_of_order', 'type_of_vehicle', 'festival',
                                 'city_type', 'is_weekend', 'order_time_of_day']
        self.ordinal_cat_cols = ['traffic', 'distance_type']
        self.target_col = 'time_taken'

        # Explicit category orderings used by the ordinal encoder.
        self.traffic_order = ["low", "medium", "high", "jam"]
        self.distance_type_order = ["short", "medium", "long", "very_long"]

        self.preprocessor = ColumnTransformer(
            transformers=[
                ("scale", MinMaxScaler(), self.num_cols),
                ("nominal_encode",
                 OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False),
                 self.nominal_cat_cols),
                ("ordinal_encode",
                 OrdinalEncoder(categories=[self.traffic_order, self.distance_type_order],
                                encoded_missing_value=-999,
                                handle_unknown="use_encoded_value",
                                unknown_value=-1),
                 self.ordinal_cat_cols),
            ],
            remainder="passthrough",
            n_jobs=-1,
            verbose_feature_names_out=False
        )
        # Request DataFrame output on the transformer itself instead of relying on a
        # global sklearn.set_config call made in another cell. join_X_and_y aligns
        # features and target on the row index, which is only carried through the
        # transform when the output is a DataFrame.
        self.preprocessor.set_output(transform="pandas")

    def load_data(self, data_path: Path) -> pd.DataFrame:
        """Read a CSV file; log an error and return an empty DataFrame if it is missing."""
        try:
            return pd.read_csv(data_path)
        except FileNotFoundError:
            logger.error("The file to load does not exist")
            return pd.DataFrame()

    def drop_missing_values(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``data`` without rows containing missing values (shapes are logged)."""
        logger.info(f"The original dataset has {data.shape[0]} rows and {data.shape[1]} columns")
        df_dropped = data.dropna()
        logger.info(f"The dataset after dropping missing values has {df_dropped.shape[0]} rows and {df_dropped.shape[1]} columns")
        return df_dropped

    def make_X_and_y(self, data: pd.DataFrame):
        """Split ``data`` into the feature frame and the target series (``self.target_col``)."""
        X = data.drop(columns=[self.target_col])
        y = data[self.target_col]
        return X, y

    def train_preprocessor(self, data: pd.DataFrame):
        """Fit the preprocessor on ``data`` and return it."""
        self.preprocessor.fit(data)
        return self.preprocessor

    def perform_transformations(self, data: pd.DataFrame):
        """Apply the already-fitted preprocessor to ``data``."""
        return self.preprocessor.transform(data)

    def join_X_and_y(self, X: pd.DataFrame, y: pd.Series):
        """Re-attach the target to the features, matching rows on the index."""
        return X.join(y, how='inner')

    def save_data(self, data: pd.DataFrame, save_path: Path):
        """Write ``data`` to ``save_path`` as CSV without the index column."""
        data.to_csv(save_path, index=False)

    def save_transformer(self):
        """Persist the preprocessor to ``models/preprocessor.joblib``."""
        joblib.dump(self.preprocessor, self.transformer_save_dir / "preprocessor.joblib")

    def _load_split(self, data_path: Path, split_name: str) -> pd.DataFrame:
        """Load one split, fail fast if nothing was loaded, then drop incomplete rows."""
        raw = self.load_data(data_path)
        if raw.empty:
            # load_data swallows a missing file; stop here with a clear message instead
            # of failing later with a KeyError on the target column.
            raise ValueError(
                f"{split_name} data at {data_path} is missing or empty; "
                "cannot run the transformation pipeline"
            )
        cleaned = self.drop_missing_values(raw)
        logger.info(f"{split_name} data loaded successfully")
        return cleaned

    def run_transformation_pipeline(self):
        """Run the whole stage: load, clean, fit on train, transform both splits, save.

        Raises:
            ValueError: If the train or test file is missing or contains no rows.
        """
        train_df = self._load_split(self.train_data_path, "Train")
        test_df = self._load_split(self.test_data_path, "Test")

        X_train, y_train = self.make_X_and_y(train_df)
        X_test, y_test = self.make_X_and_y(test_df)
        logger.info("Data splitting completed")

        # Fit on the training features only so no test information leaks into scaling/encoding.
        self.train_preprocessor(X_train)
        logger.info("Preprocessor is trained")

        X_train_trans = self.perform_transformations(X_train)
        logger.info("Train data is transformed")
        X_test_trans = self.perform_transformations(X_test)
        logger.info("Test data is transformed")

        train_trans_df = self.join_X_and_y(X_train_trans, y_train)
        test_trans_df = self.join_X_and_y(X_test_trans, y_test)
        logger.info("Datasets joined")

        self.save_data(train_trans_df, self.save_train_trans_path)
        self.save_data(test_trans_df, self.save_test_trans_path)
        logger.info("Transformed data saved")

        self.save_transformer()
        logger.info("Preprocessor saved")

In [58]:
# Build the transformation stage from the YAML configuration. Errors propagate
# as-is; wrapping these calls in try/except only to re-raise adds nothing.
config = ConfigurationManager()
data_validation_config = config.get_data_trans_config()
# NOTE: the "data_validation" names are kept because the following cell calls
# data_validation.run_transformation_pipeline(); the object is a DataTransformation.
data_validation = DataTransformation(config=data_validation_config)

[2025-02-11 17:57:42,532: INFO: common: yaml file: E:\projects\Delivery-time-prediction-for-food-devlivery-industry\config\config.yaml loaded successfully]
[2025-02-11 17:57:42,535: INFO: common: yaml file: E:\projects\Delivery-time-prediction-for-food-devlivery-industry\params.yaml loaded successfully]
[2025-02-11 17:57:42,540: INFO: common: yaml file: E:\projects\Delivery-time-prediction-for-food-devlivery-industry\schema.yaml loaded successfully]
[2025-02-11 17:57:42,543: INFO: common: created directory at: artifacts]
[2025-02-11 17:57:42,547: INFO: common: created directory at: artifacts/data_trans/]


In [59]:
# Run the transformation stage end to end. NOTE: despite its name,
# `data_validation` holds the DataTransformation instance built in the previous cell.
data_validation.run_transformation_pipeline()

[2025-02-11 17:57:44,986: INFO: 77540092: The original dataset has 28271 rows and 16 columns]
[2025-02-11 17:57:45,013: INFO: 77540092: The dataset after dropping missing values has 28271 rows and 16 columns]
[2025-02-11 17:57:45,017: INFO: 77540092: Train data loaded successfully]
[2025-02-11 17:57:45,083: INFO: 77540092: The original dataset has 9424 rows and 16 columns]
[2025-02-11 17:57:45,096: INFO: 77540092: The dataset after dropping missing values has 9424 rows and 16 columns]
[2025-02-11 17:57:45,098: INFO: 77540092: Test data loaded successfully]
[2025-02-11 17:57:45,106: INFO: 77540092: Data splitting completed]
[2025-02-11 17:57:45,285: INFO: 77540092: Preprocessor is trained]
[2025-02-11 17:57:45,422: INFO: 77540092: Train data is transformed]
[2025-02-11 17:57:45,488: INFO: 77540092: Test data is transformed]
[2025-02-11 17:57:45,504: INFO: 77540092: Datasets joined]
[2025-02-11 17:57:46,676: INFO: 77540092: Transformed data saved]
[2025-02-11 17:57:46,686: INFO: 77540092

In [14]:
from pathlib import Path
import pandas as pd
import joblib
import logging

logger = logging.getLogger(__name__)


class DataTransformation:
    """Reduced draft of the transformation stage: load ``train.csv`` and drop missing rows.

    NOTE(review): this class re-uses the name of the fuller ``DataTransformation``
    defined earlier in the notebook, so running the cells top to bottom replaces that
    class with this one. Confirm whether this cell is still needed.
    """

    # Column groups (declared for reference; not used by the methods of this draft).
    num_cols = ["age", "ratings", "pickup_time_minutes", "distance"]
    nominal_cat_cols = ['weather', 'type_of_order', 'type_of_vehicle', 'festival',
                        'city_type', 'is_weekend', 'order_time_of_day']
    ordinal_cat_cols = ['traffic', 'distance_type']
    target_col = 'time_taken'

    # Order for ordinal encoding
    traffic_order = ["low", "medium", "high", "jam"]
    distance_type_order = ["short", "medium", "long", "very_long"]

    def __init__(self, config):
        """Store the stage configuration; ``config.data_input_dir`` is read by the pipeline."""
        self.config = config

    def load_data(self, data_path: Path) -> pd.DataFrame:
        """Read a CSV file; log an error and return an empty DataFrame if it is missing."""
        try:
            return pd.read_csv(data_path)
        except FileNotFoundError:
            logger.error(f"The file {data_path} does not exist")
            return pd.DataFrame()

    def drop_missing_values(self, data: pd.DataFrame) -> pd.DataFrame:
        """Return ``data`` without rows containing missing values, logging both shapes."""
        logger.info(f"Original dataset: {data.shape[0]} rows, {data.shape[1]} columns")
        # dropna() removes every row with at least one missing value, so no further
        # missing-value check is needed on the result.
        df_dropped = data.dropna()
        logger.info(f"After dropping missing values: {df_dropped.shape[0]} rows, {df_dropped.shape[1]} columns")
        return df_dropped

    def run_transformation_pipeline(self) -> pd.DataFrame:
        """Load ``train.csv`` from ``config.data_input_dir`` and drop incomplete rows.

        Returns:
            The cleaned training frame (previously computed and then discarded).
        """
        data_path = Path(self.config.data_input_dir) / "train.csv"

        train_data = self.load_data(data_path)
        # The loaded shape is reported by drop_missing_values' first log line.
        return self.drop_missing_values(data=train_data)


# ✅ Usage

