In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import yaml
import logging
from pathlib import Path

# Column-name constant; nothing in this cell references it.
# NOTE(review): presumably the prediction target column - confirm against the modelling code.
TARGET = "time_taken"




def load_data(data_path: Path) -> pd.DataFrame:
    """Read the CSV file at ``data_path`` into a DataFrame.

    Args:
        data_path: Location of the CSV file to read.

    Returns:
        The parsed contents of the file.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist. The problem is
            logged and the original exception is re-raised, so the caller
            never receives an undefined result.
    """
    try:
        return pd.read_csv(data_path)
    except FileNotFoundError:
        # Look the logger up here: this cell imports ``logging`` but never
        # binds a module-level ``logger`` name.
        logging.getLogger(__name__).error("The file to load does not exist")
        # Propagate instead of falling through to a return of an unbound name.
        raise



def split_data(data: pd.DataFrame, test_size: float, random_state: int):
    """Split ``data`` into a train part and a test part.

    Args:
        data: Frame to split.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        A ``(train, test)`` tuple in that order.
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    train_part, test_part = train_test_split(data, **split_options)
    return train_part, test_part

def read_params(file_path):
    """Open ``file_path`` for reading and return what ``yaml.safe_load`` parses from it."""
    with open(file_path, "r") as params_stream:
        return yaml.safe_load(params_stream)
        
def save_data(data: pd.DataFrame, save_path: Path) -> None:
    """Write ``data`` to ``save_path`` as CSV without the index column."""
    data.to_csv(path_or_buf=save_path, index=False)
    
    
if __name__ == "__main__":
    # Bind the module-level ``logger`` that load_data() and the statements
    # below log through; without it the first log call raises NameError.
    # INFO level is enabled so the progress messages are actually emitted.
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # set file paths
    # root path: third parent of this file's path
    root_path = Path(__file__).parent.parent.parent
    # data load path
    data_path = root_path / "data" / "cleaned" / "swiggy_cleaned.csv"
    # save data directory
    save_data_dir = root_path / "data" / "interim"
    # make dir if not present
    save_data_dir.mkdir(exist_ok=True, parents=True)
    # train and test data save paths
    # filenames
    train_filename = "train.csv"
    test_filename = "test.csv"
    # save path for train and test
    save_train_path = save_data_dir / train_filename
    save_test_path = save_data_dir / test_filename
    # parameters file
    params_file_path = root_path / "params.yaml"

    # load the cleaned data
    df = load_data(data_path)
    logger.info("Data Loaded Successfully")

    # read the parameters
    parameters = read_params(params_file_path)['Data_Preparation']
    test_size = parameters['test_size']
    random_state = parameters['random_state']
    logger.info("parameters read successfully")

    # split into train and test data
    train_data, test_data = split_data(df, test_size=test_size, random_state=random_state)
    logger.info("Dataset split into train and test data")

    # save the train and test data
    data_subsets = [train_data, test_data]
    data_paths = [save_train_path, save_test_path]
    filename_list = [train_filename, test_filename]
    for filename, path, data in zip(filename_list, data_paths, data_subsets):
        save_data(data=data, save_path=path)
        # Compute the label outside the f-string: reusing double quotes inside
        # a double-quoted f-string is a SyntaxError before Python 3.12.
        subset_name = filename.replace(".csv", "")
        logger.info(f"{subset_name} data saved to location")

In [1]:
import os

In [2]:
%pwd

'e:\\projects\\Delivery-time-prediction-for-food-devlivery-industry\\research'

In [3]:
# Move the working directory up one level; per the surrounding %pwd outputs this
# goes from research/ to the project root.
# The path is relative to the current directory, so re-running this cell moves up again.
os.chdir("../")

In [4]:
%pwd

'e:\\projects\\Delivery-time-prediction-for-food-devlivery-industry'

In [16]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataPreparationConfig:
    """Immutable bundle of settings consumed by the data preparation step.

    Attributes:
        root_dir: Directory created for this step by the configuration manager.
        data_input_dir: Location handed to ``pd.read_csv`` when loading data.
        train_dir: Directory that receives ``train.csv``.
        test_dir: Directory that receives ``test.csv``.
        params: Split settings; ``test_size`` and ``random_state`` are read
            from it when the data is split.
    """

    root_dir: Path
    data_input_dir: Path
    train_dir: Path
    test_dir: Path
    params: dict

In [14]:
from pathlib import Path

# Paths are relative to the project root rather than one machine's E: drive.
# The notebook switches the working directory to the project root with
# os.chdir("../") before any of these files are read.
# The previous literals also relied on unrecognised backslash escapes
# (\p, \D, \c, \s) in non-raw strings, which newer Python versions warn about.
CONFIG_FILE_PATH = Path("config/config.yaml")
PARAMS_FILE_PATH = Path("params.yaml")
SCHEMA_FILE_PATH = Path("schema.yaml")

In [18]:
from Deliveryprediction.constants import *
from Deliveryprediction.utils.common import read_yaml, create_directories

class ConfigurationManager:
    """Reads the project's YAML files and builds per-step configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load the config, params and schema files and create the artifacts root.

        Args:
            config_filepath: YAML file stored on ``self.config``.
            params_filepath: YAML file stored on ``self.params``.
            schema_filepath: YAML file stored on ``self.schema``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_preparation_config(self) -> DataPreparationConfig:
        """Build the settings object for the data preparation step.

        Creates the step's ``root_dir`` as a side effect.

        Returns:
            A ``DataPreparationConfig`` filled from the ``data_preparation``
            section of the config and the ``PARAMS`` entry of the params.
        """
        step_section = self.config.data_preparation
        split_params = self.params.PARAMS

        create_directories([step_section.root_dir])

        return DataPreparationConfig(
            root_dir=step_section.root_dir,
            data_input_dir=step_section.data_input_dir,
            train_dir=step_section.train_dir,
            test_dir=step_section.test_dir,
            params=split_params,
        )


In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
import yaml
from Deliveryprediction import logger
from pathlib import Path


In [34]:

class DataPreparation:
    """Loads the input dataset and writes its train/test split to disk."""

    def __init__(self,config:DataPreparationConfig):
        """Keep the step configuration for use by the other methods."""
        self.config = config

    def load_data(self) -> pd.DataFrame:
        """Read the CSV at ``config.data_input_dir`` into a DataFrame.

        Returns:
            The parsed contents of the file.

        Raises:
            FileNotFoundError: If the input file does not exist. The problem
                is logged and the original exception is re-raised.
        """
        try:
            return pd.read_csv(self.config.data_input_dir)
        except FileNotFoundError:
            logger.error("The file to load does not exist")
            # Propagate instead of falling through to a return of an unbound name.
            raise

    def split_data(self, data: pd.DataFrame):
        """Split ``data`` and save the parts as ``train.csv`` and ``test.csv``.

        ``test_size`` and ``random_state`` come from ``config.params``. The
        files are written to ``config.train_dir`` and ``config.test_dir``
        without the index column. Nothing is returned.

        Args:
            data: Frame to split.
        """
        train_data, test_data = train_test_split(
            data,
            test_size=self.config.params.test_size,
            random_state=self.config.params.random_state,
        )

        outputs = (
            (train_data, Path(self.config.train_dir) / "train.csv"),
            (test_data, Path(self.config.test_dir) / "test.csv"),
        )
        for frame, destination in outputs:
            # Make sure each target directory exists before writing into it.
            destination.parent.mkdir(parents=True, exist_ok=True)
            frame.to_csv(destination, index=False)
  


In [5]:
import os

In [22]:
pwd

'e:\\projects\\Delivery-time-prediction-for-food-devlivery-industry'

In [36]:
# Build the manager: its constructor reads the three YAML files and creates the artifacts root.
config = ConfigurationManager()
# Assemble the data preparation settings (this also creates the step's root_dir).
data_preparation_config = config.get_data_preparation_config()
data_preparation = DataPreparation(config=data_preparation_config)
# Load the input CSV, then split it; split_data writes train.csv / test.csv and returns nothing.
data=data_preparation.load_data()
data_preparation.split_data(data)

[2025-02-11 09:52:25,700: INFO: common: yaml file: E:\projects\Delivery-time-prediction-for-food-devlivery-industry\config\config.yaml loaded successfully]
[2025-02-11 09:52:25,705: INFO: common: yaml file: E:\projects\Delivery-time-prediction-for-food-devlivery-industry\params.yaml loaded successfully]
[2025-02-11 09:52:25,712: INFO: common: yaml file: E:\projects\Delivery-time-prediction-for-food-devlivery-industry\schema.yaml loaded successfully]
[2025-02-11 09:52:25,715: INFO: common: created directory at: artifacts]
[2025-02-11 09:52:25,718: INFO: common: created directory at: artifacts/data_preparation]


In [32]:
test_data

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "e:\projects\Delivery-time-prediction-for-food-devlivery-industry\.venv\lib\site-packages\IPython\core\interactiveshell.py", line 3579, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Admin\AppData\Local\Temp\ipykernel_10168\2348561343.py", line 1, in <module>
    test_data
NameError: name 'test_data' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "e:\projects\Delivery-time-prediction-for-food-devlivery-industry\.venv\lib\site-packages\IPython\core\interactiveshell.py", line 2170, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "e:\projects\Delivery-time-prediction-for-food-devlivery-industry\.venv\lib\site-packages\IPython\core\ultratb.py", line 1457, in structured_traceback
    return FormattedTB.structured_traceback(
  File "e:\projects\Delivery-time-prediction-for-food-devlivery-industry\.venv\lib\sit