In [9]:
import os


In [10]:
%pwd

'e:\\projects\\Delivery-time-prediction-for-food-devlivery-industry\\research'

In [11]:
import numpy as np
import pandas as pd
from pathlib import Path

# Columns removed as the final step of the cleaning pipeline (see drop_columns).
columns_to_drop = [
    "rider_id",
    "restaurant_latitude",
    "restaurant_longitude",
    "delivery_latitude",
    "delivery_longitude",
    "order_date",
    "order_time_hour",
    "order_day",
    "city_name",
    "order_day_of_week",
    "order_month",
]


def load_data(data_path: Path) -> pd.DataFrame:
    """Read the CSV file at ``data_path`` into a DataFrame.

    Args:
        data_path: Location of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist. The message
            includes the offending path.
    """
    try:
        return pd.read_csv(data_path)
    except FileNotFoundError as exc:
        # Re-raise instead of falling through: the previous handler went on to
        # `return df` with `df` never assigned, which replaced the real error
        # with an UnboundLocalError (or a NameError from the un-imported logger).
        raise FileNotFoundError(
            f"The file to load does not exist: {data_path}"
        ) from exc


def change_column_names(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``data`` with lower-cased, shortened column names.

    All column labels are lower-cased first; the lower-cased labels listed in
    the mapping below are then replaced by their short names. Labels that are
    not in the mapping keep their lower-cased form.
    """
    short_names = {
        "delivery_person_id": "rider_id",
        "delivery_person_age": "age",
        "delivery_person_ratings": "ratings",
        "delivery_location_latitude": "delivery_latitude",
        "delivery_location_longitude": "delivery_longitude",
        "time_orderd": "order_time",
        "time_order_picked": "order_picked_time",
        "weatherconditions": "weather",
        "road_traffic_density": "traffic",
        "city": "city_type",
        "time_taken(min)": "time_taken",
    }

    lowercased = data.rename(columns=str.lower)
    return lowercased.rename(columns=short_names)


def data_cleaning(data: pd.DataFrame) -> pd.DataFrame:
    """Clean the renamed delivery data and derive date/time features.

    Expects the column names produced by ``change_column_names`` (``age``,
    ``ratings``, ``rider_id``, ``order_date``, ``order_time``, ...), plus an
    ``id`` column, and relies on the ``time_of_day`` helper defined in this
    notebook.

    Steps, in order:
      * drop the ``id`` column;
      * drop rows whose ``age`` (cast to float) is below 18;
      * drop rows whose ``ratings`` equals the string ``"6"``;
      * replace the literal ``"NaN "`` with ``np.nan`` across the frame;
      * cast/normalise columns and add the derived columns via ``assign``;
      * drop the intermediate ``order_time`` and ``order_picked_time`` columns.

    The ``assign`` entries are order-dependent: later lambdas read columns
    written by earlier ones (e.g. ``order_day`` reads the converted
    ``order_date``; ``order_time_of_day`` reads ``order_time_hour``).

    Returns:
        A new DataFrame; ``data`` itself is not modified.
    """
    # Index labels of rows to discard; computed on the untouched input so the
    # labels are still valid when passed to .drop(index=...) below.
    minors_data = data.loc[data['age'].astype('float') < 18]
    minor_index = minors_data.index.tolist()
    # Ratings are compared as strings here because the cast to float only
    # happens later inside assign().
    six_star_data = data.loc[data['ratings'] == "6"]
    six_star_index = six_star_data.index.tolist()

    return (
        data
        .drop(columns="id")
        .drop(index=minor_index)                                                # Minor riders in data dropped
        .drop(index=six_star_index)                                             # six star rated drivers dropped
        .replace("NaN ",np.nan)                                                 # missing values in the data
        .assign(
            # city column out of rider id: the text before the first "RES"
            city_name = lambda x: x['rider_id'].str.split("RES").str.get(0),
            # convert age to float
            age = lambda x: x['age'].astype(float),
            # convert ratings to float
            ratings = lambda x: x['ratings'].astype(float),
            # absolute values for location based columns
            restaurant_latitude = lambda x: x['restaurant_latitude'].abs(),
            restaurant_longitude = lambda x: x['restaurant_longitude'].abs(),
            delivery_latitude = lambda x: x['delivery_latitude'].abs(),
            delivery_longitude = lambda x: x['delivery_longitude'].abs(),
            # order date to datetime (day-first parsing) and feature extraction
            order_date = lambda x: pd.to_datetime(x['order_date'],
                                                  dayfirst=True),
            order_day = lambda x: x['order_date'].dt.day,
            order_month = lambda x: x['order_date'].dt.month,
            order_day_of_week = lambda x: x['order_date'].dt.day_name().str.lower(),
            # 1 when the order date falls on Saturday or Sunday, else 0
            is_weekend = lambda x: (x['order_date']
                                    .dt.day_name()
                                    .isin(["Saturday","Sunday"])
                                    .astype(int)),
            # time based columns
            order_time = lambda x: pd.to_datetime(x['order_time'],
                                                  format='mixed'),
            order_picked_time = lambda x: pd.to_datetime(x['order_picked_time'],
                                                         format='mixed'),
            # time taken to pick order, in minutes
            # NOTE(review): uses .dt.seconds rather than .dt.total_seconds();
            # presumably so a pickup just after midnight does not go negative -
            # confirm this is intended.
            pickup_time_minutes = lambda x: (
                                            (x['order_picked_time'] - x['order_time'])
                                            .dt.seconds / 60
                                            ),
            # hour in which order was placed
            order_time_hour = lambda x: x['order_time'].dt.hour,
            # time of the day when order was placed (bucketed by time_of_day)
            order_time_of_day = lambda x: (
                                x['order_time_hour'].pipe(time_of_day)),
            # categorical columns: strip the "conditions " prefix, lower-case,
            # and turn the resulting "nan" string into a real missing value
            weather = lambda x: (
                                x['weather']
                                .str.replace("conditions ","")
                                .str.lower()
                                .replace("nan",np.nan)),
            # remaining categoricals: strip trailing whitespace and lower-case
            traffic = lambda x: x["traffic"].str.rstrip().str.lower(),
            type_of_order = lambda x: x['type_of_order'].str.rstrip().str.lower(),
            type_of_vehicle = lambda x: x['type_of_vehicle'].str.rstrip().str.lower(),
            festival = lambda x: x['festival'].str.rstrip().str.lower(),
            city_type = lambda x: x['city_type'].str.rstrip().str.lower(),
            # multiple deliveries column
            multiple_deliveries = lambda x: x['multiple_deliveries'].astype(float),
            # target column modifications: remove the "(min) " prefix, cast to int
            time_taken = lambda x: (x['time_taken']
                                    .str.replace("(min) ","")
                                    .astype(int)))
        # the raw timestamps are only needed to derive the features above
        .drop(columns=["order_time","order_picked_time"])
    )
    
    
    
def clean_lat_long(data: pd.DataFrame, threshold: float=1.0) -> pd.DataFrame:
    """Blank out coordinate values that fall below ``threshold``.

    For each of the four restaurant/delivery latitude/longitude columns,
    values strictly less than ``threshold`` are replaced with NaN; all other
    values and all other columns are kept. A new DataFrame is returned.
    """
    masked_columns = {}
    for name in ('restaurant_latitude',
                 'restaurant_longitude',
                 'delivery_latitude',
                 'delivery_longitude'):
        column = data[name]
        masked_columns[name] = np.where(column < threshold, np.nan, column.values)

    return data.assign(**masked_columns)
    
    
# extract day, day name, month and year
def extract_datetime_features(ser: pd.Series) -> pd.DataFrame:
    """Build calendar features from a Series of dates.

    ``ser`` is parsed with ``pd.to_datetime(..., dayfirst=True)``. The result
    has the columns ``day``, ``month``, ``year``, ``day_of_week`` (day name)
    and ``is_weekend`` (1 for Saturday/Sunday, otherwise 0).
    """
    parsed = pd.to_datetime(ser, dayfirst=True)
    weekday_names = parsed.dt.day_name()

    features = {
        "day": parsed.dt.day,
        "month": parsed.dt.month,
        "year": parsed.dt.year,
        "day_of_week": weekday_names,
        "is_weekend": weekday_names.isin(["Saturday", "Sunday"]).astype(int),
    }
    return pd.DataFrame(features)
    

    
def time_of_day(ser: pd.Series):
    """Bucket hour-of-day values into named parts of the day.

    Buckets (right-closed): 0-6 "after_midnight", 7-12 "morning",
    13-17 "afternoon", 18-20 "evening", 21-24 "night".

    Args:
        ser: Series of hour values (0-24). Missing values stay missing.

    Returns:
        A categorical Series with the bucket label for each hour.
    """
    return pd.cut(
        ser,
        bins=[0, 6, 12, 17, 20, 24],
        right=True,
        # Without include_lowest the first interval is (0, 6], so hour 0
        # (orders placed between 00:00 and 00:59) was mapped to NaN.
        include_lowest=True,
        labels=["after_midnight", "morning", "afternoon", "evening", "night"],
    )



def calculate_haversine_distance(df: pd.DataFrame) -> pd.DataFrame:
    """Add a ``distance`` column using the haversine formula.

    The restaurant and delivery latitude/longitude columns are converted from
    degrees to radians and combined with the haversine formula; the central
    angle is scaled by 6371 (presumably the Earth's mean radius in km, which
    would make ``distance`` kilometres - confirm). Returns a new DataFrame.
    """
    start_lat, start_lon, end_lat, end_lon = (
        np.radians(df[name])
        for name in ('restaurant_latitude',
                     'restaurant_longitude',
                     'delivery_latitude',
                     'delivery_longitude')
    )

    delta_lat = end_lat - start_lat
    delta_lon = end_lon - start_lon

    # haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)
    haversine_term = (
        np.sin(delta_lat / 2.0)**2
        + np.cos(start_lat) * np.cos(end_lat) * np.sin(delta_lon / 2.0)**2
    )
    central_angle = 2 * np.arcsin(np.sqrt(haversine_term))

    return df.assign(distance=6371 * central_angle)


def create_distance_type(data: pd.DataFrame) -> pd.DataFrame:
    """Add a categorical ``distance_type`` column derived from ``distance``.

    Left-closed buckets: [0, 5) "short", [5, 10) "medium", [10, 15) "long",
    [15, 25) "very_long". Values outside [0, 25) become missing.
    """
    bucket_edges = [0, 5, 10, 15, 25]
    bucket_names = ["short", "medium", "long", "very_long"]

    distance_bucket = pd.cut(
        data["distance"],
        bins=bucket_edges,
        right=False,
        labels=bucket_names,
    )
    return data.assign(distance_type=distance_bucket)



def drop_columns(data: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Return a copy of ``data`` without the columns named in ``columns``."""
    return data.drop(columns=columns)
 
    
    


In [12]:
import numpy as np
import pandas as pd
from pathlib import Path
import os
# The project root is taken to be the parent of the current working directory.
# NOTE(review): this only holds when the kernel runs from the `research`
# folder, as the earlier %pwd output shows - confirm before running elsewhere.
cwd = Path.cwd()  # Get current working directory
root_path = cwd.parent
# Script-style alternative, unusable in a notebook because __file__ is undefined:
# root_path = Path(__file__).parent.parent.parent
# print(root_path)
def load_data(data_path: str) -> pd.DataFrame:
    """Read the CSV at ``data_path`` with pandas and return the DataFrame.

    Note: this redefinition replaces the earlier ``load_data`` in this notebook.
    """
    return pd.read_csv(data_path)
# Output location for the cleaned dataset: <root>/data/cleaned/swiggy_cleaned.csv
cleaned_data_save_dir = root_path / "data" /"cleaned"

# Create the output directory (and any missing parents); no error if it exists.
cleaned_data_save_dir.mkdir(exist_ok=True,parents=True)

cleaned_data_filename = "swiggy_cleaned.csv"

cleaned_data_save_path = cleaned_data_save_dir / cleaned_data_filename

# Input location of the raw dataset: <root>/data/raw/swiggy.csv
data_load_path = root_path / "data" / "raw" /"swiggy.csv"

# Alternative source kept for reference: the same CSV hosted on GitHub.
#data_load_path = "https://raw.githubusercontent.com/onkar-git/Delivery-time-prediction-for-food-delivery-industry/refs/heads/main/swiggy.csv"

#data_load_path = root_path / "data" / "raw" /"swiggy.csv"

# Load the raw data; `df` is the input to the cleaning pipeline run later.
df = load_data(data_load_path)


#perform_data_cleaning





In [13]:

def perform_data_cleaning(data: pd.DataFrame, saved_data_path: Path) -> pd.DataFrame:
    """Run the full cleaning pipeline, save the result and return it.

    The pipeline renames columns, cleans values, blanks implausible
    coordinates, adds the haversine distance and its bucket, drops the
    columns listed in ``columns_to_drop`` and finally removes every row that
    still contains a missing value.

    Args:
        data: Raw delivery data as loaded from the source CSV.
        saved_data_path: Destination CSV path for the cleaned data.

    Returns:
        The cleaned DataFrame (the same content that is written to disk).
    """
    cleaned_data = (
        data
        .pipe(change_column_names)
        .pipe(data_cleaning)
        .pipe(clean_lat_long)
        .pipe(calculate_haversine_distance)
        .pipe(create_distance_type)
        .pipe(drop_columns,columns=columns_to_drop)
        .dropna()
    )

    # Save before returning: the save used to sit after the `return`
    # statement and was therefore never executed.
    cleaned_data.to_csv(saved_data_path,index=False)

    return cleaned_data
# Alias the output path under the name the later save cell uses, then run the
# cleaning pipeline on the raw frame and keep the returned DataFrame.
saved_data_path=cleaned_data_save_path
cleaned_data=perform_data_cleaning(data=df,saved_data_path=cleaned_data_save_path)



In [14]:
# Write the cleaned frame returned by perform_data_cleaning to saved_data_path,
# omitting the index column.
cleaned_data.to_csv(saved_data_path,index=False)

In [37]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings for the data-ingestion stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Directory that holds the ingestion artifacts.
    root_dir: Path
    # Remote location of the dataset. It is handed to urllib's urlretrieve as
    # the `url` argument, so it is a URL string rather than a filesystem path.
    source_URL: str
    # Local file the download is written to.
    local_data_file: Path
    # Directory the downloaded archive is extracted into.
    unzip_dir: Path

In [78]:
from Deliveryprediction.constants import *
from Deliveryprediction.utils.common import read_yaml, create_directories


In [85]:
class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the config, params and schema YAML files.

        After the three files are loaded with ``read_yaml``, the directory
        named by ``artifacts_root`` in the config file is created through
        ``create_directories``.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        artifacts_root = self.config.artifacts_root
        create_directories([artifacts_root])

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the ``DataIngestionConfig`` from the ``data_ingestion`` section.

        The section's ``root_dir`` is created (via ``create_directories``)
        before the config object is returned.
        """
        ingestion_section = self.config.data_ingestion

        create_directories([ingestion_section.root_dir])

        return DataIngestionConfig(
            root_dir=ingestion_section.root_dir,
            source_URL=ingestion_section.source_URL,
            local_data_file=ingestion_section.local_data_file,
            unzip_dir=ingestion_section.unzip_dir,
        )

    

In [90]:
import os
import urllib.request as request
import zipfile
from Deliveryprediction import logger
from Deliveryprediction.utils.common import get_size

In [95]:
class DataIngestion:
    """Downloads the dataset described by a ``DataIngestionConfig`` and unpacks it."""

    def __init__(self, config: DataIngestionConfig):
        self.config = config

    def download_file(self):
        """Download ``source_URL`` to ``local_data_file`` unless it already exists.

        When the local file is already present nothing is downloaded and only
        its size (as reported by ``get_size``) is logged.
        """
        target = self.config.local_data_file

        if os.path.exists(target):
            logger.info(f"File already exists of size: {get_size(Path(target))}")
            return

        filename, headers = request.urlretrieve(
            url = self.config.source_URL,
            filename = target
        )
        logger.info(f"{filename} download! with following info: \n{headers}")

    def extract_zip_file(self):
        """Extract ``local_data_file`` (a zip archive) into ``unzip_dir``.

        The target directory is created if it does not exist.
        Function returns None
        """
        destination = self.config.unzip_dir
        os.makedirs(destination, exist_ok=True)

        with zipfile.ZipFile(self.config.local_data_file, 'r') as archive:
            archive.extractall(destination)

In [96]:
from pathlib import Path

# Raw strings keep the Windows backslashes literal. The previous plain strings
# relied on invalid escape sequences (\p, \D, \c, \s), which Python flags with
# a warning and intends to reject in a future version. The values are unchanged.
# NOTE(review): these are absolute, machine-specific paths - consider deriving
# them from the project root instead.
CONFIG_FILE_PATH = Path(r"E:\projects\Delivery-time-prediction-for-food-devlivery-industry\config\config.yaml")
PARAMS_FILE_PATH = Path(r"E:\projects\Delivery-time-prediction-for-food-devlivery-industry\params.yaml")
SCHEMA_FILE_PATH = Path(r"E:\projects\Delivery-time-prediction-for-food-devlivery-industry\schema.yaml")

In [98]:
# Run the ingestion stage: load the configuration, build the stage config and
# download the dataset. Any exception propagates to the notebook as-is; the
# former `try/except Exception as e: raise e` wrapper changed nothing and was
# removed.
config = ConfigurationManager()
data_ingestion_config = config.get_data_ingestion_config()
data_ingestion = DataIngestion(config=data_ingestion_config)
data_ingestion.download_file()
# Extraction stays disabled; the recorded log shows the download is a plain
# data.csv, so presumably there is no archive to unpack - confirm.
#data_ingestion.extract_zip_file()

[2025-02-08 14:38:09,179: INFO: common: yaml file: E:\projects\Delivery-time-prediction-for-food-devlivery-industry\config\config.yaml loaded successfully]
[2025-02-08 14:38:09,182: INFO: common: yaml file: E:\projects\Delivery-time-prediction-for-food-devlivery-industry\params.yaml loaded successfully]
[2025-02-08 14:38:09,184: INFO: common: yaml file: E:\projects\Delivery-time-prediction-for-food-devlivery-industry\schema.yaml loaded successfully]
[2025-02-08 14:38:09,186: INFO: common: created directory at: artifacts]
[2025-02-08 14:38:09,187: INFO: common: created directory at: artifacts/data_ingestion]
[2025-02-08 14:38:11,034: INFO: 3447005916: artifacts/data_ingestion/data.csv download! with following info: 
Connection: close
Content-Length: 7772212
Cache-Control: max-age=300
Content-Security-Policy: default-src 'none'; style-src 'unsafe-inline'; sandbox
Content-Type: text/plain; charset=utf-8
ETag: "86298f701a3f9ded8ac62fcba0c8565015e96ff7ca601d0c221444020783af2e"
Strict-Transp

In [45]:
import os
from box.exceptions import BoxValueError
import yaml
from Deliveryprediction import logger
import json
import joblib
from ensure import ensure_annotations
from box import ConfigBox
from pathlib import Path
from typing import Any



@ensure_annotations
def read_yaml(path_to_yaml: Path) -> ConfigBox:
    """Load a YAML file and wrap its content in a ConfigBox.

    Args:
        path_to_yaml (Path): location of the YAML file

    Raises:
        ValueError: when building the ConfigBox raises BoxValueError
            (treated here as the "yaml file is empty" case)
        Exception: any other error (e.g. from opening or parsing the file)
            propagates unchanged

    Returns:
        ConfigBox: the parsed content with attribute-style access
    """
    try:
        with open(path_to_yaml) as stream:
            parsed = yaml.safe_load(stream)
            logger.info(f"yaml file: {path_to_yaml} loaded successfully")
            return ConfigBox(parsed)
    except BoxValueError:
        raise ValueError("yaml file is empty")
    

In [52]:
! pip install box


Collecting box
  Downloading box-0.1.5-py3-none-any.whl.metadata (1.6 kB)
Collecting aiohttp>=3.8.1 (from box)
  Downloading aiohttp-3.10.11-cp38-cp38-win_amd64.whl.metadata (8.0 kB)
Collecting columnar==1.3.1 (from box)
  Downloading Columnar-1.3.1-py3-none-any.whl.metadata (11 kB)
Collecting executing==0.8.2 (from box)
  Downloading executing-0.8.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting loguru (from box)
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting python-dateutil==2.8.2 (from box)
  Downloading python_dateutil-2.8.2-py2.py3-none-any.whl.metadata (8.2 kB)
Collecting timeago==1.0.14 (from box)
  Downloading timeago-1.0.14.tar.gz (24 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting toolz (from columnar==1.3.1->box)
  Using cached toolz-1.0.0-py3-none-any.whl.metadata (5.1 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp>=3.8.1->box)
  Downloading aiohappyeyeballs-2.4.4-py3-none

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
stack-data 0.6.2 requires executing>=1.2.0, but you have executing 0.8.2 which is incompatible.


In [61]:
# Load the project configuration through read_yaml using a path relative to the
# kernel's working directory.
# NOTE(review): the recorded output for this cell is a FileNotFoundError, and the
# earlier %pwd output shows the kernel running in the `research` folder -
# presumably `config/` lives one level up; confirm and adjust the path.
config_filepath = Path("config/config.yaml")
config = read_yaml(config_filepath)

[2025-02-08 11:22:58,840: ERROR: 1539134077: Error reading YAML: YAML file config\config.yaml not found.]


FileNotFoundError: YAML file config\config.yaml not found.

In [56]:
config

ConfigBox({'key': 'val'})