# Dealing with date-time features

In [1]:
import numpy as np
import pandas as pd

import seaborn as sns

from datetime import datetime
import copy

In [2]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../dataset/cat_data.csv")

In [3]:
df.shape

(45593, 20)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45593 entries, 0 to 45592
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Delivery_person_ID           45593 non-null  object 
 1   Delivery_person_Age          43739 non-null  float64
 2   Delivery_person_Ratings      43685 non-null  float64
 3   Restaurant_latitude          45593 non-null  float64
 4   Restaurant_longitude         45593 non-null  float64
 5   Delivery_location_latitude   45593 non-null  float64
 6   Delivery_location_longitude  45593 non-null  float64
 7   Order_Date                   45593 non-null  object 
 8   Time_Orderd                  45593 non-null  object 
 9   Time_Order_picked            45593 non-null  object 
 10  Weather                      45593 non-null  object 
 11  Road_traffic_density         45593 non-null  object 
 12  Vehicle_condition            45593 non-null  int64  
 13  Type_of_order   

- Creating feature by aggregation of other features with time.
- Experimenting with tsfresh.

In [5]:
def compute_date_time(df):
    """Derive pickup-delay and clock-time features from the order/pickup time columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Time_Orderd`` and ``Time_Order_picked`` as ``"%H:%M"``
        strings. A missing order time is marked by the literal string ``"NONE"``
        (a real NaN is treated the same way).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is not modified) without the two time
        columns and with these columns appended:

        - ``pickup_delay``: minutes between order and pickup, NaN when the
          order time is missing.
        - ``order_hour``: hour of the *pickup* time (kept as in the original
          implementation for backward compatibility).
        - ``order_minute``: minute of the order time, NaN when missing.
        - ``order_minute_picked``: minute of the pickup time.

    Raises
    ------
    ValueError
        If a non-missing time string does not match ``"%H:%M"``.
    """
    df = df.copy()

    # Mask the "NONE" sentinel so it parses to NaT and propagates as NaN below.
    raw_ordered = df["Time_Orderd"]
    time_ordered = pd.to_datetime(raw_ordered.where(raw_ordered != "NONE"), format="%H:%M")
    time_picked = pd.to_datetime(df["Time_Order_picked"], format="%H:%M")

    # Only the clock time is known, so a pickup "earlier" than the order means
    # the pickup happened after midnight; the modulo wraps it into [0, 24h),
    # which matches what timedelta.seconds returned in the per-row version.
    delay_sec = (time_picked - time_ordered).dt.total_seconds() % 86400
    df["pickup_delay"] = delay_sec / 60

    order_minute = time_ordered.dt.minute
    if not order_minute.isna().any():
        # No missing values: keep an integer column, as the row-wise version did.
        order_minute = order_minute.astype("int64")

    df["order_hour"] = time_picked.dt.hour.astype("int64")
    df["order_minute"] = order_minute
    df["order_minute_picked"] = time_picked.dt.minute.astype("int64")

    return df.drop(["Time_Orderd", "Time_Order_picked"], axis=1)

In [6]:
df2 = compute_date_time(df)

In [7]:
df2.shape

(45593, 22)

In [8]:
df2.head()

Unnamed: 0,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Weather,Road_traffic_density,...,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken,kfold,pickup_delay,order_hour,order_minute,order_minute_picked
0,MUMRES13DEL03,22.0,4.7,19.178321,72.834715,19.208321,72.864715,24-03-2022,Stormy,Medium,...,scooter,1.0,No,Metropolitian,24.0,0,10.0,18,35.0,45
1,LUDHRES01DEL02,21.0,4.9,30.873988,75.842739,31.003988,75.972739,16-02-2022,Cloudy,Jam,...,motorcycle,1.0,No,Metropolitian,38.0,0,15.0,22,50.0,5
2,MUMRES18DEL02,31.0,4.1,19.1093,72.825451,19.1293,72.845451,15-03-2022,Sandstorms,Low,...,motorcycle,2.0,No,Urban,32.0,0,5.0,9,45.0,50
3,INDORES15DEL01,35.0,4.9,22.760072,75.892574,22.830072,75.962574,06-03-2022,Fog,Jam,...,scooter,2.0,No,Metropolitian,35.0,0,10.0,20,35.0,45
4,DEHRES02DEL01,28.0,4.8,0.0,0.0,0.04,0.04,11-02-2022,Sunny,High,...,scooter,1.0,No,Urban,15.0,0,10.0,14,0.0,10


In [9]:
df2['pickup_delay'].value_counts()

5.0     14703
15.0    14610
10.0    14549
Name: pickup_delay, dtype: int64

In [10]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45593 entries, 0 to 45592
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Delivery_person_ID           45593 non-null  object 
 1   Delivery_person_Age          43739 non-null  float64
 2   Delivery_person_Ratings      43685 non-null  float64
 3   Restaurant_latitude          45593 non-null  float64
 4   Restaurant_longitude         45593 non-null  float64
 5   Delivery_location_latitude   45593 non-null  float64
 6   Delivery_location_longitude  45593 non-null  float64
 7   Order_Date                   45593 non-null  object 
 8   Weather                      45593 non-null  object 
 9   Road_traffic_density         45593 non-null  object 
 10  Vehicle_condition            45593 non-null  int64  
 11  Type_of_order                45593 non-null  object 
 12  Type_of_vehicle              45593 non-null  object 
 13  multiple_deliver

In [11]:
def check_null(df):
    '''Percentage of missing values per column, rounded to 2 decimals, largest first.'''
    missing_per_column = df.isnull().sum()
    missing_pct = 100 * missing_per_column / len(df)
    return missing_pct.round(2).sort_values(ascending=False)

In [12]:
check_null(df2)

Delivery_person_Ratings        4.18
Delivery_person_Age            4.07
order_minute                   3.80
pickup_delay                   3.80
multiple_deliveries            2.18
Delivery_person_ID             0.00
Type_of_vehicle                0.00
order_hour                     0.00
kfold                          0.00
Time_taken                     0.00
City                           0.00
Festival                       0.00
Type_of_order                  0.00
Vehicle_condition              0.00
Road_traffic_density           0.00
Weather                        0.00
Order_Date                     0.00
Delivery_location_longitude    0.00
Delivery_location_latitude     0.00
Restaurant_longitude           0.00
Restaurant_latitude            0.00
order_minute_picked            0.00
dtype: float64

### Catboost Performance

In [14]:
# Parse the Order_Date strings into datetimes. dayfirst=True because the
# values are written day-month-year (e.g. "24-03-2022").
order_date = pd.to_datetime(df2['Order_Date'], dayfirst=True)

In [15]:
np.unique(order_date.dt.year)

array([2022], dtype=int64)

In [16]:
df2.shape

(45593, 22)

In [17]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45593 entries, 0 to 45592
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Delivery_person_ID           45593 non-null  object 
 1   Delivery_person_Age          43739 non-null  float64
 2   Delivery_person_Ratings      43685 non-null  float64
 3   Restaurant_latitude          45593 non-null  float64
 4   Restaurant_longitude         45593 non-null  float64
 5   Delivery_location_latitude   45593 non-null  float64
 6   Delivery_location_longitude  45593 non-null  float64
 7   Order_Date                   45593 non-null  object 
 8   Weather                      45593 non-null  object 
 9   Road_traffic_density         45593 non-null  object 
 10  Vehicle_condition            45593 non-null  int64  
 11  Type_of_order                45593 non-null  object 
 12  Type_of_vehicle              45593 non-null  object 
 13  multiple_deliver

In [18]:
# Calendar features derived from the parsed order date.
df2.loc[:, "month"] = order_date.apply(lambda x: x.month)
# pandas numbers weekdays Monday=0 ... Sunday=6.
df2.loc[:, 'day_of_week'] = order_date.apply(lambda x: x.dayofweek)
df2.loc[:, 'week_of_year'] = order_date.apply(lambda x: x.weekofyear)
# Weekend flag: 1 for Saturday/Sunday (dayofweek 5 or 6), otherwise 0.
df2.loc[:, 'weekend'] = order_date.apply(lambda x: x.dayofweek>=5).astype(np.int8)

In [19]:
df2 = df2.drop(["Order_Date"], axis=1)

In [20]:
df2.shape

(45593, 25)

In [21]:
def bin_time(hour):
    """Map an hour of the day to a coarse time-of-day label.

    Windows are half-open ``[start, end)``; any hour that falls in none of
    them (22:00 up to 02:00) is labelled "midnight".
    """
    windows = (
        (2, 6, "dawn"),
        (6, 10, "morning"),
        (10, 14, "noon"),
        (14, 18, "afternoon"),
        (18, 22, "evening"),
    )
    for start, end, label in windows:
        if start <= hour < end:
            return label
    return "midnight"

In [22]:
# Bucket each order_hour value into a coarse time-of-day label via bin_time.
df2.loc[:, "time_of_day"] = df2["order_hour"].apply(bin_time)

In [23]:
df2.shape

(45593, 26)

In [24]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45593 entries, 0 to 45592
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Delivery_person_ID           45593 non-null  object 
 1   Delivery_person_Age          43739 non-null  float64
 2   Delivery_person_Ratings      43685 non-null  float64
 3   Restaurant_latitude          45593 non-null  float64
 4   Restaurant_longitude         45593 non-null  float64
 5   Delivery_location_latitude   45593 non-null  float64
 6   Delivery_location_longitude  45593 non-null  float64
 7   Weather                      45593 non-null  object 
 8   Road_traffic_density         45593 non-null  object 
 9   Vehicle_condition            45593 non-null  int64  
 10  Type_of_order                45593 non-null  object 
 11  Type_of_vehicle              45593 non-null  object 
 12  multiple_deliveries          44600 non-null  float64
 13  Festival        

In [25]:
check_null(df2)

Delivery_person_Ratings        4.18
Delivery_person_Age            4.07
order_minute                   3.80
pickup_delay                   3.80
multiple_deliveries            2.18
Delivery_person_ID             0.00
Time_taken                     0.00
weekend                        0.00
week_of_year                   0.00
day_of_week                    0.00
month                          0.00
order_minute_picked            0.00
order_hour                     0.00
kfold                          0.00
Festival                       0.00
City                           0.00
Type_of_vehicle                0.00
Type_of_order                  0.00
Vehicle_condition              0.00
Road_traffic_density           0.00
Weather                        0.00
Delivery_location_longitude    0.00
Delivery_location_latitude     0.00
Restaurant_longitude           0.00
Restaurant_latitude            0.00
time_of_day                    0.00
dtype: float64

## Catboost Performance

#### minute improvement

### These trigonometric (sine/cosine) features can be useful for linear models

In [26]:
def sin_cos_transform(val, period):
    """Encode a cyclical value as a (sine, cosine) pair.

    ``val`` is scaled by ``period`` so that one full period corresponds to one
    full turn of the circle. Works element-wise on scalars or NumPy arrays.

    Returns
    -------
    tuple
        ``(sin_component, cos_component)``.
    """
    angle = 2*np.pi*(val/period)
    return np.sin(angle), np.cos(angle)

In [27]:
def compute_trig_ftrs(df):
    """Add sine/cosine encodings of the cyclical calendar and clock columns.

    For each of ``month``, ``day_of_week``, ``week_of_year``, ``order_hour``
    and ``order_minute_picked`` two columns are appended, ``<name>_sin`` and
    ``<name>_cos``, computed with ``sin_cos_transform``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five columns listed above.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the ten trigonometric columns added; the input
        frame is left unchanged.
    """
    df = df.copy()

    # (column, cycle length). order_hour holds 0-23 values, so its cycle is 24;
    # a period of 12 would give hours twelve apart (e.g. 06:00 and 18:00) the
    # same encoding.
    ftrs = [("month", 12), ('day_of_week', 7), ('week_of_year', 52),
            ('order_hour', 24), ("order_minute_picked", 60)]

    for ftr, period in ftrs:
        sin_ftr, cos_ftr = sin_cos_transform(df[ftr].values, period)
        df[ftr + "_sin"] = sin_ftr
        df[ftr + "_cos"] = cos_ftr

    return df

In [28]:
df3 = compute_trig_ftrs(df2)