# Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Set Globals & Functions

In [2]:
def checkData(x):
    """Print a structural summary and descriptive statistics for a DataFrame.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("\nInfo dos valores:\n")
    # DataFrame.info() writes its report to stdout itself and returns None, so
    # wrapping it in print() would append a stray "None" line to the output.
    x.info()
    print(f"\nDescrição dos dados:\n\n{x.describe()}")

def checkNaN(x):
    """Print, for every column of a DataFrame, how many values are missing and how many are present.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The two per-column count tables are printed, not returned.
    """
    # Compute both tables up front, then print them under their labels.
    report = (
        ("Null values", x.isna().sum()),
        ("Non-null values", x.notna().sum()),
    )
    for label, counts in report:
        print(f"{label}:\n {counts}")
    return None

def checkOutliers(x, factor=1.5):
    """Return the rows of a DataFrame that hold an outlier in any numeric column.

    A value counts as an outlier when it lies outside Tukey's fences,
    ``[Q1 - factor * IQR, Q3 + factor * IQR]``, computed independently for
    each numeric column. The rule relies on quartiles only, so it does not
    require the data to follow any particular distribution.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to scan. Non-numeric columns are ignored when looking for
        outliers but are kept in the returned rows.
    factor : float, default 1.5
        Multiplier applied to the interquartile range to set the fences.

    Returns
    -------
    pandas.DataFrame
        Every row flagged by at least one numeric column, listed once, in the
        original row order, with a fresh ``0..n-1`` index. Distinct rows that
        happen to hold identical values are all kept. The frame is empty (with
        the same columns as ``x``) when nothing is flagged.
    """
    # One flag per row: a row flagged by several columns is still selected only
    # once, so no concat-in-a-loop or de-duplication pass is needed afterwards.
    flagged_rows = np.zeros(len(x), dtype=bool)
    for column in x.select_dtypes(include=[np.number]).columns:
        values = x[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr
        outside = (values < lower_bound) | (values > upper_bound)
        # Comparisons on nullable dtypes (e.g. Int64) give <NA> for missing
        # entries; a missing value is not treated as an outlier.
        flagged_rows |= outside.fillna(False).to_numpy(dtype=bool)
    return x.loc[flagged_rows].reset_index(drop=True)

def corr(x):
    """Show an annotated heatmap of the pairwise correlations between the numeric columns of `x`.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame whose numeric columns are correlated; other columns are ignored.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``.
    """
    plt.figure(figsize=(10, 8))
    numeric_columns = x.select_dtypes(include=[np.number])
    correlation_matrix = numeric_columns.corr()
    sns.heatmap(correlation_matrix, annot=True)
    plt.show()
    return None

def hist(x):
    """Display 50-bin histograms of the DataFrame `x`, as drawn by ``DataFrame.hist``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame whose columns are plotted.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``; nothing is returned.
    """
    x.hist(figsize=(25, 10), bins=50)
    plt.show()
    return None




# Data Loading & Check

In [3]:
# Load the raw data set from "train.csv" (path is relative to the working directory).
df_raw = pd.read_csv("train.csv")

In [4]:
# Inspect the raw frame: column dtypes, non-null counts and summary statistics.
checkData(df_raw)


Info dos valores:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4467 entries, 0 to 4466
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           4467 non-null   object 
 1   Delivery_person_ID           4467 non-null   object 
 2   Delivery_person_Age          4467 non-null   object 
 3   Delivery_person_Ratings      4467 non-null   object 
 4   Restaurant_latitude          4467 non-null   float64
 5   Restaurant_longitude         4467 non-null   float64
 6   Delivery_location_latitude   4467 non-null   float64
 7   Delivery_location_longitude  4467 non-null   float64
 8   Order_Date                   4467 non-null   object 
 9   Time_Orderd                  4467 non-null   object 
 10  Time_Order_picked            4467 non-null   object 
 11  Weatherconditions            4467 non-null   object 
 12  Road_traffic_density         4467 non-null   object 
 13

In [5]:
# Fixed seed so the same 30 rows are shown on every Restart & Run All.
df_raw.sample(30, random_state=42)

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min)
4174,0x6354,RANCHIRES08DEL02,25,4.7,23.353783,85.326967,23.433783,85.406967,04-04-2022,23:00:00,23:10:00,conditions Sandstorms,Low,1,Buffet,scooter,0.0,No,Metropolitian,(min) 19
1076,0x699a,PUNERES010DEL02,29,4.7,18.539299,73.897902,18.589299,73.947902,17-03-2022,18:25:00,18:30:00,conditions Windy,Medium,2,Meal,scooter,0.0,No,,(min) 21
3892,0x419c,RANCHIRES06DEL01,28,4.7,23.357804,85.325146,23.397804,85.365146,05-04-2022,12:40:00,12:55:00,conditions Sandstorms,High,0,Snack,motorcycle,0.0,No,Urban,(min) 38
2497,0x6f8e,MYSRES04DEL01,22,4.6,12.3085,76.665808,12.3185,76.675808,13-03-2022,08:40:00,08:55:00,conditions Stormy,Low,2,Snack,electric_scooter,1.0,No,Urban,(min) 14
2500,0x1a9e,JAPRES11DEL03,35,4.9,26.90294,75.793007,26.96294,75.853007,09-03-2022,23:35:00,23:40:00,conditions Sunny,Low,2,Drinks,scooter,1.0,No,Metropolitian,(min) 16
4223,0x8c98,JAPRES01DEL03,34,4.9,26.90519,75.810753,26.93519,75.840753,17-03-2022,21:50:00,22:00:00,conditions Sunny,Jam,2,Meal,electric_scooter,,No,Metropolitian,(min) 15
3291,0x6532,VADRES15DEL03,29,4.6,0.0,0.0,0.09,0.09,20-03-2022,18:55:00,19:00:00,conditions Cloudy,Medium,1,Buffet,motorcycle,0.0,No,Urban,(min) 35
1398,0x6569,INDORES01DEL02,25,4.9,22.695207,75.866059,22.745207,75.916059,19-03-2022,23:10:00,23:25:00,conditions Stormy,Low,1,Meal,scooter,1.0,No,Metropolitian,(min) 19
3468,0xd41,MUMRES17DEL02,38,4.4,19.121999,72.908493,19.171999,72.958493,26-03-2022,23:20:00,23:30:00,conditions Windy,Low,0,Buffet,motorcycle,1.0,No,Metropolitian,(min) 24
3515,0xbe36,SURRES07DEL01,25,2.7,21.170798,72.790489,21.240798,72.860489,06-04-2022,22:15:00,22:30:00,conditions Sunny,Low,2,Drinks,motorcycle,1.0,No,Metropolitian,(min) 31


# Clear & Transform

Some columns clearly contain missing values, but they do not show up in the null check. This is probably because those columns have the object dtype: the missing entries are stored as strings with odd formatting, such as " NaN" with a blank space, so pandas treats them as ordinary text.<br>The first step is to strip the spaces from all string columns, then convert each column to the appropriate type, and finally fill the missing entries with the most appropriate value.

## Strip spaces, fix typos and types

In [6]:
# Work on a copy so the raw frame stays available for comparison.
df_clear = df_raw.copy()
# Strip leading/trailing whitespace from every object (text) column so that
# padded entries (e.g. " NaN") can be recognised as missing when converted.
df_clear = df_clear.apply(lambda col: col.str.strip() if col.dtype == "object" else col)
# Convert numeric text columns in one vectorised call each; values that cannot
# be parsed become NaN. The nullable "Int64" dtype keeps integers despite NaNs.
df_clear["Delivery_person_Age"] = pd.to_numeric(df_clear["Delivery_person_Age"], errors="coerce").astype("Int64")
df_clear["Delivery_person_Ratings"] = pd.to_numeric(df_clear["Delivery_person_Ratings"], errors="coerce")
df_clear["multiple_deliveries"] = pd.to_numeric(df_clear["multiple_deliveries"], errors="coerce").astype("Int64")
# Fix the typo in the column name (rename returns a new frame; no in-place mutation).
df_clear = df_clear.rename(columns={"Time_Orderd": "Time_Ordered"})
# Parse the date and time columns. The two time columns carry no date, so
# pandas fills in its default date (1900-01-01) for them.
df_clear["Order_Date"] = pd.to_datetime(df_clear["Order_Date"], format="%d-%m-%Y")
df_clear["Time_Ordered"] = pd.to_datetime(df_clear["Time_Ordered"], format="%H:%M:%S")
df_clear["Time_Order_picked"] = pd.to_datetime(df_clear["Time_Order_picked"], format="%H:%M:%S")
# Keep only the digits of "Time_taken(min)" (drops the "(min) " prefix) and store
# them as nullable integers. expand=False makes str.extract return a Series.
df_clear["Time_taken(min)"] = pd.to_numeric(
    df_clear["Time_taken(min)"].str.extract(r'(\d+)', expand=False), errors="coerce"
).astype("Int64")


## Filling NaNs

In [7]:
# Re-inspect after the type conversions: the non-null counts now expose the missing values.
checkData(df_clear)


Info dos valores:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4467 entries, 0 to 4466
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   ID                           4467 non-null   object        
 1   Delivery_person_ID           4467 non-null   object        
 2   Delivery_person_Age          4278 non-null   Int64         
 3   Delivery_person_Ratings      4272 non-null   float64       
 4   Restaurant_latitude          4467 non-null   float64       
 5   Restaurant_longitude         4467 non-null   float64       
 6   Delivery_location_latitude   4467 non-null   float64       
 7   Delivery_location_longitude  4467 non-null   float64       
 8   Order_Date                   4467 non-null   datetime64[ns]
 9   Time_Ordered                 4288 non-null   datetime64[ns]
 10  Time_Order_picked            4467 non-null   datetime64[ns]
 11  Weatherconditions      

# Feature Engineering

To make the time columns easier to work with, "Order_Date" will be merged with "Time_Ordered" and with "Time_Order_picked", turning each of them into a full timestamp. For the same reason, "Time_taken(min)" will be converted to a timestamp computed as "Time_Order_picked" + "Time_taken(min)".

In [8]:
# Start from a copy so later changes to df_work do not modify df_clear.
df_work = df_clear.copy()

In [9]:
# Scratch note (not executed): .dt.strftime('%d/%m/%Y') would format a datetime column as day/month/year text.

In [10]:
# Count missing and present values per column in the cleaned frame.
checkNaN(df_clear)

Null values:
 ID                               0
Delivery_person_ID               0
Delivery_person_Age            189
Delivery_person_Ratings        195
Restaurant_latitude              0
Restaurant_longitude             0
Delivery_location_latitude       0
Delivery_location_longitude      0
Order_Date                       0
Time_Ordered                   179
Time_Order_picked                0
Weatherconditions                0
Road_traffic_density             0
Vehicle_condition                0
Type_of_order                    0
Type_of_vehicle                  0
multiple_deliveries            104
Festival                         0
City                             1
Time_taken(min)                  1
dtype: int64
Non-null values:
 ID                             4467
Delivery_person_ID             4467
Delivery_person_Age            4278
Delivery_person_Ratings        4272
Restaurant_latitude            4467
Restaurant_longitude           4467
Delivery_location_latitude     4467
Del

In [11]:
# Show the last five rows of the cleaned frame.
df_clear.tail(5)

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Ordered,Time_Order_picked,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min)
4462,0x2226,SURRES16DEL03,34.0,4.8,21.160437,72.774209,21.220437,72.834209,2022-04-01,1900-01-01 17:25:00,1900-01-01 17:30:00,conditions Cloudy,Medium,1,Meal,motorcycle,1,No,Metropolitian,17.0
4463,0xdf39,BHPRES04DEL02,37.0,4.8,23.184992,77.417227,23.264992,77.497227,2022-02-16,1900-01-01 22:30:00,1900-01-01 22:40:00,conditions Sandstorms,Low,2,Meal,electric_scooter,1,No,Urban,26.0
4464,0xd0c7,KOLRES11DEL01,,,22.577821,88.400581,22.617821,88.440581,2022-02-15,NaT,1900-01-01 15:30:00,conditions Fog,Medium,2,Drinks,motorcycle,1,No,Metropolitian,18.0
4465,0x363d,INDORES13DEL03,34.0,4.6,22.745049,75.892471,22.835049,75.982471,2022-03-23,1900-01-01 22:00:00,1900-01-01 22:15:00,conditions Sunny,Jam,1,Drinks,motorcycle,1,No,Urban,22.0
4466,0xdfc5,KNPRES03DEL03,33.0,4.4,26.469003,80.316344,26.529003,80.376344,2022-02-17,1900-01-01 20:40:00,1900-01-01 20:55:00,conditions Sandstorms,Jam,0,Snack,motorcycle,1,No,,


In [12]:
# Show the last five rows of the raw frame, to compare against the cleaned values.
df_raw.tail(5)

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min)
4462,0x2226,SURRES16DEL03,34.0,4.8,21.160437,72.774209,21.220437,72.834209,01-04-2022,17:25:00,17:30:00,conditions Cloudy,Medium,1,Meal,motorcycle,1,No,Metropolitian,(min) 17
4463,0xdf39,BHPRES04DEL02,37.0,4.8,23.184992,77.417227,23.264992,77.497227,16-02-2022,22:30:00,22:40:00,conditions Sandstorms,Low,2,Meal,electric_scooter,1,No,Urban,(min) 26
4464,0xd0c7,KOLRES11DEL01,,,22.577821,88.400581,22.617821,88.440581,15-02-2022,,15:30:00,conditions Fog,Medium,2,Drinks,motorcycle,1,No,Metropolitian,(min) 18
4465,0x363d,INDORES13DEL03,34.0,4.6,22.745049,75.892471,22.835049,75.982471,23-03-2022,22:00:00,22:15:00,conditions Sunny,Jam,1,Drinks,motorcycle,1,No,Urban,(min) 22
4466,0xdfc5,KNPRES03DEL03,33.0,4.4,26.469003,80.316344,26.529003,80.376344,17-02-2022,20:40:00,20:55:00,conditions Sandstorms,Jam,0,Snack,motorcycle,1,No,,
