# Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Set Globals & Functions

In [2]:
def checkData(x):
    """Print a structural summary and descriptive statistics of a DataFrame.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("\nInfo dos valores:\n")
    # DataFrame.info() writes its report itself and returns None, so it must not
    # be wrapped in print(): doing so appends a stray "None" line to the output.
    x.info()
    print(f"\nDescrição dos dados:\n\n{x.describe()}")

def checkNaN(x):
    """Print, for every column of a DataFrame, how many values are missing and how many are present.

    The two per-column tallies are written to standard output; nothing is returned.
    """
    missing, present = x.isna().sum(), x.notnull().sum()
    print(f"Null values:\n {missing}")
    print(f"Non-null values:\n {present}")

def checkOutliers(x):
    """Return the rows of a DataFrame that are outliers in at least one numeric column.

    A value is treated as an outlier when it lies outside the interquartile
    fences of its column, i.e. below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``. The rule relies on quartiles only; it does not require
    the data to be normally distributed.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to scan. Only columns selected by
        ``select_dtypes(include=[np.number])`` are checked.

    Returns
    -------
    pandas.DataFrame
        The flagged rows with all columns of ``x``, in their original row order
        and with a fresh ``0..n-1`` index. A row flagged by several columns
        appears once; distinct rows that happen to hold identical values are all
        kept (they are de-duplicated by position, not by content).
    """
    # One flag per row, combined across columns. Working positionally avoids
    # growing a frame with concat inside the loop and keeps the original dtypes.
    is_outlier = np.zeros(len(x), dtype=bool)
    for column in x.select_dtypes(include=[np.number]).columns:
        values = x[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        beyond_fences = (values < lower_bound) | (values > upper_bound)
        # Missing values never count as outliers: with nullable dtypes the
        # comparison yields <NA>, which is mapped to False here.
        is_outlier |= beyond_fences.fillna(False).to_numpy(dtype=bool)
    return x.loc[is_outlier].reset_index(drop=True)

def corr(x):
    """Show an annotated heatmap of the correlations between the numeric columns of a DataFrame.

    Only the columns selected by ``select_dtypes(include=[np.number])`` take
    part. The figure is displayed immediately and nothing is returned.
    """
    plt.figure(figsize=(10, 8))
    numeric_columns = x.select_dtypes(include=[np.number])
    correlation_matrix = numeric_columns.corr()
    sns.heatmap(correlation_matrix, annot=True)
    plt.show()

def hist(x):
    """Plot the DataFrame with ``DataFrame.hist`` (50 bins, 25x10 figure) and show it.

    The figure is displayed immediately and nothing is returned.
    """
    histogram_options = {"bins": 50, "figsize": (25, 10)}
    x.hist(**histogram_options)
    plt.show()




# Data Loading & Check

In [3]:
# Read train.csv (path is relative to the current working directory) into the raw frame.
df_raw = pd.read_csv("train.csv")

In [4]:
# Print the structure (dtypes, non-null counts) and summary statistics of the raw data.
checkData(df_raw)


Info dos valores:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4467 entries, 0 to 4466
Data columns (total 20 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           4467 non-null   object 
 1   Delivery_person_ID           4467 non-null   object 
 2   Delivery_person_Age          4467 non-null   object 
 3   Delivery_person_Ratings      4467 non-null   object 
 4   Restaurant_latitude          4467 non-null   float64
 5   Restaurant_longitude         4467 non-null   float64
 6   Delivery_location_latitude   4467 non-null   float64
 7   Delivery_location_longitude  4467 non-null   float64
 8   Order_Date                   4467 non-null   object 
 9   Time_Orderd                  4467 non-null   object 
 10  Time_Order_picked            4467 non-null   object 
 11  Weatherconditions            4467 non-null   object 
 12  Road_traffic_density         4467 non-null   object 
 13

In [5]:
# Eyeball 30 random rows. random_state is fixed (arbitrary value) so the same rows
# are shown on every "Restart & Run All" and the notes below stay in sync with them.
df_raw.sample(30, random_state=42)

Unnamed: 0,ID,Delivery_person_ID,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Order_Date,Time_Orderd,Time_Order_picked,Weatherconditions,Road_traffic_density,Vehicle_condition,Type_of_order,Type_of_vehicle,multiple_deliveries,Festival,City,Time_taken(min)
4080,0x7e2f,RANCHIRES17DEL01,31.0,5.0,23.374878,85.335739,23.384878,85.345739,09-03-2022,10:45:00,10:55:00,conditions Sunny,Low,1,Drinks,scooter,1,No,Urban,(min) 13
1816,0xa034,SURRES13DEL03,28.0,4.8,21.170096,72.789122,21.200096,72.819122,28-03-2022,22:45:00,22:55:00,conditions Sandstorms,Low,2,Snack,electric_scooter,1,No,Metropolitian,(min) 17
1922,0xc92f,BHPRES18DEL01,26.0,4.6,23.234631,77.401663,23.304631,77.471663,18-02-2022,21:55:00,22:00:00,conditions Sandstorms,Jam,2,Meal,electric_scooter,0,No,Urban,(min) 15
2349,0x922,SURRES13DEL02,,,21.170096,72.789122,21.300096,72.919122,23-03-2022,,20:40:00,conditions NaN,,0,Buffet,motorcycle,0,No,Metropolitian,(min) 39
2767,0xb890,VADRES02DEL01,25.0,5.0,0.0,0.0,0.11,0.11,16-03-2022,19:00:00,19:05:00,conditions Fog,Medium,0,Drinks,motorcycle,0,No,Metropolitian,(min) 30
476,0xcc11,AURGRES05DEL02,38.0,5.0,19.875522,75.367127,19.925522,75.417127,15-02-2022,21:20:00,21:35:00,conditions Sandstorms,Jam,1,Meal,scooter,1,No,Metropolitian,(min) 34
1496,0xa17a,HYDRES11DEL03,20.0,4.3,17.430448,78.418213,17.520448,78.508213,14-03-2022,21:30:00,21:45:00,conditions Cloudy,Jam,1,Drinks,scooter,1,No,Metropolitian,(min) 35
624,0x8d48,JAPRES18DEL03,36.0,4.8,26.913987,75.752891,26.943987,75.782891,13-03-2022,23:10:00,23:20:00,conditions Cloudy,Low,0,Meal,motorcycle,1,No,Urban,(min) 20
2825,0xc59a,LUDHRES12DEL03,26.0,4.6,30.893244,75.821817,30.983244,75.911817,14-02-2022,17:35:00,17:45:00,conditions Sunny,Medium,0,Meal,motorcycle,1,No,Metropolitian,(min) 29
1249,0x158c,HYDRES09DEL01,34.0,4.7,17.431668,78.408321,17.441668,78.418321,26-03-2022,11:55:00,12:10:00,conditions Stormy,High,0,Meal,motorcycle,3,No,Metropolitian,(min) 44


# Clear & Transform

Some columns clearly contain missing values, but these do not show up in the NaN check. This is probably because those columns have the object dtype and store the missing value as text, possibly with odd formatting such as " NaN" (with a leading blank space).<br>The first step is to strip the surrounding whitespace from all string columns, then convert each column to the appropriate type and fill the missing values with the most appropriate value.

In [None]:
# Build the cleaned frame from df_raw without touching it: `apply` returns a new
# DataFrame, so df_raw remains the unmodified original.
# Strip leading/trailing whitespace from every object (string) column.
df_clear = df_raw.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
# Convert to numeric types where needed. pd.to_numeric is called once per column
# (vectorised) instead of once per value; errors="coerce" turns unparseable text
# into NaN, and the nullable "Int64" dtype keeps integer columns integer despite
# the missing values.
df_clear["Delivery_person_Age"] = pd.to_numeric(df_clear["Delivery_person_Age"], errors="coerce").astype("Int64")
df_clear["Delivery_person_Ratings"] = pd.to_numeric(df_clear["Delivery_person_Ratings"], errors="coerce")
df_clear["multiple_deliveries"] = pd.to_numeric(df_clear["multiple_deliveries"], errors="coerce").astype("Int64")
# Parse the order date (day-month-year text) into datetime64.
# The time-of-day columns (Time_Orderd, Time_Order_picked) are left as text here.
df_clear["Order_Date"] = pd.to_datetime(df_clear["Order_Date"], format="%d-%m-%Y")
# NOTE(review): stripping whitespace does not turn textual placeholders such as
# "NaN" into real missing values, so the remaining text columns presumably still
# report 0 nulls in checkNaN. Confirm and handle them before relying on the counts.


In [17]:
# Print the null / non-null counts per column of the cleaned frame.
checkNaN(df_clear)

Null values:
 ID                               0
Delivery_person_ID               0
Delivery_person_Age            189
Delivery_person_Ratings        195
Restaurant_latitude              0
Restaurant_longitude             0
Delivery_location_latitude       0
Delivery_location_longitude      0
Order_Date                       0
Time_Orderd                      0
Time_Order_picked                0
Weatherconditions                0
Road_traffic_density             0
Vehicle_condition                0
Type_of_order                    0
Type_of_vehicle                  0
multiple_deliveries            104
Festival                         0
City                             1
Time_taken(min)                  1
dtype: int64
Non-null values:
 ID                             4467
Delivery_person_ID             4467
Delivery_person_Age            4278
Delivery_person_Ratings        4272
Restaurant_latitude            4467
Restaurant_longitude           4467
Delivery_location_latitude     4467
Del

In [18]:
# Display the dtype of each df_clear column.
df_clear.dtypes

ID                                     object
Delivery_person_ID                     object
Delivery_person_Age                     Int64
Delivery_person_Ratings               float64
Restaurant_latitude                   float64
Restaurant_longitude                  float64
Delivery_location_latitude            float64
Delivery_location_longitude           float64
Order_Date                     datetime64[ns]
Time_Orderd                            object
Time_Order_picked                      object
Weatherconditions                      object
Road_traffic_density                   object
Vehicle_condition                       int64
Type_of_order                          object
Type_of_vehicle                        object
multiple_deliveries                     Int64
Festival                               object
City                                   object
Time_taken(min)                        object
dtype: object