# Filling NULL Values of Categorical Columns with a New "NONE" Category, and Fixing Time and Ratings

In [1]:
import numpy as np
import pandas as pd

from warnings import filterwarnings
filterwarnings("ignore")

In [2]:
# Load the raw dataset; the path is relative, so it resolves against the
# directory the notebook is run from.
df = pd.read_csv('../dataset/data.csv')

In [3]:
# (rows, columns) of the freshly loaded frame.
df.shape

(45593, 21)

In [4]:
# Column dtypes and non-null counts before any cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45593 entries, 0 to 45592
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           45593 non-null  object 
 1   Delivery_person_ID           45593 non-null  object 
 2   Delivery_person_Age          43739 non-null  float64
 3   Delivery_person_Ratings      43685 non-null  float64
 4   Restaurant_latitude          45593 non-null  float64
 5   Restaurant_longitude         45593 non-null  float64
 6   Delivery_location_latitude   45593 non-null  float64
 7   Delivery_location_longitude  45593 non-null  float64
 8   Order_Date                   45593 non-null  object 
 9   Time_Orderd                  43862 non-null  object 
 10  Time_Order_picked            45593 non-null  object 
 11  Weather                      44977 non-null  object 
 12  Road_traffic_density         44992 non-null  object 
 13  Vehicle_conditio

### Fixing Type

In [5]:
# Store Vehicle_condition as a compact 8-bit integer.
# The cast cannot represent NaN, so it relies on the column being fully populated.
# Assumes every value fits in the int8 range (-128..127) -- TODO confirm.
df['Vehicle_condition'] = df['Vehicle_condition'].astype(np.int8)

### Correcting Time

In [6]:
def fix_time(x):
    """Repair a malformed clock string so that it denotes a valid time of day.

    Parameters
    ----------
    x : str, float or None
        Time written as ``hh:mm`` or ``h:mm``. ``None`` and any float
        (pandas stores a missing value in an object column as float NaN)
        are treated as missing.

    Returns
    -------
    str or float
        ``np.nan`` for missing input, otherwise:

        * ``"24:00"`` and ``"23:60"`` -> ``"0:0"`` (midnight);
        * hour ``24`` with valid minutes -> hour ``00`` (``"24:15"`` -> ``"00:15"``);
        * a minute field whose tens digit is ``6`` rolls over into the next
          hour and keeps its units digit (``"9:60"`` -> ``"10:00"``,
          ``"23:65"`` -> ``"0:05"``, ``"24:65"`` -> ``"1:05"``);
        * every other string is returned unchanged.
    """
    # isinstance (rather than an exact type comparison) also catches
    # np.float64 NaN, which subclasses float; None is handled explicitly.
    if x is None or isinstance(x, float):
        return np.nan

    if x == '24:00' or x == '23:60':
        return "0:0"

    # Length guard: strings shorter than two characters have no tens digit
    # to inspect and are passed through like any other unrecognised value.
    minute_overflow = len(x) >= 2 and x[-2] == '6'

    if x[:2] == "24":
        if minute_overflow:
            # Hour 24 wraps to 0 and the overflowing minutes add one hour.
            return "1:0" + x[-1]
        return "00:" + x[3:]

    if minute_overflow:
        if x[1] == ':':
            # Single-digit hour: at most 9 + 1 = 10, so no wrap-around needed.
            return str(int(x[0]) + 1) + ":0" + x[-1]
        # Two-digit hour: 23 + 1 wraps back to 0.
        return str((int(x[:2]) + 1) % 24) + ":0" + x[-1]
    return x

In [7]:
# Repair malformed clock strings in both time columns.
# Missing (NaN) entries are returned as NaN by fix_time, so they stay missing here.
df['Time_Orderd'] = df['Time_Orderd'].apply(fix_time)
df['Time_Order_picked'] = df['Time_Order_picked'].apply(fix_time)

### Fixing Ratings

In [8]:
def ratings(x):
    """Clamp a rating into the closed interval [1, 5].

    Values below 1 become 1, values above 5 become 5, and anything else
    (including NaN, for which both comparisons are false) is returned as is.
    """
    floor, ceiling = 1, 5
    if x > ceiling:
        return ceiling
    if x < floor:
        return floor
    return x

In [9]:
# Clamp every rating into the 1-5 range; NaN ratings pass through unchanged.
df['Delivery_person_Ratings'] = df['Delivery_person_Ratings'].apply(ratings)

In [10]:
def check_null(df):
    '''Report the share of missing values per column.

    Returns a Series of percentages rounded to two decimals, indexed by
    column name and ordered from the most to the least incomplete column.
    '''
    missing_per_column = df.isnull().sum()
    percent_missing = 100 * missing_per_column / len(df)
    return percent_missing.round(2).sort_values(ascending=False)

In [11]:
# Percentage of missing values per column before any imputation.
check_null(df)

Delivery_person_Ratings        4.18
Delivery_person_Age            4.07
Time_Orderd                    3.80
City                           2.63
multiple_deliveries            2.18
Weather                        1.35
Road_traffic_density           1.32
Festival                       0.50
ID                             0.00
Vehicle_condition              0.00
Time_taken                     0.00
Type_of_vehicle                0.00
Type_of_order                  0.00
Time_Order_picked              0.00
Delivery_person_ID             0.00
Order_Date                     0.00
Delivery_location_longitude    0.00
Delivery_location_latitude     0.00
Restaurant_longitude           0.00
Restaurant_latitude            0.00
kfold                          0.00
dtype: float64

In [12]:
# multiple_deliveries -> number of orders delivered in one attempt.
# Zero orders in an attempt is not possible, so 0 presumably encodes a single
# order (i.e. the column is stored off by one) -- TODO confirm with the data source.
# Under that reading, two orders per attempt would be the most frequent case.
df['multiple_deliveries'].value_counts()

1.0    28159
0.0    14095
2.0     1985
3.0      361
Name: multiple_deliveries, dtype: int64

In [13]:
# Shift every count up by one so that a stored 0 reads as "1 order per attempt".
# Vectorised Series addition; NaN entries stay NaN.
# NOTE: not idempotent -- re-running this cell adds another 1 to every row.
df['multiple_deliveries'] = df['multiple_deliveries'] + 1

In [14]:
# Verify the shift: each category should keep its frequency with its label moved up by one.
df['multiple_deliveries'].value_counts()

2.0    28159
1.0    14095
3.0     1985
4.0      361
Name: multiple_deliveries, dtype: int64

In [15]:
# Null percentages should be unchanged by the shift, because NaN + 1 is still NaN.
check_null(df)

Delivery_person_Ratings        4.18
Delivery_person_Age            4.07
Time_Orderd                    3.80
City                           2.63
multiple_deliveries            2.18
Weather                        1.35
Road_traffic_density           1.32
Festival                       0.50
ID                             0.00
Vehicle_condition              0.00
Time_taken                     0.00
Type_of_vehicle                0.00
Type_of_order                  0.00
Time_Order_picked              0.00
Delivery_person_ID             0.00
Order_Date                     0.00
Delivery_location_longitude    0.00
Delivery_location_latitude     0.00
Restaurant_longitude           0.00
Restaurant_latitude            0.00
kfold                          0.00
dtype: float64

In [16]:
# Columns whose missing values are replaced by a placeholder category below.
# Time_Orderd holds clock strings but is listed here as well, so its missing
# entries receive the same placeholder.
cat_cols = ['City', 'Weather', 'Road_traffic_density', 'Time_Orderd', "Festival"]

In [17]:
# Replace NaN in the categorical columns with the literal string "NONE",
# so that "missing" becomes a category of its own.
df[cat_cols] = df[cat_cols].fillna("NONE")

In [18]:
# After imputation, only columns outside cat_cols should still report missing values.
check_null(df)

Delivery_person_Ratings        4.18
Delivery_person_Age            4.07
multiple_deliveries            2.18
ID                             0.00
Road_traffic_density           0.00
Time_taken                     0.00
City                           0.00
Festival                       0.00
Type_of_vehicle                0.00
Type_of_order                  0.00
Vehicle_condition              0.00
Time_Order_picked              0.00
Weather                        0.00
Delivery_person_ID             0.00
Time_Orderd                    0.00
Order_Date                     0.00
Delivery_location_longitude    0.00
Delivery_location_latitude     0.00
Restaurant_longitude           0.00
Restaurant_latitude            0.00
kfold                          0.00
dtype: float64

In [19]:
# Re-inspect dtypes and non-null counts after the categorical imputation.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45593 entries, 0 to 45592
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   ID                           45593 non-null  object 
 1   Delivery_person_ID           45593 non-null  object 
 2   Delivery_person_Age          43739 non-null  float64
 3   Delivery_person_Ratings      43685 non-null  float64
 4   Restaurant_latitude          45593 non-null  float64
 5   Restaurant_longitude         45593 non-null  float64
 6   Delivery_location_latitude   45593 non-null  float64
 7   Delivery_location_longitude  45593 non-null  float64
 8   Order_Date                   45593 non-null  object 
 9   Time_Orderd                  45593 non-null  object 
 10  Time_Order_picked            45593 non-null  object 
 11  Weather                      45593 non-null  object 
 12  Road_traffic_density         45593 non-null  object 
 13  Vehicle_conditio

In [20]:
# Number of distinct IDs; if it equals the row count, ID only identifies rows.
df['ID'].nunique()

45593

In [21]:
# Every ID value is distinct, so the column is removed from the frame.
df = df.drop(columns=['ID'])

In [22]:
# Write the cleaned frame to CSV; index=False keeps the row index out of the file.
df.to_csv("../dataset/cat_data.csv", index=False)