In [30]:
import pandas as pd
from utils import identify_rush_hour

In [31]:
# Load every raw CSV from the ./dataset folder into its own DataFrame.
bookings_df = pd.read_csv(filepath_or_buffer="./dataset/bookings.csv")
customers_df = pd.read_csv(filepath_or_buffer="./dataset/customers.csv")
drivers_df = pd.read_csv(filepath_or_buffer="./dataset/drivers.csv")
location_demand_df = pd.read_csv(filepath_or_buffer="./dataset/location_demand.csv")
time_features_df = pd.read_csv(filepath_or_buffer="./dataset/time_features.csv")

In [32]:
# Remove the column cap so wide frames are displayed in full.
pd.options.display.max_columns = None

<h1><b>Handling null values in bookings_df</b></h1>

In [33]:
# Fill the gaps in bookings_df, choosing the fill value by booking status.
status_is_completed = bookings_df["booking_status"] == "Completed"
reason_is_missing = bookings_df["incomplete_ride_reason"].isna()
ride_time_is_missing = bookings_df["actual_ride_time_min"].isna()

# All masks are built before any assignment, so each one reflects the raw data.
mask_not_completed = ~status_is_completed & reason_is_missing
mask_completed = status_is_completed & reason_is_missing
mask_actual_ride_time_min = ~status_is_completed & ride_time_is_missing

# Non-completed bookings with no recorded reason are labelled "Unknown";
# completed bookings have no reason to report, so they get "Not Applicable".
bookings_df.loc[mask_not_completed, "incomplete_ride_reason"] = "Unknown"
bookings_df.loc[mask_completed, "incomplete_ride_reason"] = "Not Applicable"

# -1 is a placeholder for the missing ride time of non-completed bookings.
bookings_df.loc[mask_actual_ride_time_min, "actual_ride_time_min"] = -1

# Remaining null count per column (displayed as the cell output).
bookings_df.isna().sum()


booking_id                 0
booking_date               0
booking_time               0
day_of_week                0
is_weekend                 0
hour_of_day                0
city                       0
pickup_location            0
drop_location              0
vehicle_type               0
ride_distance_km           0
estimated_ride_time_min    0
actual_ride_time_min       0
traffic_level              0
weather_condition          0
base_fare                  0
surge_multiplier           0
booking_value              0
booking_status             0
incomplete_ride_reason     0
customer_id                0
driver_id                  0
dtype: int64

In [34]:
# Join the separate date and time columns into one datetime column so that
# time-based analysis works on a single, properly typed field.
# NOTE: the two source columns are dropped afterwards, so re-running this cell
# without reloading bookings_df raises a KeyError.
date_and_time_text = bookings_df["booking_date"] + " " + bookings_df["booking_time"]
bookings_df["booking_datetime"] = pd.to_datetime(date_and_time_text)

bookings_df = bookings_df.drop(["booking_date", "booking_time"], axis=1)

<h1><b>Feature Engineering</b></h1>

In [35]:
# Fare per kilometre travelled, rounded to 2 decimals.
bookings_df["fare_per_km"] = (
    bookings_df["booking_value"] / bookings_df["ride_distance_km"]
).round(2)
# A zero distance would otherwise leave inf (or NaN for 0/0) in the feature.
# Non-positive distances get 0, the same convention fare_per_min uses for
# non-positive ride times.
bookings_df.loc[bookings_df["ride_distance_km"] <= 0, "fare_per_km"] = 0

In [36]:
# Fare per minute of actual ride time, rounded to 2 decimals.
# Rows whose actual_ride_time_min is zero or negative (including the -1
# placeholder written during null handling) get 0 instead of the raw quotient.
bookings_df["fare_per_min"] = (
    bookings_df["booking_value"]
    .div(bookings_df["actual_ride_time_min"])
    .round(2)
    .mask(bookings_df["actual_ride_time_min"] <= 0, 0)
)

In [37]:
# Run the project helper identify_rush_hour on every booking row and store
# its result as the rush-hour flag.
bookings_df["rush_hour_flag"] = bookings_df.apply(identify_rush_hour, axis="columns")

# Distribution of the flag (displayed as the cell output).
bookings_df["rush_hour_flag"].value_counts()

rush_hour_flag
0    94263
1     5737
Name: count, dtype: int64

In [38]:
# A ride is flagged as long-distance when its distance is at least one standard
# deviation above the mean ride distance AND the booking was completed.
ride_km = bookings_df["ride_distance_km"]
threshold = ride_km.mean() + ride_km.std()

bookings_df["long_distance_flag"] = (
    ride_km.ge(threshold) & bookings_df["booking_status"].eq("Completed")
).astype(int)

# Distribution of the flag (displayed as the cell output).
bookings_df["long_distance_flag"].value_counts()


long_distance_flag
0    85484
1    14516
Name: count, dtype: int64