Project Name: GrabRide Fare-Time Cost Optimization

Project Type: Raw Data Collection and Predictive Modeling 

Author: Melvin Tai (Last Updated: December 27th, 2025)

Let's call the trip's starting point and destination "A" and "B", respectively.

In [136]:
# Importing libraries.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [138]:
# Load the raw GrabRide observations and take a first look at them.
df = pd.read_csv(filepath_or_buffer = "GrabRide_Data.csv")
df.info()  # dtypes and non-null counts per column
df.describe(include = "all")  # summary statistics for every column, not only the numeric ones

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   date               143 non-null    object 
 1   time               143 non-null    object 
 2   fare_standard_RM   143 non-null    int64  
 3   driver_est_min     143 non-null    int64  
 4   distance_km        143 non-null    float64
 5   duration_min       143 non-null    int64  
 6   fastest_route      143 non-null    object 
 7   weather            143 non-null    object 
 8   imap_distance_km   96 non-null     float64
 9   imap_duration_min  96 non-null     float64
 10  tolls_true_false   96 non-null     float64
dtypes: float64(4), int64(3), object(4)
memory usage: 12.4+ KB


Unnamed: 0,date,time,fare_standard_RM,driver_est_min,distance_km,duration_min,fastest_route,weather,imap_distance_km,imap_duration_min,tolls_true_false
count,143,143,143.0,143.0,143.0,143.0,143,143,96.0,96.0,96.0
unique,8,19,,,,,3,2,,,
top,11/24/2025,7:15,,,,,LSA_E5,Clear,,,
freq,19,8,,,,,123,106,,,
mean,,,36.594406,7.559441,20.824476,26.195804,,,19.197917,33.78125,0.46875
std,,,14.1999,1.225448,1.76061,6.206833,,,1.917344,4.160473,0.501642
min,,,21.0,4.0,15.5,23.0,,,17.0,25.0,0.0
25%,,,24.0,7.0,21.5,24.0,,,17.0,31.0,0.0
50%,,,30.0,7.0,21.5,24.0,,,19.0,33.0,0.0
75%,,,50.0,8.0,21.5,24.0,,,20.0,37.0,1.0


In [146]:
# Derive time-based features from the raw "date" and "time" strings.
df["datetime"] = pd.to_datetime(
    df["date"].str.cat(df["time"], sep = " "),
    format = "%m/%d/%Y %H:%M",
)
# Minutes elapsed since midnight (e.g. 06:30 -> 390).
df["minute_of_the_day"] = df["datetime"].dt.hour.mul(60).add(df["datetime"].dt.minute)
# Integer weekday, Monday = 0.
df["day_of_the_week"] = df["datetime"].dt.dayofweek
df.head()

Unnamed: 0,date,time,fare_standard_RM,driver_est_min,distance_km,duration_min,fastest_route,weather,imap_distance_km,imap_duration_min,tolls_true_false,datetime,minute_of_the_day,day_of_the_week
0,11/24/2025,6:30,48,8,16.8,24,LBP_E10,Rainy,,,,2025-11-24 06:30:00,390,0
1,11/24/2025,6:35,50,7,16.8,26,LBP_E10,Rainy,,,,2025-11-24 06:35:00,395,0
2,11/24/2025,6:40,51,9,16.8,28,LBP_E10,Rainy,,,,2025-11-24 06:40:00,400,0
3,11/24/2025,6:45,51,8,16.8,31,LBP_E10,Rainy,,,,2025-11-24 06:45:00,405,0
4,11/24/2025,6:50,51,9,16.8,32,LBP_E10,Rainy,,,,2025-11-24 06:50:00,410,0


In [148]:
# One-hot encode the two categorical columns. With drop_first = True the first
# level of each column gets no indicator of its own and acts as the baseline.
df_encoded = pd.get_dummies(
    data = df,
    columns = ["fastest_route", "weather"],
    drop_first = True,
)
df_encoded.head()

Unnamed: 0,date,time,fare_standard_RM,driver_est_min,distance_km,duration_min,imap_distance_km,imap_duration_min,tolls_true_false,datetime,minute_of_the_day,day_of_the_week,fastest_route_LP_Route_2,fastest_route_LSA_E5,weather_Rainy
0,11/24/2025,6:30,48,8,16.8,24,,,,2025-11-24 06:30:00,390,0,False,False,True
1,11/24/2025,6:35,50,7,16.8,26,,,,2025-11-24 06:35:00,395,0,False,False,True
2,11/24/2025,6:40,51,9,16.8,28,,,,2025-11-24 06:40:00,400,0,False,False,True
3,11/24/2025,6:45,51,8,16.8,31,,,,2025-11-24 06:45:00,405,0,False,False,True
4,11/24/2025,6:50,51,9,16.8,32,,,,2025-11-24 06:50:00,410,0,False,False,True


In [150]:
# Select the model inputs (X) and the prediction target (y).
TARGET = "fare_standard_RM"

# Numeric predictors that exist before one-hot encoding.
FEATURES = [
    "minute_of_the_day",
    "day_of_the_week",
    "driver_est_min",
    "distance_km",
    "duration_min",
    "imap_distance_km",
    "imap_duration_min",
    "tolls_true_false",
]

# Append every dummy column produced by the encoding step: all route
# indicators first, then all weather indicators.
FEATURES += [
    column
    for prefix in ("fastest_route_", "weather_")
    for column in df_encoded.columns
    if column.startswith(prefix)
]

X = df_encoded.loc[:, FEATURES]
y = df_encoded.loc[:, TARGET]

In [152]:
# Hold out 20% of the rows for evaluation, then fit a random forest on the rest.
# Both the split and the forest are seeded so the results are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)
fare_model = RandomForestRegressor(
    n_estimators = 300,
    max_depth = 12,
    min_samples_leaf = 3,
    random_state = 42,
)
fare_model.fit(X_train, y_train)

In [154]:
# Evaluate the fitted model on the held-out rows: mean absolute error, in RM.
mae = mean_absolute_error(
    y_true = y_test,
    y_pred = fare_model.predict(X_test),
)
print(f"Fare Prediction MAE: RM {mae:.2f}")

Fare Prediction MAE: RM 6.32


In [160]:
# Constructing the price_label function.
# Fare thresholds: the 33rd and 67th percentiles of the observed fares split
# the price range into three bands.
low, high = y.quantile([0.33, 0.67]).to_numpy()
def price_label(price, cheap_max = None, average_max = None):
    """Classify a fare as "Cheap", "Average" or "Pricey".

    Parameters
    ----------
    price : number
        Fare to classify.
    cheap_max : number, optional
        Upper bound (inclusive) of the "Cheap" band. Defaults to the
        notebook-level ``low`` threshold, looked up at call time.
    average_max : number, optional
        Upper bound (inclusive) of the "Average" band. Defaults to the
        notebook-level ``high`` threshold, looked up at call time.

    Returns
    -------
    str
        "Cheap" if ``price <= cheap_max``, "Average" if
        ``price <= average_max``, otherwise "Pricey".

    Notes
    -----
    A NaN price compares False against both thresholds and is therefore
    labelled "Pricey".
    """
    # Explicit thresholds make the function usable (and testable) without
    # relying on notebook globals; omit them to keep the original behaviour.
    if cheap_max is None:
        cheap_max = low
    if average_max is None:
        average_max = high

    if price <= cheap_max:
        return "Cheap"
    elif price <= average_max:
        return "Average"
    return "Pricey"

In [162]:
# Constructing the predict_for_time function.
def predict_for_time(input_date, input_time):
    """Predict the fare for a booking made at the given date and time.

    Every feature except the two time-derived ones is set to its median over
    the feature matrix ``X``; ``minute_of_the_day`` and ``day_of_the_week``
    are computed from the requested timestamp.

    Parameters
    ----------
    input_date : str
        Date text accepted by ``pd.to_datetime`` (e.g. "2025-12-04").
    input_time : str
        Time-of-day text accepted by ``pd.to_datetime`` (e.g. "06:45").

    Returns
    -------
    dict
        ``{"predicted_price_RM": <fare rounded to 2 dp>, "price_label": <str>}``
    """
    moment = pd.to_datetime(f"{input_date} {input_time}")

    # Median baseline first, then override the time features in place so the
    # column order still matches the order the model was trained on.
    feature_row = {
        **X.median().to_dict(),
        "minute_of_the_day": moment.hour * 60 + moment.minute,
        "day_of_the_week": moment.weekday(),
    }

    fare = fare_model.predict(pd.DataFrame([feature_row]))[0]
    return {"predicted_price_RM": round(fare, 2), "price_label": price_label(fare)}

In [166]:
# Constructing the recommend_booking_time function.
def recommend_booking_time(input_date, current_time, latest_arrival = "08:30", travel_buffer_min = 30):
    """Recommend the cheapest predicted booking time in the remaining window.

    The window runs from ``current_time`` to ``latest_arrival`` minus
    ``travel_buffer_min`` minutes, sampled every 5 minutes. With the default
    arguments the window ends at 08:00, which is the cut-off this function
    previously hard-coded while ignoring ``latest_arrival``.

    Parameters
    ----------
    input_date : str
        Date text accepted by ``pd.to_datetime`` (e.g. "2025-12-04").
    current_time : str
        Earliest booking time to consider, "HH:MM".
    latest_arrival : str, optional
        Latest acceptable arrival time on ``input_date``, "HH:MM".
    travel_buffer_min : int or float, optional
        Minutes subtracted from ``latest_arrival`` to obtain the last booking
        slot. NOTE(review): 30 is taken from the gap between the old
        hard-coded 08:00 cut-off and the 08:30 default; presumably it is the
        allowance for pickup plus travel -- confirm.

    Returns
    -------
    dict
        ``{"message": ...}`` when the window has already closed; otherwise
        ``{"recommended_time", "expected_price_RM", "all_predictions"}`` where
        ``all_predictions`` is a DataFrame with one row per candidate slot.
    """
    start_time = pd.to_datetime(f"{input_date} {current_time}")
    # Honour the caller's deadline instead of a fixed 08:00 cut-off.
    end_time = pd.to_datetime(f"{input_date} {latest_arrival}") - pd.Timedelta(minutes = travel_buffer_min)

    if start_time > end_time:
        return {"message": "No valid recommendation window remaining. Please select a time manually."}

    times = pd.date_range(start = start_time, end = end_time, freq = "5min")

    predictions = []
    for t in times:
        slot = t.strftime("%H:%M")
        result = predict_for_time(input_date, slot)
        predictions.append({"time": slot, "price": result["predicted_price_RM"]})

    df_preds = pd.DataFrame(predictions)
    # idxmin returns the first minimum, so ties resolve to the earliest slot.
    best = df_preds.loc[df_preds["price"].idxmin()]
    return {"recommended_time": best["time"], "expected_price_RM": best["price"], "all_predictions": df_preds}

In [168]:
# Testing our work: predict the fare for one timestamp and ask for the
# cheapest booking slot from that time onwards.
today = "2025-12-04"
time = "06:45"
current_prediction = predict_for_time(today, time)
recommendation = recommend_booking_time(today, time)

# Dictionary keys use single quotes inside the double-quoted f-strings: reusing
# the outer quote character inside a replacement field is a SyntaxError on
# Python versions before 3.12.
print(f"Predicted Price at {time}am on {today} (RM): {current_prediction['predicted_price_RM']} +- {mae:.2f}")
print(f"Predicted Price Label (Cheap/Average/Pricey): {current_prediction['price_label']}")
print(f"Recommended Booking Time Today: {recommendation['recommended_time']}am\n")

print(current_prediction, recommendation["recommended_time"] + "\n")

# Last expression of the cell: renders the full table of candidate slots.
recommendation["all_predictions"]

Predicted Price at 06:45am on 2025-12-04 (RM): 24.44 +- 6.32
Predicted Price Label (Cheap/Average/Pricey): Cheap
Recommended Booking Time Today: 06:45am

{'predicted_price_RM': 24.44, 'price_label': 'Cheap'} 06:45



Unnamed: 0,time,price
0,06:45,24.44
1,06:50,24.63
2,06:55,25.23
3,07:00,26.41
4,07:05,26.14
5,07:10,26.04
6,07:15,26.24
7,07:20,26.45
8,07:25,26.89
9,07:30,26.44
