In [None]:
import pandas as pd
# Lift the column display limit so rendered frames show every column.
pd.options.display.max_columns = None

# Read the dataset from the project-relative CSV and render it as the cell output.
dataset_path = "./dataset/final_dataset.csv"
df = pd.read_csv(dataset_path)
df

In [None]:
# Show the column labels of the loaded dataset as the cell output.
df.columns

<h1><b>Splitting train-test data to prevent data leakage</b></h1>

In [None]:
# Column the regressors below are trained to predict.
target_col = "base_fare"

# Columns removed from the feature matrix in addition to the target.
# NOTE(review): these look like identifiers, timestamps and post-ride /
# fare-derived fields that would leak the target — confirm the rationale.
non_feature_cols = [
    "booking_id",
    "booking_date",
    "booking_time",
    "actual_ride_time_min",
    "incomplete_ride_reason",
    "customer_id",
    "driver_id",
    "booking_value",
    "fare_per_km",
    "avg_surge_multiplier",
    "booking_status",
    "surge_multiplier",
    "fare_per_min",
]

y = df[target_col]
x = df.drop(columns=[target_col, *non_feature_cols])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. The split is seeded so that every
# downstream number (correlations, CV scores, test metrics) is reproducible
# across kernel restarts; 42 matches the seed used by the models below.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Later cells assign columns on these frames in place; take explicit copies so
# those writes never target a view/derived slice of `x`.
x_train = x_train.copy()
x_test = x_test.copy()

x_train

<h1><b>Encoding categorical features</b></h1>

In [None]:
from sklearn.preprocessing import OrdinalEncoder
import joblib

import os

# Columns stored with the pandas `object` dtype are the ones that get encoded.
string_columns = x_train.select_dtypes(include='object').columns

# Categories that were not present when fitting are encoded as -1 instead of
# raising at transform time.
ordinal_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Fit on the training split only; the test split is transformed with the
# categories learned from train.
x_train[string_columns] = ordinal_enc.fit_transform(x_train[string_columns])
x_test[string_columns] = ordinal_enc.transform(x_test[string_columns])

# Persist the fitted encoder. The target directory is created first so the
# dump does not fail with FileNotFoundError on a fresh checkout.
encoder_path = "./encoders/fare_prediction_ordinal_encoder.joblib"
os.makedirs(os.path.dirname(encoder_path), exist_ok=True)
joblib.dump(ordinal_enc, encoder_path)

# Per-column count of missing values in the encoded training split.
x_train.isna().sum()

<h1><b>Detecting and treating outliers</b></h1>

In [None]:
from utils import cap_outliers_iqr

# Numeric columns that are passed to the outlier-capping step.
continuous_cols = [
    "ride_distance_km",
    "estimated_ride_time_min",
    "customer_age",
    "customer_signup_days_ago",
    "customer_completed_rides",
    "customer_cancellation_rate",
    "driver_age",
    "driver_experience_years",
    "accepted_rides",
    "delay_rate",
    "location_completed_rides",
    "location_cancelled_rides",
    "avg_wait_time_min",
    "total_requests",
    "total_assigned_rides",
    "driver_incomplete_rides",
    "delay_count",
    "acceptance_rate",
    "avg_driver_rating",
    "avg_pickup_delay_min",
    "customer_total_bookings",
    "customer_cancelled_rides",
    "customer_incomplete_rides",
    "avg_customer_rating"
]

# Cap the training split with the project helper.
# NOTE(review): assumes `cap_outliers_iqr` returns a DataFrame that still
# contains `continuous_cols` (the cells below already rely on that) — confirm.
x_train = cap_outliers_iqr(x_train, continuous_cols)

# Do NOT re-run the helper on the test split: that would derive the limits
# from test data, so train and test would be transformed differently (and a
# single row at inference time has no meaningful IQR). Instead, clip the test
# split to the per-column value range of the capped training split, so the
# limits applied to test come from training data only.
train_lower = x_train[continuous_cols].min()
train_upper = x_train[continuous_cols].max()

x_test = x_test.copy()
x_test[continuous_cols] = x_test[continuous_cols].clip(
    lower=train_lower, upper=train_upper, axis=1
)

In [None]:
# Subset of dataset columns to eyeball; rendered as the cell output.
preview_cols = [
    "customer_cancel_flag",
    "driver_delay_flag",
    "location_completed_rides",
    "location_cancelled_rides",
    "avg_wait_time_min",
    "total_requests",
    "demand_level",
    "total_assigned_rides",
    "driver_incomplete_rides",
    "delay_count",
    "acceptance_rate",
    "avg_driver_rating",
    "avg_pickup_delay_min",
    "customer_total_bookings",
    "customer_cancelled_rides",
    "customer_incomplete_rides",
    "avg_customer_rating",
]
df[preview_cols]

<h1><b>Selecting relevant features</b></h1>

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between the (encoded, capped) training features.
corr_matrix = x_train.corr()

# Draw on an explicit Axes rather than the implicit pyplot "current figure",
# and finish with plt.show() so the cell output is the figure only (no stray
# `Text(...)` repr from the title call).
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Absolute pairwise correlation above which one feature of a pair is dropped.
threshold = 0.8
corr = x_train.corr().abs()

# Keep only the strictly-upper triangle so each pair is inspected once and the
# diagonal (self-correlation) is ignored; everything else becomes NaN.
upper_mask = np.triu(np.ones(corr.shape), k=1).astype(bool)
upper = corr.where(upper_mask)

# A column is dropped when it correlates above the threshold with any column
# that precedes it in the frame (NaN cells compare as False).
to_drop = upper.columns[upper.gt(threshold).any()].tolist()

# Remove the same columns from both splits so their schemas stay aligned.
x_train_sel = x_train.drop(columns=to_drop)
x_test_sel = x_test.drop(columns=to_drop)

print(*to_drop)
x_train_sel

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

# Fit an XGBRegressor with library defaults on the selected features and
# score it on the held-out split.
model = XGBRegressor().fit(x_train_sel, y_train)
pred = model.predict(x_test_sel)

# MSE is computed once and reused for the RMSE line.
mse = mean_squared_error(y_test, pred)

# Printed in order: MAE, MSE, RMSE, R^2.
print(mean_absolute_error(y_test, pred))
print(mse)
print(np.sqrt(mse))
print(r2_score(y_test, pred))

<h1><b>Selecting baseline model</b></h1>

In [None]:
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
import numpy as np
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    AdaBoostRegressor
)
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
)
from sklearn.linear_model import (
    LinearRegression,
    Ridge,
    Lasso,
    ElasticNet
)

# Candidate regressors, grouped by family and merged into a single mapping of
# name -> unfitted estimator (insertion order defines the run/report order).
linear_models = {
    "linear_regression": LinearRegression(),
    "ridge": Ridge(alpha=1.0, random_state=42),
    "lasso": Lasso(alpha=0.01, random_state=42),
    "elastic_net": ElasticNet(alpha=0.01, l1_ratio=0.5, random_state=42),
}
tree_models = {
    "decision_tree": DecisionTreeRegressor(random_state=42),
    "random_forest": RandomForestRegressor(
        n_estimators=200, random_state=42, n_jobs=-1
    ),
    "gradient_boosting": GradientBoostingRegressor(random_state=42),
    "ada_boost": AdaBoostRegressor(random_state=42),
    "xgb_regressor": XGBRegressor(
        objective="reg:squarederror",
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
    ),
}
models = {**linear_models, **tree_models}

# name -> {"MAE", "MSE", "RMSE", "R2"} measured on the held-out split.
results = {}

for name, model in models.items():
    print(f"Model Start: {name}")
    model.fit(x_train_sel, y_train)
    pred = model.predict(x_test_sel)

    # MSE is evaluated once and reused to derive RMSE.
    mse = mean_squared_error(y_test, pred)
    results[name] = dict(
        MAE=mean_absolute_error(y_test, pred),
        MSE=mse,
        RMSE=np.sqrt(mse),
        R2=r2_score(y_test, pred),
    )
    print(f"Model End: {name}")

# Report every model's metrics, rounded to four decimals.
for name, metrics in results.items():
    print("************************************************")
    print(f"Model Name: {name}")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    print("")


<h1><b>Hyperparameter tuning</b></h1>

In [None]:
import optuna
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error

def objective(trial):
    """Optuna objective: cross-validated RMSE of an XGBRegressor.

    Samples one hyperparameter configuration from ``trial``, builds an
    ``XGBRegressor`` with it and scores the model with shuffled 5-fold
    cross-validation on ``x_train_sel`` / ``y_train`` (notebook globals).

    Args:
        trial: The Optuna trial used to draw the hyperparameter values.

    Returns:
        The mean RMSE over the five folds (lower is better).
    """
    # Tunable part of the configuration; values are drawn in this order.
    search_space = dict(
        n_estimators=trial.suggest_int("n_estimators", 200, 800),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        max_depth=trial.suggest_int("max_depth", 3, 10),
        min_child_weight=trial.suggest_int("min_child_weight", 1, 10),
        subsample=trial.suggest_float("subsample", 0.6, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.6, 1.0),
        gamma=trial.suggest_float("gamma", 0, 5),
        reg_alpha=trial.suggest_float("reg_alpha", 0, 5),
        reg_lambda=trial.suggest_float("reg_lambda", 0, 5),
    )

    # Settings that are identical for every trial.
    fixed_settings = {
        "objective": "reg:squarederror",
        "random_state": 42,
        "n_jobs": -1,
    }

    candidate = XGBRegressor(**search_space, **fixed_settings)

    # Seeded folds so every trial is scored on the same partitions.
    folds = KFold(n_splits=5, shuffle=True, random_state=42)

    # The scorer returns negated RMSE per fold; flip the sign of the mean to
    # get a quantity to minimise.
    neg_rmse_per_fold = cross_val_score(
        candidate,
        x_train_sel,
        y_train,
        cv=folds,
        scoring="neg_root_mean_squared_error",
    )
    return -neg_rmse_per_fold.mean()


In [None]:
# Minimise the cross-validated RMSE returned by `objective`.
# The sampler is seeded explicitly: without a seed, each run explores a
# different sequence of configurations, so the reported "best params" would
# not be reproducible after a kernel restart.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print("Best params:", study.best_params)
print("Best CV RMSE:", study.best_value)


In [None]:
# Rebuild the regressor from the best trial's hyperparameters plus the same
# fixed settings that were used inside the tuning objective.
best_model = XGBRegressor(
    **study.best_params,
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1,
)

# Train on the full training split, then predict the held-out split.
best_model.fit(x_train_sel, y_train)
y_pred = best_model.predict(x_test_sel)

# Held-out metrics; MSE is computed once and reused for its square root.
test_mae = mean_absolute_error(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = np.sqrt(test_mse)
test_r2 = r2_score(y_test, y_pred)

print(f"MAE : {test_mae}")
print(f"MSE : {test_mse}")
print(f"Mean sq error root : {test_rmse}")
print(f"r2 : {test_r2}")


In [None]:
import numpy as np
import pandas as pd

df = pd.DataFrame({
    "actual": y_test,
    "predicted": y_pred
})

df["abs_error"] = np.abs(df["predicted"] - df["actual"])
df["allowed_error"] = 0.10 * np.abs(df["actual"])
df["error_prc"] = (df["abs_error"] / df["actual"]) * 100
df

In [None]:
df[df["error_prc"]>10]