In [None]:
import pandas as pd

# Never truncate the column axis when a frame is rendered in this notebook.
pd.options.display.max_columns = None

# Load the prepared dataset and render it as the cell output.
df = pd.read_csv("./dataset/final_dataset.csv")
df

In [None]:
# Print the frame summary (column names, dtypes, non-null counts, memory usage).
df.info()

In [None]:
# Class balance of the target, and how many rows repeat an already-seen driver_id.
flag_distribution = df["driver_delay_flag"].value_counts()
repeated_driver_rows = df["driver_id"].duplicated().sum()
(flag_distribution, repeated_driver_rows)

In [None]:
# Keep only the first row encountered for each driver_id, then render the result.
df = df.drop_duplicates(subset="driver_id", keep="first")
df

<h1><b>Splitting train-test data to prevent data leakage</b></h1>

In [None]:
# Target: the driver delay flag.
y = df["driver_delay_flag"]

# Features: every column except the target and the ones listed below.
# NOTE(review): the excluded columns look like identifiers, timestamps and
# post-outcome fields that would leak the label — confirm against the data dictionary.
x = df.drop(
    columns=[
        "driver_delay_flag",
        "booking_id",
        "booking_date",
        "booking_time",
        "actual_ride_time_min",
        "incomplete_ride_reason",
        "customer_id",
        "driver_id",
        "delay_rate",
        "delay_count",
        "booking_status",
        "fare_per_min",
    ]
)

In [None]:
# List the feature columns that remain after the drop above.
x.columns

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# - random_state pins the shuffle so every "Restart & Run All" yields the same split
#   (and therefore comparable metrics in the cells below).
# - stratify=y keeps the delay-flag class ratio identical in train and test, which
#   matters because the target is imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
x_train

<h1><b>Encoding categorical features</b></h1>

In [None]:
from sklearn.preprocessing import OrdinalEncoder
import joblib
import os

# Work on explicit copies: the frames returned by the split may be flagged by pandas
# as derived from `x`, and assigning columns on them can raise SettingWithCopyWarning.
x_train = x_train.copy()
x_test = x_test.copy()

# Columns still stored as Python objects (strings) need a numeric encoding.
# NOTE: after this cell runs these columns are numeric, so re-running the cell on its
# own finds no string columns; re-run from the split cell instead.
string_columns = x_train.select_dtypes(include='object').columns

# Categories that were not seen during fit are encoded as -1 instead of raising.
ordinal_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

# Fit on the training rows only; the test rows are transformed with the fitted mapping.
x_train[string_columns] = ordinal_enc.fit_transform(x_train[string_columns])
x_test[string_columns] = ordinal_enc.transform(x_test[string_columns])

# Persist the fitted encoder; create the target directory on first run so the dump
# does not fail with FileNotFoundError.
os.makedirs("./encoders", exist_ok=True)
joblib.dump(ordinal_enc, "./encoders/driver_delay_model_ordinal_encoder.joblib")

# Remaining missing values per column after encoding.
x_train.isna().sum()

<h1><b>Detecting and treating outliers</b></h1>

In [None]:
# Project-local helper; its implementation is not visible in this notebook.
from utils import cap_outliers_iqr

# Columns handed to the outlier-capping helper.
# NOTE(review): "customer_cancel_flag" is named like a binary flag rather than a
# continuous measure — confirm it belongs in this list.
continuous_cols = [
    "ride_distance_km",
    "estimated_ride_time_min",
    "base_fare",
    "surge_multiplier",
    "booking_value",
    "fare_per_km",
    "driver_age",
    "driver_experience_years",
    "accepted_rides",
    "location_completed_rides",
    "avg_surge_multiplier",
    "location_cancelled_rides",
    "avg_wait_time_min",
    "customer_age",
    "customer_signup_days_ago",
    "customer_total_bookings",
    "customer_completed_rides",
    "customer_cancelled_rides",
    "customer_incomplete_rides",
    "customer_cancellation_rate",
    "avg_customer_rating",
    "customer_cancel_flag",
    "avg_pickup_delay_min",
    "driver_incomplete_rides",
    "acceptance_rate",
    "avg_driver_rating",
    "total_assigned_rides",
    "total_requests"
]
# The helper is called once per split, each time receiving only that split's frame.
# NOTE(review): if cap_outliers_iqr derives its bounds from the frame it is given,
# the test set is capped with its own statistics instead of the training bounds —
# confirm against utils and, if so, reuse the training bounds for x_test.
x_train = cap_outliers_iqr(x_train, continuous_cols)
x_test  = cap_outliers_iqr(x_test, continuous_cols)

In [None]:
# Re-display the de-duplicated source frame `df` for reference.
df

<h1><b>Selecting relevant features</b></h1>

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between all training features.
corr_matrix = x_train.corr()

# Draw the matrix as a heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, ax=ax)
ax.set_title("Correlation Heatmap")

In [None]:
threshold = 0.8
corr = x_train.corr().abs()

# Keep only the cells strictly above the diagonal so each feature pair is inspected once;
# everything else becomes NaN and can never exceed the threshold.
upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
upper = corr.where(upper_mask)

# A column is dropped when its absolute correlation with any column that precedes it
# is above the threshold.
exceeds_threshold = (upper > threshold).any().to_numpy()
to_drop = upper.columns[exceeds_threshold].tolist()

# Apply the same column selection to both splits (decided on the training data only).
x_train_sel = x_train.drop(columns=to_drop)
x_test_sel = x_test.drop(columns=to_drop)

print(*to_drop)

In [None]:
# Feature columns that survived the correlation filter.
x_train_sel.columns

In [None]:
# Build three re-balanced versions of the training set (the test set is never resampled).
# Every sampler is seeded so the resampled sets — and the metrics computed from them
# below — are identical across notebook runs.
from imblearn.over_sampling import SMOTE
x_train_smote, y_train_smote = SMOTE(random_state=42).fit_resample(x_train_sel, y_train)

from imblearn.over_sampling import RandomOverSampler
x_train_over, y_train_over = RandomOverSampler(random_state=42).fit_resample(x_train_sel, y_train)

from imblearn.under_sampling import RandomUnderSampler
x_train_under, y_train_under = RandomUnderSampler(random_state=42).fit_resample(x_train_sel, y_train)

In [None]:
# Class counts of the original training labels followed by each re-balanced variant.
tuple(
    labels.value_counts()
    for labels in (y_train, y_train_smote, y_train_over, y_train_under)
)

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# "balanced" class weights computed from the training labels.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)

# Map each class label to its weight.
class_weight_dict = {label: weight for label, weight in zip(classes, class_weights)}

# One weight per training row, looked up from that row's label.
sample_weight_train = np.array([class_weight_dict[label] for label in y_train])
sample_weight_train

<h1><b>Selecting baseline model</b></h1>

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier

# Baseline: XGBoost with default hyper-parameters on the un-balanced training set.
xgb_model = XGBClassifier().fit(x_train_sel, y_train)

# Evaluate on the held-out split; the confusion matrix is the cell's displayed value.
pred = xgb_model.predict(x_test_sel)
print(classification_report(y_test, pred))
confusion_matrix(y_test, pred)

In [None]:
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

# Compare four ways of handling class imbalance with a default XGBoost classifier.
# Each strategy gets its own freshly constructed estimator: binding every *_model name
# to one shared, repeatedly refitted object would leave all four names pointing at the
# last (over-sampling) fit once the cell has finished.

# 1) Per-row weights derived from the balanced class weights.
class_weight_model = XGBClassifier().fit(x_train_sel, y_train, sample_weight=sample_weight_train)
pred = class_weight_model.predict(x_test_sel)
print("----------class weight-----------")
print(classification_report(y_test, pred))

# 2) SMOTE-resampled training set.
smote_model = XGBClassifier().fit(x_train_smote, y_train_smote)
pred = smote_model.predict(x_test_sel)
print("----------Smote-----------")
print(classification_report(y_test, pred))

# 3) Randomly under-sampled training set.
under_model = XGBClassifier().fit(x_train_under, y_train_under)
pred = under_model.predict(x_test_sel)
print("----------Under sampling-----------")
print(classification_report(y_test, pred))

# 4) Randomly over-sampled training set.
over_model = XGBClassifier().fit(x_train_over, y_train_over)
pred = over_model.predict(x_test_sel)
print("----------over sampling-----------")
print(classification_report(y_test, pred))

# Backward compatibility: this cell has always left `xgb_model` holding the
# over-sampling fit, so keep that end state for any code that still reads it.
xgb_model = over_model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from xgboost import XGBClassifier
import numpy as np

# Candidate estimators. All are seeded so the comparison is repeatable across runs.
# `use_label_encoder` is no longer passed: it is a deprecated XGBoost parameter that
# newer releases ignore with a warning, and the other XGBClassifier instances in this
# notebook do not set it either.
models = {
    "xgb_classifier": XGBClassifier(
        eval_metric="logloss",
        random_state=42
    ),
    "gradient_boosting": GradientBoostingClassifier(random_state=42),
    "random_forest": RandomForestClassifier(random_state=42),
    "ada_boost": AdaBoostClassifier(random_state=42),
    "decision_tree": DecisionTreeClassifier(random_state=42)
}

# Balancing strategy -> (training features, training labels, extra fit() keyword arguments).
training_sets = {
    "no_balancing": (x_train_sel, y_train, {}),
    "smote": (x_train_smote, y_train_smote, {}),
    "oversampling": (x_train_over, y_train_over, {}),
    "undersampling": (x_train_under, y_train_under, {}),
    "class_weight": (x_train_sel, y_train, {"sample_weight": sample_weight_train}),
}

balance_concepts = list(training_sets)
results = []

for name, base_model in models.items():
    for b in balance_concepts:
        fit_features, fit_labels, fit_kwargs = training_sets[b]

        # clone() gives an unfitted copy, so no state carries over between strategies.
        model = clone(base_model)
        model.fit(fit_features, fit_labels, **fit_kwargs)

        # Every combination is scored on the same untouched test split.
        pred = model.predict(x_test_sel)

        results.append({
            "model": name,
            "balancing": b,
            "report": classification_report(y_test, pred),
            "cm": confusion_matrix(y_test, pred)
        })

# print results
for r in results:
    print("************************************************")
    print(f"Model Name: {r['model']}")
    print(f"Balancing Method: {r['balancing']}")
    print(r["report"])
    print(r["cm"])
    print("")


<h1><b>Hyperparameter tuning</b></h1>

In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

def objective(trial):
    """Optuna objective: cross-validated macro-F1 of one XGBoost configuration.

    Args:
        trial: Optuna trial used to sample a hyper-parameter configuration.

    Returns:
        Mean ``f1_macro`` over a 5-fold stratified split of
        ``x_train_sel`` / ``y_train``.

    Raises:
        Any exception raised while fitting or scoring a fold, because
        ``error_score="raise"`` is passed to ``cross_val_score``.
    """
    # Tuned hyper-parameters (the suggest calls keep their original order).
    search_space = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 800),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5),
    }

    # Settings held constant for every trial.
    fixed_settings = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "random_state": 42,
        "n_jobs": -1,
    }

    candidate = XGBClassifier(**search_space, **fixed_settings)

    # Same seeded folds for every trial, so trials are scored on identical splits.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    fold_scores = cross_val_score(
        candidate,
        x_train_sel,
        y_train,
        cv=folds,
        scoring="f1_macro",
        error_score="raise",
    )
    return fold_scores.mean()


In [None]:
# Seed the sampler so the sequence of suggested configurations — and therefore the
# selected best parameters — is the same on every run. TPESampler is Optuna's default
# sampler; only the seed is new.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

print("Best params:", study.best_params)
print("Best CV score:", study.best_value)


In [None]:
# Refit on the full training split with the best hyper-parameters found by the study,
# using the same fixed settings as the tuning objective.
best_model = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
    **study.best_params,
).fit(x_train_sel, y_train)

# Final evaluation on the held-out split; the confusion matrix is the displayed value.
y_pred = best_model.predict(x_test_sel)
print(classification_report(y_test, y_pred))
confusion_matrix(y_test, y_pred)


In [None]:
import os
import pickle

# Create the target directory on first run so open() does not fail with FileNotFoundError.
os.makedirs("./models", exist_ok=True)

# Serialize the tuned model. Only load this file from a trusted location:
# unpickling executes arbitrary code.
with open("./models/driver_delay_prediction_model.pkl", "wb") as f:
    pickle.dump(best_model, f)