## dataset cleanup

In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv("../data/orders.csv")
RANDOM_SEED=42

# breakdown order date
df["order_date"] = pd.to_datetime(df["order_date"])
df['order_month'] = df['order_date'].dt.month
df['order_day_of_week'] = df['order_date'].dt.dayofweek
df['is_weekend'] = (df['order_date'].dt.dayofweek >= 5).astype(int)
# for stay duration
df["checkin_date"] = pd.to_datetime(df["checkin_date"])
df["checkout_date"] = pd.to_datetime(df["checkout_date"])
# check if cancelled
df['is_cancelled'] = (df['cancel_date'] != "1900-01-01 00:00:00").astype(int)
# 
df['customer_order_count'] = df.groupby('email')['email'].transform('count')
df["lead_time"] = (df["checkin_date"] - df["order_date"]).dt.days
df["stay_duration"] = (df["checkout_date"] - df["checkin_date"]).dt.days
df["price_per_room"] = df["total_price"] / (df["room_qty"]*df["stay_duration"])
df["deposit_ratio"] = df["prepaid"] / df["total_price"]
df["deposit_ratio"] = df["deposit_ratio"].clip(0, 1)
df["group_size"] = df["room_qty"]
# cleanup
# email was dropped here, it was replaced by customer_order_count
df.drop('reservation_id', axis=1, inplace=True)
df.drop('reservation_no', axis=1, inplace=True)
df.drop('email', axis=1, inplace=True)
df.drop('order_date', axis=1, inplace=True)
df.drop('checkin_date', axis=1, inplace=True)
df.drop('checkout_date', axis=1, inplace=True)
df.drop('cancel_date', axis=1, inplace=True)
df.drop('room_qty', axis=1, inplace=True)

df.columns

Index(['brand_id', 'hotel_id', 'total_price', 'prepaid', 'payment_type',
       'order_month', 'order_day_of_week', 'is_weekend', 'is_cancelled',
       'customer_order_count', 'lead_time', 'stay_duration', 'price_per_room',
       'deposit_ratio', 'group_size'],
      dtype='object')

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    RobustScaler, 
    OneHotEncoder, 
    FunctionTransformer, 
    TargetEncoder
)
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

log_transformer = FunctionTransformer(np.log1p)

preprocessor = ColumnTransformer(
    transformers=[
        ('num_log', log_transformer, ['price_per_room', 'customer_order_count']),
        ('num_scale', RobustScaler(), ['lead_time', 'deposit_ratio', 'stay_duration']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['payment_type', 'brand_id']),
        ('hotel_target', TargetEncoder(target_type='binary'), ['hotel_id'])
    ])

In [3]:
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [4]:
X = df.drop('is_cancelled', axis=1)
y = df['is_cancelled']

X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.25, random_state=RANDOM_SEED, stratify=y_train_full
)

print(f"Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")

Train: 32166 | Val: 10723 | Test: 10723


In [5]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=100, max_depth=10, random_state=RANDOM_SEED),
    "XGBoost": XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, use_label_encoder=False)
}

results = {}

for name, model in models.items():

    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    y_prob = pipeline.predict_proba(X_test)[:, 1]
    
    auc = roc_auc_score(y_test, y_prob)
    acc = accuracy_score(y_test, y_pred)
    results[name] = {"Pipeline": pipeline, "AUC": auc, "Accuracy": acc}
    
    print(f"[{name}] AUC: {auc:.4f} | Accuracy: {acc:.4f}")

best_model_name = max(results, key=lambda x: results[x]['AUC'])
print(f"\nBest: {best_model_name}")

[LogisticRegression] AUC: 0.8903 | Accuracy: 0.8936
[RandomForest] AUC: 0.8901 | Accuracy: 0.8960
[XGBoost] AUC: 0.8903 | Accuracy: 0.8956

Best: LogisticRegression


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [6]:
from sklearn.metrics import confusion_matrix, classification_report

winning_pipeline = results[best_model_name]['Pipeline']
y_val_pred = winning_pipeline.predict(X_val)

print(f"--- Classification Report for {best_model_name} (Val Set) ---")
print(classification_report(y_val, y_val_pred))

cm = confusion_matrix(y_val, y_val_pred)
print("Confusion Matrix:")
print(cm) 

--- Classification Report for LogisticRegression (Val Set) ---
              precision    recall  f1-score   support

           0       0.90      0.97      0.93      8413
           1       0.83      0.60      0.70      2310

    accuracy                           0.89     10723
   macro avg       0.86      0.78      0.81     10723
weighted avg       0.88      0.89      0.88     10723

Confusion Matrix:
[[8128  285]
 [ 924 1386]]


In [7]:
from sklearn.metrics import f1_score
import numpy as np

y_val_probs = winning_pipeline.predict_proba(X_val)[:, 1]
thresholds = np.linspace(0, 1, 101)
f1_scores = []

for t in thresholds:
    y_pred_t = (y_val_probs >= t).astype(int)
    f1_scores.append(f1_score(y_val, y_pred_t))

best_threshold = thresholds[np.argmax(f1_scores)]
print(f"Best Threshold for F1-Score: {best_threshold:.2f}")

Best Threshold for F1-Score: 0.49


## Hyperparameter Tuning

In [8]:
# the result above is pretty close, so below we will be trying different parameter combinaiton to see what performs well
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import HalvingGridSearchCV

param_grids = {
    "LogisticRegression": {
        'classifier__C': [0.1, 1, 10]
    },
    "RandomForest": {
        'classifier__n_estimators': [100, 200, 250],
        'classifier__max_depth': [5, 10, None]
    },
    "XGBoost": {
        'classifier__learning_rate': [0.01, 0.1, 1],
        'classifier__n_estimators': [100, 200, 250],
        'classifier__max_depth': [3, 5]
    }
}

results = {}

for name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    
    grid_search = HalvingGridSearchCV(
        pipeline, 
        param_grid=param_grids[name], 
        cv=3, 
        scoring='roc_auc',
        factor=2,
        n_jobs=1
    )
    
    grid_search.fit(X_train, y_train)
    
    best_pipeline = grid_search.best_estimator_
    y_prob = best_pipeline.predict_proba(X_test)[:, 1]
    
    auc = roc_auc_score(y_test, y_prob)
    results[name] = {"Pipeline": best_pipeline, "AUC": auc, "Params": grid_search.best_params_}
    
    print(f"[{name}] Best AUC: {auc:.4f} using {grid_search.best_params_}")

[LogisticRegression] Best AUC: 0.8903 using {'classifier__C': 0.1}
[RandomForest] Best AUC: 0.8903 using {'classifier__max_depth': 10, 'classifier__n_estimators': 250}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


[XGBoost] Best AUC: 0.8897 using {'classifier__learning_rate': 0.01, 'classifier__max_depth': 5, 'classifier__n_estimators': 200}
