<a href="https://colab.research.google.com/github/pakkei1212/SMU_AML_PROJ/blob/no_shows/No_Shows%3F.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.1 colorlog-6.9.0 optuna-4.3.0


In [4]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from xgboost import XGBClassifier
from hotel_preprocessing import XYPreprocessor
import numpy as np
import pandas as pd
import optuna


# Notebook-level seed constant.
# NOTE(review): the CV splitter and the XGBoost models in the cells below hard-code
# 42 instead of reading this constant — confirm which seed is intended.
RANDOM_STATE = 2025
# Verbosity level; not referenced in the visible cells — presumably consumed by
# code outside this view (TODO confirm or remove).
VERBOSE = 2

In [5]:
# === Load data and preprocess arrival date ===
raw_hotel_data = pd.read_csv("hotel_bookings.csv")
hotel_data = raw_hotel_data.copy()  # keep the raw frame untouched for re-runs

# Assemble a "<year>-<month name>-<day>" string per booking, then parse it.
arrival_date_text = (
    hotel_data['arrival_date_year'].astype(str)
    + '-' + hotel_data['arrival_date_month']
    + '-' + hotel_data['arrival_date_day_of_month'].astype(str)
)
hotel_data['arrival_date'] = pd.to_datetime(arrival_date_text, format='%Y-%B-%d')
hotel_data['reservation_status_date'] = pd.to_datetime(hotel_data['reservation_status_date'])

# Target: 1 for a No-Show, or for a cancellation whose status date is at most
# 3 days ahead of the arrival date; 0 otherwise.
status_lead = hotel_data['arrival_date'] - hotel_data['reservation_status_date']
hotel_data['days_before_arrival'] = status_lead.dt.days
is_no_show = hotel_data['reservation_status'].eq('No-Show')
is_late_cancel = (
    hotel_data['reservation_status'].eq('Canceled')
    & hotel_data['days_before_arrival'].le(3)
)
hotel_data['late_or_noshow'] = (is_no_show | is_late_cancel).astype(int)

# === Filter to only canceled reservations for stage 2 analysis ===
canceled_mask = hotel_data['is_canceled'].eq(1)
hotel_data = hotel_data.loc[canceled_mask].reset_index(drop=True)

# === Split train/test by arrival_date ===
# Bookings arriving in the final 3 months form the test set; the rest is train/val.
last_date = hotel_data['arrival_date'].max()
cutoff_date = last_date - pd.DateOffset(months=3)

in_test_window = hotel_data['arrival_date'].gt(cutoff_date)
in_train_window = hotel_data['arrival_date'].le(cutoff_date)
test_data = hotel_data.loc[in_test_window].reset_index(drop=True)
train_val_data = hotel_data.loc[in_train_window].reset_index(drop=True)

# Columns from which the target is derived (plus the split key) must not be features.
cols_to_drop = [
    'reservation_status', 'reservation_status_date', 'arrival_date',
    'days_before_arrival', 'is_canceled'
]
non_feature_cols = [*cols_to_drop, 'late_or_noshow']

X_train_val_raw = train_val_data.drop(columns=non_feature_cols)
y_train_val_raw = train_val_data['late_or_noshow']

X_test_raw = test_data.drop(columns=non_feature_cols)
y_test_raw = test_data['late_or_noshow']

# === Preprocess ===
# fit_transform is called on the train/val split only; the test split goes through transform.
preprocessor = XYPreprocessor()
X_train_val, y_train_val = preprocessor.fit_transform(X_train_val_raw, y_train_val_raw)
X_test, y_test = preprocessor.transform(X_test_raw, y_test_raw)

# === Optuna objective ===
def objective(trial):
    """Score one Optuna trial as the mean 5-fold ROC AUC of an XGBoost classifier.

    Tunable hyperparameters are sampled from ``trial``; the class weight, eval
    metric and seed are fixed. Reads the module-level ``X_train_val`` and
    ``y_train_val``.

    Args:
        trial: Optuna trial used to sample the search space.

    Returns:
        Mean of the per-fold validation ROC AUC values.
    """
    # Sampling order is kept stable: it determines the order of the trial's params.
    params = dict(
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3),
        n_estimators=trial.suggest_int('n_estimators', 100, 500),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
        subsample=trial.suggest_float('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.5, 1.0),
        reg_alpha=trial.suggest_float('reg_alpha', 0.0, 5.0),
        reg_lambda=trial.suggest_float('reg_lambda', 0.0, 5.0),
        # Negative-to-positive ratio over the whole train/val target.
        scale_pos_weight=(y_train_val == 0).sum() / (y_train_val == 1).sum(),
        eval_metric='auc',
        random_state=42,
    )

    def take_rows(data, row_idx):
        # Positional row selection with a fresh 0..n-1 index.
        return data.iloc[row_idx].reset_index(drop=True)

    def fold_auc(train_idx, val_idx):
        # Fit on the training part of the fold, score probabilities on the held-out part.
        clf = XGBClassifier(**params)
        clf.fit(take_rows(X_train_val, train_idx), take_rows(y_train_val, train_idx))
        val_scores = clf.predict_proba(take_rows(X_train_val, val_idx))[:, 1]
        return roc_auc_score(take_rows(y_train_val, val_idx), val_scores)

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = [
        fold_auc(train_idx, val_idx)
        for train_idx, val_idx in splitter.split(X_train_val, y_train_val)
    ]
    return np.mean(fold_scores)

# === Run Optuna ===
# Seed the sampler explicitly: an unseeded study proposes different trials on every
# run, so the reported best parameters could not be reproduced from a fresh kernel.
# TPESampler is what create_study uses by default for a single-objective study,
# so only the seeding changes, not the search algorithm.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(objective, n_trials=30)

print("\n=== Best Parameters ===")
print(study.best_params)

[I 2025-06-15 15:24:44,350] A new study created in memory with name: no-name-fa348224-72d2-4ed1-a44f-fc5b13dbb3f4
[I 2025-06-15 15:24:52,407] Trial 0 finished with value: 0.8647193817952132 and parameters: {'max_depth': 8, 'learning_rate': 0.25607990915021606, 'n_estimators': 330, 'min_child_weight': 4, 'subsample': 0.882591247558675, 'colsample_bytree': 0.7377914646114823, 'reg_alpha': 0.24827277365327582, 'reg_lambda': 3.7153402015729835}. Best is trial 0 with value: 0.8647193817952132.
[I 2025-06-15 15:25:02,408] Trial 1 finished with value: 0.8698771057727146 and parameters: {'max_depth': 10, 'learning_rate': 0.14281398576986162, 'n_estimators': 270, 'min_child_weight': 6, 'subsample': 0.8574146216937553, 'colsample_bytree': 0.6405644978341395, 'reg_alpha': 2.271476177057684, 'reg_lambda': 1.840058782038339}. Best is trial 1 with value: 0.8698771057727146.
[I 2025-06-15 15:25:06,556] Trial 2 finished with value: 0.875744846904136 and parameters: {'max_depth': 6, 'learning_rate': 0.


=== Best Parameters ===
{'max_depth': 9, 'learning_rate': 0.012789275056544586, 'n_estimators': 499, 'min_child_weight': 9, 'subsample': 0.5377038643350359, 'colsample_bytree': 0.9896713859183941, 'reg_alpha': 3.984728205644104, 'reg_lambda': 1.163627508264638}


In [6]:
# === Evaluate final model with best params ===
# Tuned hyperparameters first, then the same fixed settings the objective used.
best_params = {
    **study.best_params,
    'scale_pos_weight': (y_train_val == 0).sum() / (y_train_val == 1).sum(),
    'eval_metric': 'auc',
    'random_state': 42,
}

# Refit on the full train/val split and score the held-out test split.
model = XGBClassifier(**best_params)
model.fit(X_train_val, y_train_val)
y_test_pred_proba = model.predict_proba(X_test)[:, 1]
# Hard labels: positive when the predicted probability reaches 0.5.
y_test_pred = np.where(y_test_pred_proba >= 0.5, 1, 0)

test_auc = roc_auc_score(y_test, y_test_pred_proba)
print(f"\nFinal Test ROC AUC: {test_auc:.4f}")

# === Classification report and confusion matrix ===
print(
    "\nClassification Report (threshold 0.5):",
    classification_report(y_test, y_test_pred),
    sep="\n",
)

print("\nConfusion Matrix:", confusion_matrix(y_test, y_test_pred), sep="\n")


Final Test ROC AUC: 0.8697

Classification Report (threshold 0.5):
              precision    recall  f1-score   support

           0       0.98      0.89      0.93      4330
           1       0.29      0.69      0.41       294

    accuracy                           0.88      4624
   macro avg       0.64      0.79      0.67      4624
weighted avg       0.93      0.88      0.90      4624


Confusion Matrix:
[[3842  488]
 [  90  204]]


In [7]:
# === Feature Importances ===
# Pair each training column with the fitted model's importance score, highest first.
feature_names = X_train_val.columns
importances = model.feature_importances_
feat_imp_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

print("\nTop 10 Feature Importances:", feat_imp_df.head(10), sep="\n")



Top 10 Feature Importances:
                       feature  importance
14             room_type_match    0.096094
31  distribution_channel_TA/TO    0.071027
10               lead_time_log    0.069812
34     deposit_type_Non Refund    0.066422
26    market_segment_Online TA    0.051882
33     deposit_type_No Deposit    0.042948
0                    lead_time    0.040887
58          agent_binned_240.0    0.027313
38     customer_type_Transient    0.020177
60          agent_binned_Other    0.016219
