<a href="https://colab.research.google.com/github/pakkei1212/SMU_AML_PROJ/blob/main/No%20Shows.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Stage 2 - Predicting Late Cancellations & No Shows

In [1]:
!pip install optuna



In [2]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import roc_auc_score

# Load the raw bookings table (path is relative to the working directory).
hotel_data = pd.read_csv("hotel_bookings.csv")

# Month name -> month number (January=1 ... December=12).
month_map = {m: i for i, m in enumerate(
    ['January','February','March','April','May','June',
     'July','August','September','October','November','December'], 1)}

# Cast the mapped month to nullable Int64 before turning it into text.
# A plain `.map(...)` result becomes float64 as soon as a single month name is
# not in `month_map`, and `.astype(str)` would then render *every* row as
# e.g. "7.0", putting the whole column at risk of failing to parse. With Int64
# only the unmapped rows become "<NA>" and are coerced to NaT below.
arrival_month_num = hotel_data['arrival_date_month'].map(month_map).astype('Int64')

# Build "YYYY-M-D" strings from the three arrival columns and parse them.
hotel_data['arrival_date'] = pd.to_datetime(
    hotel_data['arrival_date_year'].astype(str) + '-' +
    arrival_month_num.astype(str) + '-' +
    hotel_data['arrival_date_day_of_month'].astype(str),
    errors='coerce'  # unparseable dates become NaT instead of raising
)

# A cancellation this many days (or fewer) before arrival is labelled "late".
LATE_CANCEL_WINDOW_DAYS = 3

# Stage 2 works only on bookings flagged as cancelled; copy so the raw frame is untouched.
df_stage2 = hotel_data[hotel_data['is_canceled'] == 1].copy()

# 1 when the assigned room type equals the reserved one, else 0.
df_stage2['room_type_match'] = (
    df_stage2['reserved_room_type'] == df_stage2['assigned_room_type']
).astype(int)
# Reassign instead of dropping in place, so the step reads as a plain transformation.
df_stage2 = df_stage2.drop(columns=['reserved_room_type', 'assigned_room_type'])

# Whole days between the last status change and the arrival date
# (positive = status changed before arrival; NaN when either date is missing).
df_stage2['reservation_status_date'] = pd.to_datetime(
    df_stage2['reservation_status_date'], errors='coerce'
)
df_stage2['days_before_arrival'] = (
    df_stage2['arrival_date'] - df_stage2['reservation_status_date']
).dt.days

# Target: 1 for a no-show, or for a cancellation inside the late window; 0 otherwise.
# Rows with a missing `days_before_arrival` compare as False and fall into class 0
# unless they are no-shows.
is_no_show = df_stage2['reservation_status'] == 'No-Show'
is_late_cancel = (
    (df_stage2['reservation_status'] == 'Canceled')
    & (df_stage2['days_before_arrival'] <= LATE_CANCEL_WINDOW_DAYS)
)
df_stage2['late_cancel_or_noshow'] = (is_no_show | is_late_cancel).astype(int)

# Party size (NaN only if adults, children and babies are all missing) and stay length.
df_stage2['total_guests'] = df_stage2[['adults', 'children', 'babies']].sum(axis=1, min_count=1)
df_stage2['total_nights'] = df_stage2['stays_in_week_nights'] + df_stage2['stays_in_weekend_nights']

# Columns fed to the model. The order matters: it determines the column order
# seen by the downstream preprocessing.
features_to_use = [
    # booking history and behaviour
    'lead_time',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'booking_changes',
    'days_in_waiting_list',
    'adr',
    'required_car_parking_spaces',
    'total_of_special_requests',
    # engineered in this notebook
    'total_guests',
    'total_nights',
    'room_type_match',
    # booking descriptors
    'hotel',
    'deposit_type',
    'customer_type',
    'market_segment',
    'distribution_channel',
]

# Temporal hold-out: arrivals in the final three months of the data form the test set.
latest_date = df_stage2['arrival_date'].max()
test_cutoff = latest_date - pd.DateOffset(months=3)

# Two explicit comparisons (not a negated mask): rows whose arrival date is NaT
# satisfy neither and therefore land in neither split.
arrives_before_cutoff = df_stage2['arrival_date'] < test_cutoff
arrives_from_cutoff = df_stage2['arrival_date'] >= test_cutoff
df_trainval = df_stage2.loc[arrives_before_cutoff]
df_test = df_stage2.loc[arrives_from_cutoff]

# Feature matrices and binary targets for each split.
X_trainval, y_trainval = df_trainval[features_to_use], df_trainval['late_cancel_or_noshow']
X_test, y_test = df_test[features_to_use], df_test['late_cancel_or_noshow']

# Split the selected features by dtype: numeric columns vs. everything else.
num_features = list(X_trainval.select_dtypes(include='number').columns)
cat_features = [col for col in features_to_use if col not in num_features]

# Numeric branch: median-impute, then standardise.
numeric_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'unknown', then one-hot encode.
# Categories unseen during fit are ignored at transform time (all-zero row).
categorical_branch = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Numeric block first, categorical block second: later cells rely on this order
# when pairing coefficients with feature names.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_branch, num_features),
    ('cat', categorical_branch, cat_features),
])

# Baseline classifier: class-weighted logistic regression.
baseline_clf = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)

pipe = Pipeline(steps=[
    ('prep', preprocessor),
    ('clf', baseline_clf),
])

# Baseline estimate: shuffled, seeded 5-fold stratified CV on the train/validation
# rows, scored by ROC AUC. The same `cv` splitter is reused by later cells.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    estimator=pipe,
    X=X_trainval,
    y=y_trainval,
    scoring='roc_auc',
    cv=cv,
)
print(f"CV AUC Scores: {cv_scores}")
print(f"Mean CV AUC: {np.mean(cv_scores):.4f}")

# Refit on all train/validation rows, then score the temporal hold-out.
# Column 1 of predict_proba is the probability of the second class in `classes_`
# (presumably the positive label 1 — the target is built as 0/1).
y_test_proba = pipe.fit(X_trainval, y_trainval).predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_true=y_test, y_score=y_test_proba)
print(f"\nHold-out Test AUC: {test_auc:.4f}")

# Recover readable names for the model inputs: numeric columns keep their names,
# categorical columns are expanded by the fitted one-hot encoder.
baseline_steps = pipe.named_steps
baseline_cat_branch = baseline_steps['prep'].named_transformers_['cat']
ohe = baseline_cat_branch.named_steps['onehot']
cat_expanded = ohe.get_feature_names_out(cat_features)
# Same order as the ColumnTransformer output: numeric block, then categorical block.
feature_names = list(num_features) + cat_expanded.tolist()

# One coefficient per input column (row 0 of coef_).
coef_series = pd.Series(data=baseline_steps['clf'].coef_[0], index=feature_names)

# Ten largest coefficients.
print("\nTop + coefficients:")
print(coef_series.sort_values(ascending=False).iloc[:10])

# Ten smallest coefficients.
print("\nTop – coefficients:")
print(coef_series.sort_values(ascending=True).iloc[:10])


CV AUC Scores: [0.87870692 0.85724234 0.87399946 0.86828283 0.87287388]
Mean CV AUC: 0.8702

Hold-out Test AUC: 0.8407

Top + coefficients:
deposit_type_Refundable           0.983783
distribution_channel_Direct       0.839624
market_segment_Aviation           0.835572
deposit_type_No Deposit           0.683460
market_segment_Undefined          0.302662
hotel_City Hotel                  0.236801
total_of_special_requests         0.194265
distribution_channel_Undefined    0.185670
customer_type_Group               0.183050
market_segment_Offline TA/TO      0.150956
dtype: float64

Top – coefficients:
deposit_type_Non Refund      -1.663573
lead_time                    -1.317879
distribution_channel_TA/TO   -0.876645
market_segment_Online TA     -0.615430
market_segment_Direct        -0.520250
previous_cancellations       -0.440984
room_type_match              -0.363199
customer_type_Transient      -0.285451
hotel_Resort Hotel           -0.233131
total_guests                 -0.229453
dtyp

In [4]:
import optuna
from sklearn.base import clone

def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC of the logistic-regression pipeline.

    Samples ``solver``, ``penalty``, ``C`` and ``class_weight``, applies them to a
    clone of the notebook-level ``pipe`` and scores it with ``cross_val_score`` on
    ``X_trainval`` / ``y_trainval`` using the shared ``cv`` splitter.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC AUC across the CV folds.

    Raises:
        optuna.exceptions.TrialPruned: when the sampled penalty/solver pair is
            not usable (an L1 penalty with any solver other than liblinear).
    """
    # Sample hyperparameters
    solver = trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    C = trial.suggest_float("C", 1e-3, 10, log=True)
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

    # Only one sampled combination is invalid: scikit-learn's lbfgs solver does not
    # support an L1 penalty. L2 works with both sampled solvers, so no L2 check is
    # needed (the former one could never trigger).
    if penalty == "l1" and solver != "liblinear":
        raise optuna.exceptions.TrialPruned()

    # Clone pipeline and set parameters, leaving the baseline `pipe` untouched
    pipe_trial = clone(pipe)
    pipe_trial.set_params(
        clf__C=C,
        clf__penalty=penalty,
        clf__solver=solver,
        clf__class_weight=class_weight,
        clf__max_iter=1000
    )

    # Cross-validation scoring
    score = cross_val_score(pipe_trial, X_trainval, y_trainval, cv=cv, scoring='roc_auc').mean()
    return score

# Run Optuna study.
# The sampler is seeded so that Restart & Run All draws the same trials each time,
# in line with the fixed random_state=42 used by the CV splitter and classifier.
# TPESampler is Optuna's default sampler, so only the seed is new here.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=15)

print("Best Params:", study.best_params)
print("Best CV AUC:", study.best_value)


[I 2025-06-21 11:25:02,539] A new study created in memory with name: no-name-5fb1a5f0-afa8-43f2-8f58-8bcdf4feb4ca
[I 2025-06-21 11:25:21,598] Trial 0 finished with value: 0.870338738861396 and parameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 0.21645587666860452, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.870338738861396.
[I 2025-06-21 11:25:21,601] Trial 1 pruned. 
[I 2025-06-21 11:25:22,675] Trial 2 finished with value: 0.865803958934204 and parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 0.005249763070353185, 'class_weight': None}. Best is trial 0 with value: 0.870338738861396.
[I 2025-06-21 11:25:29,163] Trial 3 finished with value: 0.8703713797763906 and parameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 0.014623554476057319, 'class_weight': 'balanced'}. Best is trial 3 with value: 0.8703713797763906.
[I 2025-06-21 11:25:29,167] Trial 4 pruned. 
[I 2025-06-21 11:25:49,047] Trial 5 finished with value: 0.8696701465678702 and parameters: {'so

Best Params: {'solver': 'liblinear', 'penalty': 'l1', 'C': 0.014623554476057319, 'class_weight': 'balanced'}
Best CV AUC: 0.8703713797763906


In [7]:
# Rebuild the baseline pipeline with the best hyperparameters from the Optuna study.
# set_params returns the estimator itself, so the clone can be configured inline.
final_pipe = clone(pipe).set_params(
    clf__max_iter=1000,
    **{
        f"clf__{param}": study.best_params[param]
        for param in ("C", "penalty", "solver", "class_weight")
    },
)

final_pipe.fit(X_trainval, y_trainval)

# Evaluate on hold-out test
y_test_proba = final_pipe.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_true=y_test, y_score=y_test_proba)
print(f"\nTuned Hold-out Test AUC: {test_auc:.4f}")

# Feature importance via coefficients: numeric names first, then the one-hot
# expanded categorical names, matching the ColumnTransformer output order.
tuned_steps = final_pipe.named_steps
tuned_cat_branch = tuned_steps['prep'].named_transformers_['cat']
ohe = tuned_cat_branch.named_steps['onehot']
cat_expanded = ohe.get_feature_names_out(cat_features)
feature_names = list(num_features) + cat_expanded.tolist()

coef_series = pd.Series(data=tuned_steps['clf'].coef_[0], index=feature_names)

# Ten largest coefficients.
print("\nTop + coefficients:")
print(coef_series.sort_values(ascending=False).iloc[:10])

# Ten smallest coefficients.
print("\nTop – coefficients:")
print(coef_series.sort_values(ascending=True).iloc[:10])



Tuned Hold-out Test AUC: 0.8381

Top + coefficients:
distribution_channel_Direct       0.455066
hotel_City Hotel                  0.365325
deposit_type_No Deposit           0.322868
total_of_special_requests         0.176388
market_segment_Offline TA/TO      0.111040
booking_changes                   0.057590
days_in_waiting_list              0.049451
previous_bookings_not_canceled    0.034443
distribution_channel_Undefined    0.000000
customer_type_Contract            0.000000
dtype: float64

Top – coefficients:
deposit_type_Non Refund      -1.861600
lead_time                    -1.271972
distribution_channel_TA/TO   -0.689187
market_segment_Online TA     -0.524803
room_type_match              -0.346600
customer_type_Transient      -0.269685
previous_cancellations       -0.244928
total_guests                 -0.222936
adr                          -0.156627
deposit_type_Refundable       0.000000
dtype: float64
