# 0) Setup and load data

In [2]:
import os
import pickle
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

# Relative path to the bookings CSV: one directory up, inside "data".
DATA_PATH = os.path.join("..", "data", "hotel_bookings.csv")

# Read the whole CSV into a DataFrame.
df = pd.read_csv(DATA_PATH)
# Cell output: the frame's shape together with its first ten column names.
df.shape, df.columns[:10]

((119390, 32),
 Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
        'arrival_date_month', 'arrival_date_week_number',
        'arrival_date_day_of_month', 'stays_in_weekend_nights',
        'stays_in_week_nights', 'adults'],
       dtype='object'))

# 1) Feature engineering

In [3]:
# Work on a copy so the loaded frame `df` is left untouched.
df_model = df.copy()

# Target: the cancellation flag, cast to float.
y = df_model["is_canceled"].astype(float)

# Simple, stable features (mostly numeric).
# Total party size: add up whichever guest-count columns are present, counting
# missing values as 0. Selecting only the existing columns avoids the pitfall
# of `df_model.get(col, 0).fillna(0)`, where an absent column yields the scalar
# default 0 and the chained `.fillna` call raises AttributeError.
guest_cols = [c for c in ("adults", "children", "babies") if c in df_model.columns]
df_model["total_guests"] = df_model[guest_cols].fillna(0).sum(axis=1)

# Binary-encode hotel type: 1 for "City Hotel", 0 for anything else.
df_model["hotel_city"] = (df_model["hotel"] == "City Hotel").astype(int)
feature_cols = [
    "lead_time",
    "adr",
    "stays_in_week_nights",
    "stays_in_weekend_nights",
    "total_guests",
    "hotel_city",
    "previous_cancellations",
    "booking_changes"
]

X_df = df_model[feature_cols].copy()

# Handle missing values: treat +/-inf as missing, then keep only the rows in
# which every feature and the target are present.
X_df = X_df.replace([np.inf, -np.inf], np.nan)
mask = X_df.notna().all(axis=1) & y.notna()

X = X_df.loc[mask].to_numpy(dtype=float)
# `y` is rebound here from a Series to a NumPy array filtered by the same mask as X.
y = y.loc[mask].to_numpy(dtype=float)

X.shape, y.shape

((119390, 8), (119390,))

# 2) Train/test split

In [4]:
# --- 1) Reproducible shuffle ---
# A fixed seed makes the row permutation identical on every run.
rng = np.random.default_rng(42)
n = X.shape[0]
idx = rng.permutation(n)

# --- 2) Train/test split (80/20) ---
split = int(0.8 * n)
train_idx, test_idx = idx[:split], idx[split:]

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

print("Train shapes:", X_train.shape, y_train.shape)
print("Test shapes:", X_test.shape, y_test.shape)

# --- 3) z-score standardization ---
# Mean and standard deviation are computed on the training rows only and then
# applied to both splits.
mu = X_train.mean(axis=0)
sigma = X_train.std(axis=0)
sigma[sigma == 0] = 1.0  # constant columns: divide by 1 instead of 0

X_train_s, X_test_s = ((block - mu) / sigma for block in (X_train, X_test))

# --- 4) Intercept (design matrix) ---
# Prepend a column of ones so the first coefficient acts as the intercept.
X_train_design, X_test_design = (
    np.column_stack([np.ones(block.shape[0]), block])
    for block in (X_train_s, X_test_s)
)

print("Design shapes:", X_train_design.shape, X_test_design.shape)
print("First row (train):", X_train_design[0])



Train shapes: (95512, 8) (95512,)
Test shapes: (23878, 8) (23878,)
Design shapes: (95512, 9) (23878, 9)
First row (train): [ 1.         -0.76832031  0.72694085  0.26633103  0.07679599  0.04223071
  0.70918444 -0.10225844 -0.33922077]


# 3) Fit a NumPy Linear Model

In [6]:
# Ordinary least squares: choose beta to minimize the squared error between
# X_train_design @ beta and y_train.
# Only `beta` is used in this cell; the remaining return values keep the names
# they were originally bound to.
beta, residuals, ran, s = np.linalg.lstsq(
    X_train_design,
    y_train,
    rcond=None,
)

# Fitted values on both splits.
y_pred_train, y_pred_test = (
    design @ beta for design in (X_train_design, X_test_design)
)

# Quick sanity checks (coefficient count and the first 5 predictions).
print("beta shape:", beta.shape)
print("First 5 train predictions:", y_pred_train[:5])
print("First 5 test predictions: ", y_pred_test[:5])

# A linear fit can leave the unit interval; clip to [0, 1] for interpretability.
y_pred_test_clipped = np.clip(y_pred_test, 0, 1)
print(
    "Test predictions clipped range:",
    y_pred_test_clipped.min(),
    y_pred_test_clipped.max(),
)


beta shape: (9,)
First 5 train predictions: [0.34251092 0.31006186 0.15642004 0.44584317 0.09932604]
First 5 test predictions:  [0.33954775 0.83666817 0.52176555 0.17107856 0.60683405]
Test predictions clipped range: 0.0 1.0


# 4) Model evaluation and baseline performance

In [7]:
#evaluation metrics
def mse(y_true, y_pred):
    """Mean squared error between observed and predicted values.

    Both inputs are converted to float arrays before subtracting, so plain
    Python lists and boolean labels are accepted (a raw ``-`` on those raises
    ``TypeError``) and small unsigned integer dtypes cannot wrap around.
    Values are compared positionally.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    numpy.float64
        Mean of the squared element-wise differences.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean((y_true - y_pred) ** 2)

def r2(y_true, y_pred):
    """Coefficient of determination: share of the target variance explained.

    Computed as ``1 - SS_res / SS_tot``, where ``SS_res`` is the sum of squared
    prediction errors and ``SS_tot`` is the sum of squared deviations of
    ``y_true`` from its own mean. Both inputs are converted to float arrays
    first, so plain lists and boolean labels are accepted. Values are compared
    positionally.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    float
        The R² score, or ``nan`` when ``y_true`` has zero total variation
        (the ratio is undefined in that case).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    if ss_tot == 0:
        # Constant target: nothing to explain, so the score is undefined.
        return np.nan
    return 1 - ss_res / ss_tot

# Fitted model: error and explained variance on both splits.
# Train figures show how closely the fit tracks the data it was estimated on;
# test figures show how well it carries over to held-out rows.
train_mse, test_mse = mse(y_train, y_pred_train), mse(y_test, y_pred_test)
train_r2, test_r2 = r2(y_train, y_pred_train), r2(y_test, y_pred_test)

print(
    "Model performance:",
    f"Train MSE: {train_mse:.4f}",
    f"Test  MSE: {test_mse:.4f}",
    f"Train R²:  {train_r2:.4f}",
    f"Test  R²:  {test_r2:.4f}",
    sep="\n",
)

# Naive baseline: always predict the training-set mean, on both splits.
baseline_pred_train = np.full_like(y_train, y_train.mean())
baseline_pred_test = np.full_like(y_test, y_train.mean())

# Baseline errors, for comparison against the fitted model above.
baseline_train_mse, baseline_test_mse = (
    mse(y_train, baseline_pred_train),
    mse(y_test, baseline_pred_test),
)

print(
    "\nBaseline performance:",
    f"Baseline Train MSE: {baseline_train_mse:.4f}",
    f"Baseline Test  MSE: {baseline_test_mse:.4f}",
    sep="\n",
)

Model performance:
Train MSE: 0.2030
Test  MSE: 0.2038
Train R²:  0.1297
Test  R²:  0.1261

Baseline performance:
Baseline Train MSE: 0.2332
Baseline Test  MSE: 0.2332


**Model Performance Interpretation**

The model outperforms the baseline that predicts the mean cancellation rate, reducing the test MSE from 0.2332 to 0.2038. This indicates that booking characteristics contain useful predictive information.
The R² values are relatively low (around 0.13), which is expected given the binary target variable and the use of a linear probability model. The similar performance on training and test sets suggests that the model generalizes well and does not overfit.
Overall, the model is primarily useful for interpretation of key drivers of cancellations rather than precise individual predictions.

# 5) Logistic Regression for Cancellation Prediction