# 0) Setup and load data

In [1]:
import os
import pickle
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

# Location of the bookings CSV, expressed relative to the directory the
# notebook is run from.
DATA_PATH = os.path.join(
    "..",
    "data",
    "hotel_bookings.csv",
)

# Read the whole file into a single DataFrame.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Cell output: the (rows, columns) shape plus the first ten column labels.
(df.shape, df.columns[:10])

((119390, 32),
 Index(['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
        'arrival_date_month', 'arrival_date_week_number',
        'arrival_date_day_of_month', 'stays_in_weekend_nights',
        'stays_in_week_nights', 'adults'],
       dtype='object'))

# 1) Feature engineering

In [9]:
# Work on a copy so the frame loaded earlier is not modified by this cell.
df_model = df.copy()

# basic target
# Kept as a Series under its own name so it can be filtered with the same row
# mask as the features; `y` below is only ever the final NumPy array.
y_all = df_model["is_canceled"].astype(float)

# simple, stable features (mostly numeric)
# total_guests = adults + children + babies, with missing counts treated as 0.
# `reindex(columns=...)` produces an all-NaN column for any guest column that
# is absent from the frame, so an absent column contributes 0. The earlier
# `.get(col, 0).fillna(0)` form raised AttributeError in that case, because
# the fallback `0` is a plain int with no `.fillna`.
guest_cols = ["adults", "children", "babies"]
df_model["total_guests"] = (
    df_model.reindex(columns=guest_cols).fillna(0).sum(axis=1)
)

# binary encode hotel type (City=1, Resort=0; any value other than
# "City Hotel" maps to 0)
df_model["hotel_city"] = (df_model["hotel"] == "City Hotel").astype(int)

feature_cols = [
    "lead_time",
    "adr",
    "stays_in_week_nights",
    "stays_in_weekend_nights",
    "total_guests",
    "hotel_city",
    "previous_cancellations",
    "booking_changes"
]

X_df = df_model[feature_cols].copy()

# quickly handle missing values: treat +/-inf as missing, then keep only the
# rows where every feature and the target are present
X_df = X_df.replace([np.inf, -np.inf], np.nan)
mask = X_df.notna().all(axis=1) & y_all.notna()

X = X_df.loc[mask].to_numpy(dtype=float)
y = y_all.loc[mask].to_numpy(dtype=float)

# Features and target are filtered with the same mask, so they must stay
# row-aligned.
assert X.shape[0] == y.shape[0], "X and y have different row counts"

X.shape, y.shape

((119390, 8), (119390,))

# 2) Train/test split

In [10]:
# Step 1: shuffle the row positions with a fixed seed so the split is repeatable
rng = np.random.default_rng(42)
n = len(X)
idx = rng.permutation(n)

# Step 2: first 80% of the shuffled positions -> train, remaining 20% -> test
split = int(0.8 * n)
train_idx, test_idx = idx[:split], idx[split:]

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

print("Train shapes:", X_train.shape, y_train.shape)
print("Test shapes:", X_test.shape, y_test.shape)

# Step 3: z-score standardization; mean and std are taken from the training
# rows only and then applied to both splits
mu = np.mean(X_train, axis=0)
sigma = np.std(X_train, axis=0)
# a constant column has std 0; use 1.0 instead so the division below is defined
sigma[sigma == 0] = 1.0

X_train_s = (X_train - mu) / sigma
X_test_s = (X_test - mu) / sigma

# Step 4: design matrices, i.e. a leading column of ones (intercept) followed
# by the standardized features
X_train_design = np.hstack([np.ones((len(X_train_s), 1)), X_train_s])
X_test_design = np.hstack([np.ones((len(X_test_s), 1)), X_test_s])

print("Design shapes:", X_train_design.shape, X_test_design.shape)
print("First row (train):", X_train_design[0])



Train shapes: (95512, 8) (95512,)
Test shapes: (23878, 8) (23878,)
Design shapes: (95512, 9) (23878, 9)
First row (train): [ 1.         -0.76832031  0.72694085  0.26633103  0.07679599  0.04223071
  0.70918444 -0.10225844 -0.33922077]
