1- Import Libraries

In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [25]:
from pathlib import Path

# Location of the cleaned bookings DataFrame, relative to this notebook.
data_path = Path("../processed data/hotel_bookings_cleaned.pkl")

# NOTE: unpickling can execute arbitrary code, so only load files that were
# produced by a trusted source.
df = pd.read_pickle(filepath_or_buffer=data_path)


2- Create new features

In [26]:

# 1- Total stay nights: weekend nights plus week nights.
df['total_stay_nights'] = (
    df['stays_in_weekend_nights'] +
    df['stays_in_week_nights']
)

# 2- Total number of guests: adults + children + babies.
# Missing counts are replaced by 0 per column *before* summing. Filling after
# the sum would turn a booking with, say, 2 adults and a missing `children`
# value into 0 guests, because NaN propagates through the addition.
df['total_guests'] = (
    df['adults'].fillna(0) +
    df['children'].fillna(0) +
    df['babies'].fillna(0)
)

# 3- Lead time to stay ratio.
# Rows with a positive number of nights get lead_time / total_stay_nights;
# all other rows fall back to the raw lead_time. Any infinite value that
# survives is then replaced by 0.
df['lead_to_stay_ratio'] = pd.Series(
    np.where(
        df['total_stay_nights'].gt(0),
        df['lead_time'].div(df['total_stay_nights']),
        df['lead_time'],
    ),
    index=df.index,
).replace([np.inf, -np.inf], 0)

# 4- Booking change indicator: 1 when booking_changes is positive, else 0.
df['booking_changed'] = np.where(df['booking_changes'].gt(0), 1, 0)

# 5- Parking indicator: 1 when at least one parking space was requested.
df['has_parking'] = np.where(df['required_car_parking_spaces'].gt(0), 1, 0)

# 6- Price per person.
# The denominator is total_guests + 1, so it cannot be zero for non-negative
# guest counts. Note that this makes the value lower than a strict
# per-head price.
df['adr_per_person'] = df['adr'].div(df['total_guests'].add(1))

# 7- Agent indicator: 1 when the agent value is positive, else 0.
df['has_agent_flag'] = np.where(df['agent'].gt(0), 1, 0)
# Revenue: adr multiplied by the total number of nights.
df['revenue'] = df['total_stay_nights'] * df['adr']


# Season of arrival, derived from the month name.
months_by_season = {
    'Winter': ('December', 'January', 'February'),
    'Spring': ('March', 'April', 'May'),
    'Summer': ('June', 'July', 'August'),
    'Fall': ('September', 'October', 'November'),
}
# Invert to a month -> season lookup; month names not listed map to NaN.
season_mapping = {
    month: season
    for season, months in months_by_season.items()
    for month in months
}
# NOTE(review): 'season' stays a string column; confirm it is encoded before
# it reaches a model.
df['season'] = df['arrival_date_month'].map(season_mapping)


3- Drop columns that cause data leakage

In [27]:

# Remove the reservation-status columns, which this section identifies as a
# source of data leakage.
# errors='ignore' plus reassignment (instead of inplace=True) keeps the cell
# safe to re-run: a second execution no longer raises KeyError because the
# columns are already gone.
df = df.drop(
    columns=['reservation_status', 'reservation_status_date'],
    errors='ignore',
)


4-One-Hot Encoding 

In [28]:

# One-Hot Encoding
# Categorical columns that are expanded into indicator columns.
onehot_cols = [
    'market_segment', 'distribution_channel',
    'deposit_type', 'customer_type',
]

# drop_first=True omits the first level of every encoded column, so each
# feature yields (number of levels - 1) indicator columns.
df = pd.get_dummies(
    data=df,
    columns=onehot_cols,
    drop_first=True,
)


5-Label Encoding

In [29]:
# Label Encoding
# Encode the arrival month as its calendar number (January=1 ... December=12).
# LabelEncoder assigns codes in alphabetical order of the labels (April=0,
# August=1, ...), which scrambles the natural month order, and the fitted
# encoder was never saved with the other artifacts. An explicit mapping is
# deterministic and can be reproduced exactly at inference time.
month_to_number = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12,
}

# Only encode while the column still holds month names, so re-running the
# cell is a no-op instead of corrupting already-encoded values.
if not pd.api.types.is_numeric_dtype(df['arrival_date_month']):
    month_numbers = df['arrival_date_month'].map(month_to_number)

    # Fail loudly on values the mapping does not know (typos, missing data)
    # rather than silently propagating NaN into the feature matrix.
    unmapped = df.loc[month_numbers.isna(), 'arrival_date_month'].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Unexpected arrival_date_month values: {list(unmapped)}"
        )

    df['arrival_date_month'] = month_numbers.astype('int64')





6-Scaling

In [30]:
 #Scaling 
# Standardise every int64/float64 column except the target.
# NOTE(review): the scaler is fitted on the full dataset; if a train/test
# split happens later, consider fitting on the training portion only.
# NOTE(review): re-running this cell refits the scaler on already-scaled
# values, so run it exactly once per kernel session before saving.
scaler = StandardScaler()
numeric_cols = (
    df.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop('is_canceled')
)
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])



7-Target

In [31]:
# Split the frame into the target (y) and the feature matrix (X).
y = df['is_canceled']
X = df.drop(columns=['is_canceled'])


8- Save

In [32]:
import pickle

# Directory that receives every artifact written by this notebook.
save_path = Path("../processed data")
save_path.mkdir(parents=True, exist_ok=True)

# Paths are built with the pathlib "/" operator instead of a hard-coded "\\"
# separator. On POSIX systems the backslash is an ordinary filename
# character, so the old form wrote files such as
# "processed data\hotel_bookings_final.pkl" into the parent directory.

# 1- Save the full DataFrame after feature engineering and encoding
df.to_pickle(save_path / "hotel_bookings_final.pkl")

# 2- Save the target variable
y = df['is_canceled']
with open(save_path / "target_variable.pkl", 'wb') as f:
    pickle.dump(y, f)

# 3- Save the feature names (every column except the target)
feature_cols = df.drop('is_canceled', axis=1).columns.tolist()

with open(save_path / "feature_columns.pkl", 'wb') as f:
    pickle.dump(feature_cols, f)

# 4- Save the fitted scaler
with open(save_path / "scaler.pkl", 'wb') as f:
    pickle.dump(scaler, f)



