1- Import Libraries

In [24]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [None]:
from pathlib import Path

# Location of the cleaned bookings pickle, relative to the notebook's
# working directory.
file_path = Path("../processed data") / "hotel_bookings_cleaned.pkl"

# NOTE: unpickling can execute arbitrary code; only load files this project
# wrote itself.
df = pd.read_pickle(file_path)

# Print the column/dtype summary, then show the first rows as the cell output.
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
Index: 87389 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           87389 non-null  object 
 1   is_canceled                     87389 non-null  int64  
 2   lead_time                       87389 non-null  int64  
 3   arrival_date_year               87389 non-null  int64  
 4   arrival_date_month              87389 non-null  object 
 5   arrival_date_week_number        87389 non-null  int64  
 6   arrival_date_day_of_month       87389 non-null  int64  
 7   stays_in_weekend_nights         87389 non-null  int64  
 8   stays_in_week_nights            87389 non-null  int64  
 9   adults                          87389 non-null  int64  
 10  children                        87389 non-null  float64
 11  babies                          87389 non-null  int64  
 12  meal                            8738

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,has_agent
0,Resort Hotel,0,315,2015,July,27,1,0,0,2,...,No Deposit,0.0,0,Transient,0.0,0,0,Check-Out,2015-07-01,True
1,Resort Hotel,0,315,2015,July,27,1,0,0,2,...,No Deposit,0.0,0,Transient,0.0,0,0,Check-Out,2015-07-01,True
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,0.0,0,Transient,75.0,0,0,Check-Out,2015-07-02,True
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,0,Transient,75.0,0,0,Check-Out,2015-07-02,True
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,0,Transient,98.0,0,1,Check-Out,2015-07-03,True


2- Create new features

In [26]:

# 1- Total stay nights: weekend nights plus week nights.
df['total_stay_nights'] = (
    df['stays_in_weekend_nights'] + df['stays_in_week_nights']
)

# 2- Total number of guests.
# Missing counts are treated as 0 per column *before* summing, so a single
# missing component (e.g. children) does not zero out the whole party size.
df['total_guests'] = (
    df[['adults', 'children', 'babies']].fillna(0).sum(axis=1)
)

# 3- Lead time to stay ratio.
# Bookings without a positive night count use a divisor of 1, i.e. the ratio
# falls back to the raw lead time. Because the divisor is never 0, no inf
# values are produced and no clean-up pass is needed.
df['lead_to_stay_ratio'] = df['lead_time'] / df['total_stay_nights'].where(
    df['total_stay_nights'] > 0, 1
)

# 4- Booking change indicator (1 when at least one change was recorded).
# int64 is requested explicitly so the dtype does not depend on the platform's
# default integer size.
df['booking_changed'] = df['booking_changes'].gt(0).astype('int64')

# 5- Parking indicator (1 when at least one parking space was requested).
df['has_parking'] = df['required_car_parking_spaces'].gt(0).astype('int64')

# 6- Price per person.
# Divide by the actual party size. The previous `total_guests + 1` divisor
# avoided division by zero but understated the price for every booking
# (2 guests were divided by 3). Only bookings with no recorded guests now
# fall back to a divisor of 1.
df['adr_per_person'] = df['adr'] / df['total_guests'].where(
    df['total_guests'] > 0, 1
)

# 7- Agent availability indicator (1 when the agent id is greater than 0;
# a missing agent compares as False and therefore yields 0).
# NOTE(review): the loaded frame already carries a `has_agent` column that is
# True in the displayed rows where agent == 0.0 -- confirm which of the two
# flags downstream code should rely on.
df['has_agent_flag'] = df['agent'].gt(0).astype('int64')


3- Drop columns that cause data leakage

In [27]:

# Remove the two reservation-status columns that this section flags as
# leaking the target; reassign instead of mutating the frame in place.
leakage_cols = ['reservation_status', 'reservation_status_date']
df = df.drop(columns=leakage_cols)


4-One-Hot Encoding 

In [28]:

# One-hot encode the nominal columns listed below. drop_first=True omits the
# first level of each column so the dummies are not perfectly collinear.
# NOTE(review): any other text columns not listed here stay unencoded --
# confirm none remain in the frame.
categorical_cols = [
    'hotel', 'meal', 'market_segment', 'distribution_channel',
    'deposit_type', 'customer_type', 'reserved_room_type', 'assigned_room_type',
]

df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)


5-Label Encoding

In [29]:
# Ordinal month encoding.
# LabelEncoder assigns codes in alphabetical order (April=0, August=1, ...),
# which scrambles the calendar order of the months. Map the names explicitly
# to 1..12 instead so the numeric value follows the calendar.
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
month_to_number = {name: number for number, name in enumerate(month_order, start=1)}

# Skip the mapping when the column is already numeric so re-running this cell
# does not fail or re-encode the values.
if not pd.api.types.is_numeric_dtype(df['arrival_date_month']):
    encoded_month = df['arrival_date_month'].map(month_to_number)

    # Fail loudly on names outside the mapping rather than silently writing NaN.
    unmapped = encoded_month.isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, 'arrival_date_month'].astype(str).unique())
        raise ValueError(f"Unexpected arrival_date_month values: {unknown}")

    # int64 keeps the column eligible for dtype-based numeric column selection.
    df['arrival_date_month'] = encoded_month.astype('int64')


6-Scaling

In [30]:
 #Scaling 
# Standardise every int64/float64 column except the target `is_canceled`.
# NOTE(review): the scaler is fitted on the whole frame and no train/test
# split is visible before this point -- confirm whether it should be fitted
# on the training portion only.
numeric_cols = (
    df.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop('is_canceled')
)

scaler = StandardScaler()
scaler.fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])



7-Target

In [31]:
# Split features and target: y is the `is_canceled` column, X is every
# other column of the frame.
y = df['is_canceled']
X = df.drop(columns=['is_canceled'])


8- Save

In [None]:
import pickle
from pathlib import Path

# Directory that receives every artifact; created (with parents) if missing.
save_path = Path("../processed data")
save_path.mkdir(parents=True, exist_ok=True)

# 1- Full DataFrame in its current state (all transformations above applied).
df.to_pickle(save_path / "hotel_bookings_final.pkl")

# Objects for the remaining artifacts: the target column, the list of feature
# column names (everything except the target), and the fitted scaler.
y = df['is_canceled']
feature_cols = df.columns.drop('is_canceled').tolist()

# 2- target, 3- feature names, 4- scaler: each is pickled to its own file,
# in this order.
artifacts = {
    "target_variable.pkl": y,
    "feature_columns.pkl": feature_cols,
    "scaler.pkl": scaler,
}
for file_name, artifact in artifacts.items():
    with open(save_path / file_name, 'wb') as f:
        pickle.dump(artifact, f)

