1- Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [3]:

file_path = r'D:\DAproject\Project-DA\processed data\hotel_bookings_cleaned.pkl'

df = pd.read_pickle(file_path)
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
Index: 87389 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           87389 non-null  object 
 1   is_canceled                     87389 non-null  int64  
 2   lead_time                       87389 non-null  int64  
 3   arrival_date_year               87389 non-null  int64  
 4   arrival_date_month              87389 non-null  object 
 5   arrival_date_week_number        87389 non-null  int64  
 6   arrival_date_day_of_month       87389 non-null  int64  
 7   stays_in_weekend_nights         87389 non-null  int64  
 8   stays_in_week_nights            87389 non-null  int64  
 9   adults                          87389 non-null  int64  
 10  children                        87389 non-null  float64
 11  babies                          87389 non-null  int64  
 12  meal                            8738

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,has_agent
0,Resort Hotel,0,315,2015,July,27,1,0,0,2,...,No Deposit,0.0,0,Transient,0.0,0,0,Check-Out,2015-07-01,True
1,Resort Hotel,0,315,2015,July,27,1,0,0,2,...,No Deposit,0.0,0,Transient,0.0,0,0,Check-Out,2015-07-01,True
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,0.0,0,Transient,75.0,0,0,Check-Out,2015-07-02,True
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,0,Transient,75.0,0,0,Check-Out,2015-07-02,True
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,0,Transient,98.0,0,1,Check-Out,2015-07-03,True


2- Create new features

In [4]:

# 1️- Total stay nights
df['total_stay_nights'] = (
    df['stays_in_weekend_nights'] +
    df['stays_in_week_nights']
)

# 2️- Total number of guests
df['total_guests'] = (
    df['adults'] +
    df['children'] +
    df['babies']
)

df['total_guests'] = df['total_guests'].fillna(0)

# 3️- Lead time to stay ratio
df['lead_to_stay_ratio'] = np.where(
    df['total_stay_nights'] > 0,
    df['lead_time'] / df['total_stay_nights'],
    df['lead_time']
)

df['lead_to_stay_ratio'] = df['lead_to_stay_ratio'].replace(
    [np.inf, -np.inf], 0
)

# 4️- Booking change indicator
df['booking_changed'] = np.where(
    df['booking_changes'] > 0, 1, 0
)

# 5️- Parking indicator
df['has_parking'] = np.where(
    df['required_car_parking_spaces'] > 0, 1, 0
)

# 6️- Price per person
df['adr_per_person'] = df['adr'] / (df['total_guests'] + 1)

# 7️- Agent availability indicator
df['has_agent_flag'] = np.where(
    df['agent'] > 0, 1, 0
)


3-Drop columns Cause Data Leakage

In [5]:

df.drop(columns=['reservation_status', 'reservation_status_date'], inplace=True)


4-One-Hot Encoding 

In [6]:

# One-Hot Encoding
categorical_cols = [
    'hotel',
    'meal',
    'market_segment',
    'distribution_channel',
    'deposit_type',
    'customer_type',
    'reserved_room_type',
    'assigned_room_type'
]

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


5-Label Encoding

In [7]:
# Label Encoding 
le = LabelEncoder()
df['arrival_date_month'] = le.fit_transform(df['arrival_date_month'])


6-Scaling

In [76]:
from sklearn.linear_model import LassoCV
import pandas as pd


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42)

#Standardize features for Lasso
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#  LassoCV for feature selection
lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X_train_scaled, y_train)

#  Get non-zero coefficient features
lasso_coef = pd.Series(lasso.coef_, index=X_numeric.columns)
selected_features_lasso = lasso_coef[lasso_coef != 0].sort_values(ascending=False)
print("Selected features (Lasso):")
print(selected_features_lasso)



Selected features (Lasso):
country                           0.021794
market_segment                    0.017401
total_stay_nights                 0.013515
lead_to_stay_ratio                0.009800
reserved_room_type                0.009010
deposit_type                      0.006072
adr                               0.005177
previous_cancellations            0.003221
children                          0.002567
arrival_date_year                 0.001984
agent                             0.001668
arrival_date_day_of_month         0.001396
stays_in_weekend_nights           0.000037
customer_type                    -0.000085
arrival_date_month               -0.000428
is_repeated_guest                -0.001306
days_in_waiting_list             -0.002569
adults                           -0.002720
lead_time                        -0.002786
arrival_date_week_number         -0.003058
previous_bookings_not_canceled   -0.003222
booking_changes                  -0.006091
assigned_room_type         

7-RFE (Recursive Feature Elimination)

In [77]:
from sklearn.model_selection import train_test_split



X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42)


In [78]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)


rfe = RFE(estimator=rf_model, n_features_to_select=15)
rfe.fit(X_train, y_train)


selected_features_rfe = X_train.columns[rfe.support_].tolist()
print("Selected features (RFE):")
print(selected_features_rfe)


Selected features (RFE):
['lead_time', 'arrival_date_year', 'country', 'market_segment', 'previous_cancellations', 'booking_changes', 'deposit_type', 'agent', 'customer_type', 'adr', 'required_car_parking_spaces', 'total_of_special_requests', 'reservation_status', 'total_stay_nights', 'lead_to_stay_ratio']


8- Save

In [82]:
import pickle

# Define the path where all files will be saved
save_path = r"D:\DAproject\Project-DA\processed data"

# 1- Save the full DataFrame after feature engineering and encoding
df.to_pickle(f'{save_path}\\hotel_bookings_final.pkl')

# 2- Save numeric features only (X_numeric)
X_numeric.to_pickle(f'{save_path}\\numeric_features.pkl')

# 3- Save selected features from ANOVA F-test (top 8)
X_anova = df[anova_scores.head(8).index]
with open(f'{save_path}\\anova_selected_features.pkl', 'wb') as f:
    pickle.dump(X_anova, f)

# 4- Save selected features from Lasso
X_lasso = df[selected_features_lasso.index]
with open(f'{save_path}\\lasso_selected_features.pkl', 'wb') as f:
    pickle.dump(X_lasso, f)

# 5- Save selected features from RFE
X_rfe = df[selected_features_rfe]
with open(f'{save_path}\\rfe_selected_features.pkl', 'wb') as f:
    pickle.dump(X_rfe, f)

# 6- Save the target variable
y = df['is_canceled']
with open(f'{save_path}\\target_variable.pkl', 'wb') as f:
    pickle.dump(y, f)

# 7- Save the label encoders for categorical variables
with open(f'{save_path}\\label_encoders.pkl', 'wb') as f:
    pickle.dump(le_dict, f)

