1- Import Libraries

In [69]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [86]:

file_path = r'D:\DAproject\Project-DA\processed data\hotel_bookings_cleaned.pkl'

df = pd.read_pickle(file_path)
df


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,has_agent
0,Resort Hotel,0,315,2015,July,27,1,0,0,2,...,No Deposit,0.0,0,Transient,0.00,0,0,Check-Out,2015-07-01,True
1,Resort Hotel,0,315,2015,July,27,1,0,0,2,...,No Deposit,0.0,0,Transient,0.00,0,0,Check-Out,2015-07-01,True
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,0.0,0,Transient,75.00,0,0,Check-Out,2015-07-02,True
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,0,Transient,75.00,0,0,Check-Out,2015-07-02,True
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,0,Transient,98.00,0,1,Check-Out,2015-07-03,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,35,30,2,5,2,...,No Deposit,394.0,0,Transient,96.14,0,0,Check-Out,2017-09-06,True
119386,City Hotel,0,102,2017,August,35,31,2,5,3,...,No Deposit,9.0,0,Transient,225.43,0,2,Check-Out,2017-09-07,True
119387,City Hotel,0,34,2017,August,35,31,2,5,2,...,No Deposit,9.0,0,Transient,157.71,0,4,Check-Out,2017-09-07,True
119388,City Hotel,0,109,2017,August,35,31,2,5,2,...,No Deposit,89.0,0,Transient,104.40,0,0,Check-Out,2017-09-07,True


2- Create new features

In [71]:
df['total_stay_nights'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']
df['total_guests'] = df['adults'] + df['children'] + df['babies']
df['lead_to_stay_ratio'] = np.where(
    df['total_stay_nights'] > 0,
    df['lead_time'] / df['total_stay_nights'],
    df['lead_time']
)

df['reservation_status_date'] = pd.to_datetime(df['reservation_status_date'])
df['reservation_status_year'] = df['reservation_status_date'].dt.year
df['reservation_status_month'] = df['reservation_status_date'].dt.month
df['reservation_status_day'] = df['reservation_status_date'].dt.day


3-Encode Categorical Features

In [72]:
cat_cols = ['hotel', 'arrival_date_month', 'meal', 'country', 'market_segment',
            'distribution_channel', 'reserved_room_type', 'assigned_room_type',
            'deposit_type', 'customer_type', 'reservation_status']

le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = df[col].astype(str)
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le

print("\nAfter Encoding:")
for col in cat_cols:
    print(f"{col} unique values:", df[col].unique()[:10])


After Encoding:
hotel unique values: [1 0]
arrival_date_month unique values: [ 5  1 11 10  9  2  4  3  7  0]
meal unique values: [0 1 2 3 4]
country unique values: [136  60 170  52  77  57   0 140 125 128]
market_segment unique values: [3 2 6 5 1 4 7 0]
distribution_channel unique values: [1 0 3 4 2]
reserved_room_type unique values: [2 0 3 4 6 5 7 8 9 1]
assigned_room_type unique values: [ 2  0  3  4  6  5  8  1  7 11]
deposit_type unique values: [0 2 1]
customer_type unique values: [2 0 3 1]
reservation_status unique values: [1 0 2]


4-Prepare Feauture and Target

In [74]:

X_numeric = df.select_dtypes(include=['int64', 'float64']).drop(columns=['is_canceled'])
y = df['is_canceled']


5-Filter Method (ANOVA F-test)

In [75]:
from sklearn.feature_selection import SelectKBest, f_classif
import pandas as pd


selector_anova = SelectKBest(score_func=f_classif, k=8)
X_train_anova = selector_anova.fit_transform(X_numeric, y)

# Display top features
anova_scores = pd.Series(selector_anova.scores_, index=X_numeric.columns).sort_values(ascending=False)
print("Top 8 features (ANOVA F-test):")
print(anova_scores.head(8))


Top 8 features (ANOVA F-test):
reservation_status             328492.997951
lead_time                        3200.107435
required_car_parking_spaces      3069.989371
market_segment                   2990.466312
lead_to_stay_ratio               2160.342147
distribution_channel             2021.526695
deposit_type                     1685.036479
adr                              1616.858764
dtype: float64


6-Feature Selection with Lasso

In [76]:
from sklearn.linear_model import LassoCV
import pandas as pd


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42)

#Standardize features for Lasso
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#  LassoCV for feature selection
lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X_train_scaled, y_train)

#  Get non-zero coefficient features
lasso_coef = pd.Series(lasso.coef_, index=X_numeric.columns)
selected_features_lasso = lasso_coef[lasso_coef != 0].sort_values(ascending=False)
print("Selected features (Lasso):")
print(selected_features_lasso)



Selected features (Lasso):
country                           0.021794
market_segment                    0.017401
total_stay_nights                 0.013515
lead_to_stay_ratio                0.009800
reserved_room_type                0.009010
deposit_type                      0.006072
adr                               0.005177
previous_cancellations            0.003221
children                          0.002567
arrival_date_year                 0.001984
agent                             0.001668
arrival_date_day_of_month         0.001396
stays_in_weekend_nights           0.000037
customer_type                    -0.000085
arrival_date_month               -0.000428
is_repeated_guest                -0.001306
days_in_waiting_list             -0.002569
adults                           -0.002720
lead_time                        -0.002786
arrival_date_week_number         -0.003058
previous_bookings_not_canceled   -0.003222
booking_changes                  -0.006091
assigned_room_type         

7-RFE (Recursive Feature Elimination)

In [77]:
from sklearn.model_selection import train_test_split



X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42)


In [78]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)


rfe = RFE(estimator=rf_model, n_features_to_select=15)
rfe.fit(X_train, y_train)


selected_features_rfe = X_train.columns[rfe.support_].tolist()
print("Selected features (RFE):")
print(selected_features_rfe)


Selected features (RFE):
['lead_time', 'arrival_date_year', 'country', 'market_segment', 'previous_cancellations', 'booking_changes', 'deposit_type', 'agent', 'customer_type', 'adr', 'required_car_parking_spaces', 'total_of_special_requests', 'reservation_status', 'total_stay_nights', 'lead_to_stay_ratio']


8- Save

In [82]:
import pickle

# Define the path where all files will be saved
save_path = r"D:\DAproject\Project-DA\processed data"

# 1- Save the full DataFrame after feature engineering and encoding
df.to_pickle(f'{save_path}\\hotel_bookings_final.pkl')

# 2- Save numeric features only (X_numeric)
X_numeric.to_pickle(f'{save_path}\\numeric_features.pkl')

# 3- Save selected features from ANOVA F-test (top 8)
X_anova = df[anova_scores.head(8).index]
with open(f'{save_path}\\anova_selected_features.pkl', 'wb') as f:
    pickle.dump(X_anova, f)

# 4- Save selected features from Lasso
X_lasso = df[selected_features_lasso.index]
with open(f'{save_path}\\lasso_selected_features.pkl', 'wb') as f:
    pickle.dump(X_lasso, f)

# 5- Save selected features from RFE
X_rfe = df[selected_features_rfe]
with open(f'{save_path}\\rfe_selected_features.pkl', 'wb') as f:
    pickle.dump(X_rfe, f)

# 6- Save the target variable
y = df['is_canceled']
with open(f'{save_path}\\target_variable.pkl', 'wb') as f:
    pickle.dump(y, f)

# 7- Save the label encoders for categorical variables
with open(f'{save_path}\\label_encoders.pkl', 'wb') as f:
    pickle.dump(le_dict, f)

