1- Import Libraries

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier

In [10]:

file_path = r'D:\DAproject\Project-DA\processed data\hotel_bookings_cleaned.pkl'

df = pd.read_pickle(file_path)

print(df.head())
print(df.info())
print(df.describe())


          hotel  is_canceled  lead_time  arrival_date_year arrival_date_month  \
0  Resort Hotel            0        315               2015               July   
1  Resort Hotel            0        315               2015               July   
2  Resort Hotel            0          7               2015               July   
3  Resort Hotel            0         13               2015               July   
4  Resort Hotel            0         14               2015               July   

   arrival_date_week_number  arrival_date_day_of_month  \
0                        27                          1   
1                        27                          1   
2                        27                          1   
3                        27                          1   
4                        27                          1   

   stays_in_weekend_nights  stays_in_week_nights  adults  ...  deposit_type  \
0                        0                     0       2  ...    No Deposit   
1     

2- Create new features

In [16]:
df['total_stay_nights'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']  # Total nights stayed
df['total_guests'] = df['adults'] + df['children'] + df['babies']                       # Total guests
df['lead_to_stay_ratio'] = df['lead_time'] / (df['total_stay_nights'] + 1)             # Lead time divided by stay nights (+1 to avoid division by zero)
df['reservation_status_date'] = pd.to_datetime(df['reservation_status_date'])
df['reservation_status_year'] = df['reservation_status_year'].astype(int)
df['reservation_status_month'] = df['reservation_status_month'].astype(int)
df['reservation_status_day'] = df['reservation_status_day'].astype(int)



3-Encode Categorical Features

In [17]:

# Identify categorical columns
cat_cols = ['hotel', 'arrival_date_month', 'meal', 'country', 'market_segment',
            'distribution_channel', 'reserved_room_type', 'assigned_room_type',
            'deposit_type', 'customer_type', 'reservation_status' ]

# Apply Label Encoding
le = LabelEncoder()
for col in cat_cols:
    df[col] = df[col].astype(str)  # Convert all values to string
    df[col] = le.fit_transform(df[col])
print("\nAfter Encoding:")
for col in cat_cols:
    print(f"{col} unique values:", df[col].unique()[:10])



After Encoding:
hotel unique values: [1 0]
arrival_date_month unique values: [ 7  1  3  2 11  4  6  5  9  0]
meal unique values: [0 1 2 3 4]
country unique values: [ 42 135  80 126 153 131   0  47  30  33]
market_segment unique values: [3 2 6 5 1 4 7 0]
distribution_channel unique values: [1 0 3 4 2]
reserved_room_type unique values: [2 0 3 4 6 5 7 8 9 1]
assigned_room_type unique values: [ 4  0  5  6  8  7 10  1  9  3]
deposit_type unique values: [0 2 1]
customer_type unique values: [2 0 3 1]
reservation_status unique values: [1 0 2]


4-Prepare Feauture and Target

In [23]:

X_numeric = df.select_dtypes(include=['int64', 'float64']).drop(columns=['is_canceled'])
y = df['is_canceled']


5-Filter Method (ANOVA F-test)

In [24]:
from sklearn.feature_selection import SelectKBest, f_classif
import pandas as pd


selector_anova = SelectKBest(score_func=f_classif, k=8)
X_train_anova = selector_anova.fit_transform(X_numeric, y)

# Display top features
anova_scores = pd.Series(selector_anova.scores_, index=X_numeric.columns).sort_values(ascending=False)
print("Top 8 features (ANOVA F-test):")
print(anova_scores.head(8))


Top 8 features (ANOVA F-test):
reservation_status             328492.997951
lead_time                        3200.107435
required_car_parking_spaces      3069.989371
market_segment                   2990.466312
lead_to_stay_ratio               2589.702087
distribution_channel             2021.526695
deposit_type                     1685.036479
adr                              1616.858764
dtype: float64


6-Feature Selection with Lasso

In [26]:
from sklearn.linear_model import LassoCV
import pandas as pd


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42)


lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X_train, y_train)


lasso_coef = pd.Series(lasso.coef_, index=X_numeric.columns)


selected_features_lasso = lasso_coef[lasso_coef != 0].sort_values(ascending=False)
print("Selected features (Lasso):")
print(selected_features_lasso)


Selected features (Lasso):
market_segment               0.006110
total_stay_nights            0.004172
lead_to_stay_ratio           0.000594
arrival_date_week_number     0.000238
adr                          0.000209
arrival_date_month           0.000025
arrival_date_day_of_month    0.000017
lead_time                   -0.000024
agent                       -0.000047
days_in_waiting_list        -0.000218
country                     -0.000518
assigned_room_type          -0.001995
reservation_status_month    -0.003604
total_of_special_requests   -0.007749
reservation_status          -0.808549
dtype: float64


7-RFE (Recursive Feature Elimination)

In [27]:
from sklearn.model_selection import train_test_split


X_numeric = df.select_dtypes(include=['int64', 'float64']).drop(columns=['is_canceled'])
y = df['is_canceled']

X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42)


In [28]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)


rfe = RFE(estimator=rf_model, n_features_to_select=15)
rfe.fit(X_train, y_train)


selected_features_rfe = X_train.columns[rfe.support_].tolist()
print("Selected features (RFE):")
print(selected_features_rfe)


Selected features (RFE):
['lead_time', 'arrival_date_week_number', 'country', 'market_segment', 'previous_cancellations', 'deposit_type', 'agent', 'customer_type', 'adr', 'required_car_parking_spaces', 'total_of_special_requests', 'reservation_status', 'total_stay_nights', 'lead_to_stay_ratio', 'reservation_status_month']
