In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset
# The CSV path is relative, so it is resolved against the notebook's working directory.
csv_path = 'hotel_bookings.csv'
df = pd.read_csv(csv_path)

# Preview the first rows (pandas shows five by default)
df.head()

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Number of distinct values per column (pandas leaves NaN out of the count by default).
df.nunique()

In [None]:
# Check for missing values and general information:
# prints each column's non-null count and dtype, plus the frame's memory usage.
df.info()

In [None]:
# Count the missing entries in every column (columns without gaps show 0).
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); for a frame with
# mixed dtypes pandas reports only the numeric columns by default.
df.describe()

In [None]:
# Column labels as read from the CSV; the preprocessing cell further down
# normalises them (strip, lower-case, underscores, no parentheses).
df.columns

In [None]:
# Missing-value counts restricted to the columns that actually contain gaps,
# sorted so the worst-affected columns come first. The full per-column listing
# is already produced by the earlier df.isna().sum() cell (isnull is an alias
# of isna), so repeating it verbatim here would add no information.
df.isnull().sum().loc[lambda counts: counts > 0].sort_values(ascending=False)

In [None]:
# Preprocess the data
# Normalise the column labels: trim surrounding whitespace, lower-case, turn
# spaces into underscores and drop parentheses.
# regex=False makes every replacement an explicit literal substring match. The
# default of `regex` differs between pandas versions (and pandas 1.x emits a
# FutureWarning for single-character patterns such as '('), so spelling it out
# keeps the behaviour identical everywhere.
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Replace every missing value with 0, in numeric and non-numeric columns alike.
# Rebinding (instead of inplace=True) yields the same frame contents while
# keeping the cell a plain "inputs -> new value" step.
df = df.fillna(0)

In [None]:
# Columns treated as categorical by the model. They are cast to str so that
# every value in a column has the same Python type before one-hot encoding.
categorical_features = [
    'hotel',
    'arrival_date_month',
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'agent',
    'company',
    'customer_type',
]
categorical_as_text = df[categorical_features].astype(str)
df[categorical_features] = categorical_as_text

In [None]:
# Define features and target
# Feature columns, in the order they appear in the design matrix.
feature_columns = [
    'hotel',
    'lead_time',
    'arrival_date_year',
    'arrival_date_month',
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'babies',
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'reserved_room_type',
    'assigned_room_type',
    'booking_changes',
    'deposit_type',
    'agent',
    'company',
    'days_in_waiting_list',
    'customer_type',
    'required_car_parking_spaces',
    'total_of_special_requests',
]
X = df[feature_columns]

# Regression target: the 'adr' column.
y = df['adr']

In [None]:
# Preprocessor
# Numeric columns are standardised; categorical columns are one-hot encoded.
numeric_features = [
    'lead_time',
    'arrival_date_year',
    'arrival_date_week_number',
    'arrival_date_day_of_month',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'babies',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'booking_changes',
    'days_in_waiting_list',
    'required_car_parking_spaces',
    'total_of_special_requests',
]
numeric_step = ('num', StandardScaler(), numeric_features)

# handle_unknown='ignore': a category not seen during fit is encoded as all
# zeros at transform time instead of raising an error.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Chain preprocessing and the regressor into a single estimator, then fit it
# on the training split only.
model_steps = [
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
]
pipeline = Pipeline(steps=model_steps)
pipeline.fit(X_train, y_train)

In [None]:
# Make predictions
# Runs the fitted pipeline (preprocessing, then regression) on the held-out
# rows; y_pred is in the same row order as X_test / y_test.
y_pred = pipeline.predict(X_test)

In [None]:
# Score the predictions against the held-out targets.
mse = mean_squared_error(y_test, y_pred)  # mean of the squared errors
print(f'Mean Squared Error: {mse}')

r2 = r2_score(y_test, y_pred)  # coefficient of determination
print(f'R^2 Score: {r2}')

In [None]:
# Scatter of actual target values (x) against model predictions (y),
# drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_title('True vs Predicted Values')
ax.set_xlabel('True Values')
ax.set_ylabel('Predictions')
plt.show()

In [None]:
# Residual = actual minus predicted, one value per test row.
residuals = y_test - y_pred

# Histogram of the residuals with a kernel-density curve overlaid.
residual_ax = sns.histplot(data=residuals, kde=True)
residual_ax.set_title('Residuals Distribution')
plt.show()