In [122]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Data Loading and Preprocessing:

- **Data Preparation:**
  - The dataset `booking.csv` is loaded and relevant columns are selected.
  - Categorical variables (`type_of_meal_plan`, `room_type_reserved`, `booking_status`) are encoded using `LabelEncoder`.
  
- **Data Splitting:**
  - The dataset is split into training and testing sets using `train_test_split`.
  
- **Feature Scaling:**
  - Features are standardized using `StandardScaler` to ensure all features contribute equally to the model.

In [5]:
# Load data and select relevant columns
data = pd.read_csv("booking.csv")
data = data[['no_of_adults', 'no_of_children', 'no_of_weekend_nights','no_of_week_nights', 'type_of_meal_plan','room_type_reserved','no_of_previous_cancellations','avg_price_per_room','booking_status']]

In [4]:
labelencoder = preprocessing.LabelEncoder()

In [9]:
# Encode categorical variables
data['type_of_meal_plan'] = labelencoder.fit_transform(data['type_of_meal_plan'])
data['room_type_reserved'] = labelencoder.fit_transform(data['room_type_reserved'])
data['booking_status'] = labelencoder.fit_transform(data['booking_status'])

In [13]:
# Define features (x) and target (y)
x = data[['no_of_adults', 'no_of_children', 'no_of_weekend_nights','no_of_week_nights', 'type_of_meal_plan','room_type_reserved','no_of_previous_cancellations','avg_price_per_room']]
y = data[['booking_status']]

In [14]:
# Split data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3)

In [106]:
# Standardize features
scaler = StandardScaler()
scaled_x_train = scaler.fit_transform(x_train)
scaled_x_test = scaler.fit_transform(x_test)

# Model Training and Evaluation

## Logistic Regression
 - A `LogisticRegression` model is instantiated.
  - The scaled training data (`scaled_x_train`) and target (`y_train`) are used to fit the model.
  
- **Model Evaluation:**
  - Predictions are made on the scaled test data (`scaled_x_test`).
  - The accuracy score of the model is computed using `accuracy_score` and printed.

In [128]:
# Function to print model accuracy
def modelresults(predictions):
    print(f"Accuracy score of the model is {accuracy_score(y_test,predictions)}")

In [21]:
# Logistic Regression model
log_model = LogisticRegression()
log_model.fit(scaled_x_train, y_train)

  y = column_or_1d(y, warn=True)


In [133]:
log_predictions = log_model.predict(scaled_x_test) 
modelresults(log_predictions)

Accuracy score of the model is 0.663603785720849


## K Nearest Neighbors (KNN) 
K Nearest Neighbors is a non-parametric model used for classification. The training and evaluation process with grid search:

- **Model Setup:**
  - A `KNeighborsClassifier` is initialized.
  
- **Hyperparameter Tuning:**
  - A pipeline (`Pipeline`) is created with scaling and KNN.
  - Grid search (`GridSearchCV`) is used to find the optimal number of neighbors (`k`) through cross-validation (`cv=5`).
  
- **Grid Search Execution:**
  - The best `k` value is determined based on the highest accuracy score during cross-validation.
  
- **Model Training with Best Parameters:**
  - The optimal KNN model is re-trained using the scaled training data (`scaled_x_train`).
  
- **Model Evaluation:**
  - Predictions are made on the scaled test data (`scaled_x_test`) using the re-trained model.
  - The accuracy score of the model is computed and printed.

In [43]:
# K Nearest Neighbors model with Grid Search

knn = KNeighborsClassifier()
# Define the range of k values
k_values = list(range(1, 30))

# Create a pipeline for KNN with scaling
pipe = Pipeline(steps=[("knn", knn)])

# Define parameters grid for GridSearchCV
param_grid = {"knn__n_neighbors": k_values}

# Perform Grid Search Cross-Validation
cv_classifier = GridSearchCV(pipe, param_grid, cv=5, scoring="accuracy")
cv_classifier.fit(scaled_x_train, y_train.values.ravel())

# Print the best parameters
print(cv_classifier.best_params_)

{'knn__n_neighbors': 9}


In [130]:
#{'knn__n_neighbors': 9}
# Make predictions using the best model found
pred_gridknn = cv_classifier.predict(scaled_x_test)
modelresults(pred_gridknn)

Accuracy score of the model is 0.7175411191766976


## Random Forest

Random Forest is an ensemble learning method for classification. It's trained and evaluated with grid search:

- **Model Setup:**
  - A `RandomForestClassifier` is instantiated.
  
- **Hyperparameter Tuning:**
  - Grid search (`GridSearchCV`) is applied to find the optimal combination of `n_estimators`, `max_features`, `bootstrap`, and `oob_score`.
  
- **Grid Search Execution:**
  - The best combination of hyperparameters is determined based on the highest accuracy score during cross-validation.
  
- **Model Training with Best Parameters:**
  - The optimal Random Forest model is re-trained using the scaled training data (`scaled_x_train`).
  
- **Model Evaluation:**
  - Predictions are made on the scaled test data (`scaled_x_test`) using the re-trained model.
  - The accuracy score of the model is computed and printed.

In [30]:
# Random Forest model with Grid Search
rfr_model = RandomForestClassifier()

# Define ranges for hyperparameters
n_estimators = [32,64,128]
max_features = [2,3]
bootstrap = [True, False]
oob_score = [True, False]

# Define parameter grid for GridSearchCV
param_grid_rfr = {"n_estimators":n_estimators, "max_features": max_features, "bootstrap":bootstrap, "oob_score":oob_score}

# Perform Grid Search Cross-Validation
grid_rfr = GridSearchCV(rfr_model, param_grid_rfr)
grid_rfr.fit(scaled_x_train,y_train.values.ravel())
print(grid_rfr.best_params_)

30 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lucia\Desktop\Desktop\Portfolio\Data Science Projects\data-science-projects\pyenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lucia\Desktop\Desktop\Portfolio\Data Science Projects\data-science-projects\pyenv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\lucia\Desktop\Desktop\Portfolio\Data Science Projects\data-science-projects

{'bootstrap': True, 'max_features': 3, 'n_estimators': 32, 'oob_score': False}


In [129]:
# Make predictions using the best model found
predsrfc = grid_rfr.predict(scaled_x_test)
modelresults(predsrfc)

Accuracy score of the model is 0.6912616006615823


## Prediction for Customer Data

After training the models, they are used to predict whether a new customer will cancel their booking:

- **Customer Data Preparation:**
  - Example data (`customerdata`) representing a customer who hasn't canceled is scaled using the previously fitted `StandardScaler`.
  
- **Prediction Using KNN Model:**
  - The scaled customer data is fed into the trained KNN model (`cv_classifier`) to predict cancellation status.
  
- **Interpreting Prediction:**
  - A function (`cancellation`) interprets the prediction result:
    - If predicted as `1`, the customer "won't cancel the booking".
    - If predicted as `0`, the customer "will cancel the booking".
  
- **Output:**
  - The interpretation is printed based on the KNN model's prediction.

In [125]:
# Example customer data for prediction
customerdata = np.array([2, 0, 1, 2, 0, 0, 0, 65]) # values from first row from a not cancelled booking

# Scale customer data using the same scaler
scaled_customerdata = scaler.transform(customerdata.reshape(1, -1))

# Predict whether the customer will cancel the booking using KNN model
prediction = cv_classifier.predict(scaled_customerdata)
print("Prediction:", prediction)

Prediction: [1]




In [126]:
# Function to interpret prediction result
def cancelation(scaled_customerdata):
    result = ""
    predictions = cv_classifier.predict(scaled_customerdata)
    if predictions == 1:
        result = "won't cancel the booking"
    elif predictions == 0:
       result = "will cancel the booking"
    else: print("error in prediction")
    return result

In [127]:
print("This costumer {} according to the K Nearest Neighbors model".format(cancelation(scaled_customerdata)))

This costumer won't cancel the booking according to the K Nearest Neighbors model
