In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform


In [None]:
# Load the raw bookings table; the path is resolved relative to the working directory.
bookings_csv = 'hotel_bookings.csv'
df = pd.read_csv(bookings_csv)

In [None]:
def impute_median(series: pd.Series) -> pd.Series:
    """Return ``series`` with its missing values replaced by its own median.

    Parameters
    ----------
    series : pd.Series
        Numeric series that may contain NaN entries.

    Returns
    -------
    pd.Series
        A new series (the input is not modified) in which every NaN has been
        filled with the median of the non-missing values.
    """
    median_value = series.median()
    return series.fillna(median_value)

In [None]:
# Replace missing values in 'children' with the column median.
# impute_median already operates on a whole Series, so it is called directly.
df['children'] = impute_median(df['children'])

In [None]:
# Discard bookings whose 'country' is missing, then renumber the rows 0..n-1.
# The fresh index matters later: the one-hot frames are joined to the features by index.
df = df.dropna(subset=['country']).reset_index(drop=True)

In [None]:
# Missing-value indicator: 1 when 'agent' is absent for the booking, 0 when it is present.
df['agent_encoded'] = df['agent'].isna().astype(int)

In [None]:
# Missing-value indicator: 1 when 'company' is absent for the booking, 0 when it is present.
df['company_encoded'] = df['company'].isna().astype(int)

In [None]:
# Columns used as model inputs (numeric, indicator and categorical predictors).
columns_to_select = [
    'lead_time',
    'country',
    'deposit_type',
    'market_segment',
    'assigned_room_type',
    'distribution_channel',
    'customer_type',
    'agent_encoded',
    'company_encoded',
    'arrival_date_week_number',
]

# Narrow the cleaned bookings table down to just those predictors.
final_df = df.loc[:, columns_to_select]


In [None]:
from sklearn.preprocessing import OneHotEncoder

# Columns that are already numeric (or 0/1 indicators) are carried over as-is.
passthrough_columns = ['lead_time', 'agent_encoded', 'company_encoded', 'arrival_date_week_number']
transformed_df = final_df[passthrough_columns].copy()

# Perform one-hot encoding, one categorical attribute at a time.
attributes_to_encode = ['country', 'deposit_type', 'market_segment', 'assigned_room_type', 'distribution_channel', 'customer_type']
for attribute in attributes_to_encode:
    # Use the encoder's default (sparse) output and densify it explicitly.
    # The old `sparse=False` keyword was deprecated in scikit-learn 1.2 and later
    # removed in favour of `sparse_output`; `.toarray()` works on every version.
    onehot_encoder = OneHotEncoder()
    onehot_encoded = onehot_encoder.fit_transform(final_df[[attribute]]).toarray()

    # Reuse final_df's index so the column-wise concat below aligns row-for-row
    # even if the frame's index is not a clean 0..n-1 range.
    onehot_encoded_df = pd.DataFrame(
        onehot_encoded,
        columns=onehot_encoder.get_feature_names_out([attribute]),
        index=final_df.index,
    )

    # Concatenate the one-hot encoded features with the accumulated feature frame
    transformed_df = pd.concat([transformed_df, onehot_encoded_df], axis=1)
    print(transformed_df.shape)

# Standardise the two continuous features to zero mean / unit variance.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test split
# made later in the notebook, so test-set statistics leak into the scaling.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(transformed_df[['lead_time', 'arrival_date_week_number']])
transformed_df[['lead_time', 'arrival_date_week_number']] = scaled_data



(118902, 181)
(118902, 184)




(118902, 192)
(118902, 204)
(118902, 209)




(118902, 213)


# **Split data into Train-Test Sets**

In [None]:
# Features are the encoded/scaled frame; the target is the cancellation flag.
# Both derive from the same `df`, so they line up row-for-row.
X, y = transformed_df, df['is_canceled']
print(X.shape, y.shape)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

(118902, 213) (118902,)


# **Models**

# **6. Support Vector Machine (SVM)**

Build, train and test SVM model.
Based on our EDA results, we believe that our attributes have a complex relationship with each other, but we are unsure what form that relationship takes. Hence, we will use the Radial Basis Function (RBF) kernel as the baseline.

In [None]:
# Baseline: RBF-kernel SVM with default hyperparameters, fitted on the training split.
svm_model = SVC(kernel='rbf').fit(X_train, y_train)

# Predict the held-out split.
y_pred = svm_model.predict(X_test)

# Score the predictions against the true labels.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the headline metrics, one per line.
for line in (
    f'Accuracy: {accuracy}',
    f'Precision: {precision}',
    f'Recall: {recall}',
    f'F1 Score: {f1}\n',
):
    print(line)

# Confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", cm, sep="\n")

# Per-class precision / recall / F1 breakdown.
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")


Due to the large dataset (118902 x 213), we foresee that tuning all the parameters simultaneously through RandomizedSearchCV or GridSearchCV may be too computationally intensive and time-consuming. Hence, we will tune the parameters manually in stages. We will first find the best kernel.


In [None]:
# Kernels to compare, each with SVC's default hyperparameters.
kernels = ['linear', 'poly', 'sigmoid']

for kernel in kernels:
    print(f"Evaluating Kernel: {kernel}\n")

    # Fit on the training split, then predict the held-out split.
    svm_model = SVC(kernel=kernel).fit(X_train, y_train)
    y_pred = svm_model.predict(X_test)

    # Score the predictions against the true labels.
    accuracy, precision, recall, f1 = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # Report the headline metrics, one per line.
    for line in (
        f'Accuracy: {accuracy}',
        f'Precision: {precision}',
        f'Recall: {recall}',
        f'F1 Score: {f1}\n',
    ):
        print(line)

    # Confusion matrix of true vs. predicted labels.
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:", cm, sep="\n")

    # Per-class breakdown, followed by a blank separator before the next kernel.
    print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
    print("\n")

As we can see, the Linear kernel has the highest precision, but the RBF kernel has the highest F1 score. Thus, we will further tune these 2 kernels.

The Linear kernel is primarily affected only by the C parameter. C is the regularization parameter that controls the trade-off between achieving a low training error and a low testing error.

In [None]:
# Candidate values for the regularisation parameter C.
c_values = [0.1, 1, 10, 100]

# Highest precision seen so far and the setting that produced it.
# NOTE(review): the winner is picked by precision on the test split, so the
# reported best score is optimistic; a separate validation split would avoid that.
best_precision = 0
best_params = {}

for c in c_values:
    # Linear-kernel SVM at the current C, fitted on the training split.
    svm_model = SVC(kernel='linear', C=c).fit(X_train, y_train)
    y_pred = svm_model.predict(X_test)

    # Score the predictions against the true labels.
    accuracy, precision, recall, f1 = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # Report the metrics for this C value, one per line.
    for line in (
        f'Performance metrics for C={c}:',
        f'Accuracy: {accuracy}',
        f'Precision: {precision}',
        f'Recall: {recall}',
        f'F1 Score: {f1}\n',
    ):
        print(line)

    # Keep this setting only if it strictly beats the best precision so far.
    if precision > best_precision:
        best_precision, best_params = precision, {'C': c}

    # Confusion matrix of true vs. predicted labels.
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:", cm, sep="\n")

    # Per-class breakdown, followed by a blank separator before the next C.
    print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
    print("\n")

# Summarise the setting with the highest precision.
print("Best parameters found for the best precision score:", best_params, sep="\n")


The RBF kernel is affected by both the C and Gamma parameters. Gamma applies to non-linear kernels and defines how far the influence of a single training example reaches.

In [None]:
# Candidate values for C and gamma.
c_values = [0.1, 1, 10]
gamma_values = [0.1, 1, 10, 100]

# Highest precision seen so far and the setting that produced it.
# NOTE(review): the winner is picked by precision on the test split, so the
# reported best score is optimistic; a separate validation split would avoid that.
best_precision = 0
best_params = {}

# Every (C, gamma) pair, with C varying slowest - same order as a nested loop.
param_grid = [
    (c_candidate, gamma_candidate)
    for c_candidate in c_values
    for gamma_candidate in gamma_values
]

for c, gamma in param_grid:
    # RBF-kernel SVM at the current setting, fitted on the training split.
    svm_model = SVC(kernel='rbf', C=c, gamma=gamma).fit(X_train, y_train)
    y_pred = svm_model.predict(X_test)

    # Score the predictions against the true labels.
    accuracy, precision, recall, f1 = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    # Report the metrics for this (C, gamma) pair, one per line.
    for line in (
        f'Performance metrics for C={c} and gamma={gamma}:',
        f'Accuracy: {accuracy}',
        f'Precision: {precision}',
        f'Recall: {recall}',
        f'F1 Score: {f1}\n',
    ):
        print(line)

    # Keep this setting only if it strictly beats the best precision so far.
    if precision > best_precision:
        best_precision, best_params = precision, {'C': c, 'gamma': gamma}

    # Confusion matrix of true vs. predicted labels.
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:", cm, sep="\n")

    # Per-class breakdown, followed by a blank separator before the next pair.
    print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
    print("\n")

# Summarise the setting with the highest precision.
print("Best parameters found for the best precision score:", best_params, sep="\n")


As we can see, the parameters that provided the best precision score are C = 0.1 and Gamma = 100 with the RBF kernel.

Our initial range for C and Gamma was 0.1 to 1000, increasing by a factor of 10 at each step. However, we saw that the model took exponentially longer to run. Also, we saw that the precision score for C = 10 was higher than that of C = 100. We concluded that C = 100 was too large and was already causing overfitting, thus we decided to remove C = 1000 as it was unlikely to return better results. We had similar results for Gamma as well.


Below is the initial, standard approach to hyperparameter tuning for the SVM. However, we noted that it was too computationally intensive and time-consuming, so we switched to tuning the parameters in 2 stages: first finding the best kernel, then finding the best parameters for the chosen kernels.

In [None]:
# Search space sampled by RandomizedSearchCV.
param_dist = {
    'C': reciprocal(0.1, 100),        # log-uniform over [0.1, 100]
    'gamma': reciprocal(0.1, 100),    # log-uniform over [0.1, 100]
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 4],
    'coef0': uniform(0, 5),           # scipy uniform(loc, scale): values in [0, 5]
}

# Create an SVM model and a randomized search that draws 20 settings,
# each scored by 10-fold cross-validated precision.
svm_model = SVC()
random_search = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=param_dist,
    n_iter=20,
    cv=10,
    scoring='precision',
    random_state=0,
    n_jobs=-1,
    verbose=2,
)

# Tune on the training split only. Fitting on the full X / y would let the
# held-out test rows influence the chosen hyperparameters, so X_test could no
# longer give an unbiased estimate of the tuned model.
random_search.fit(X_train, y_train)

print("Best parameters found: ", random_search.best_params_)
print("Best precision score: ", random_search.best_score_)


However, we also note that although this is better than manually iterating through the parameters in stages, it still may not provide us with the best parameters. This is because RandomizedSearchCV only tries a random sample of parameter combinations and may not try the best combination. Hence, using GridSearchCV may be a better option, but it would be too computationally intensive and time-consuming. The use of k-fold cross-validation may also help with getting a better model, but this too will add to the computational intensity and time taken.