In [6]:
import os
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [7]:
# Location of the preprocessed bookings CSV.
# Set the BOOKING_DATA_PATH environment variable to point the notebook at a
# copy of the file on another machine; when it is unset, the original
# author's local path is used so existing behaviour is unchanged.
DATA_PATH = Path(
    os.environ.get(
        "BOOKING_DATA_PATH",
        r"C:\Users\noorh\final1_preprocessed_dataset.csv",
    )
)

# Load the whole dataset into memory; read_csv raises FileNotFoundError if
# DATA_PATH does not exist.
df = pd.read_csv(DATA_PATH)


In [8]:
# Split the frame into the label vector (y) and the predictor matrix (X).
# The label column is cast to int; every remaining column becomes a feature.
target_column = 'booking status_code'
y = df[target_column].astype(int)
X = df.drop(columns=[target_column])

In [9]:
# Keep the predictor column names as a plain list and print them as a
# 1-based numbered list so the model inputs are visible in the notebook.
features = X.columns.tolist()
print("Features used in model training:")
for position, column_name in enumerate(features, start=1):
    print(f"{position}. {column_name}")

Features used in model training:
1. number of adults
2. number of children
3. number of weekend nights
4. number of week nights
5. car parking space
6. lead time
7. average price
8. special requests
9. repeat_type
10. type of meal_code
11. room type_code
12. market segment type_code


In [10]:
# Hold out 20% of the rows for evaluation. The split is stratified on y and
# uses a fixed random_state so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [11]:
# Candidate classifiers keyed by the display name used in the results table.
# Insertion order is the order in which they are trained and reported.
# Every estimator is configured with class_weight='balanced'; the two
# tree-based models also get a fixed random_state.
models = {}
models["Logistic Regression"] = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
)
models["Decision Tree"] = DecisionTreeClassifier(
    random_state=42,
    class_weight='balanced',
)
models["Random Forest"] = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
)

In [12]:
# Accumulator for evaluation metrics: model display name -> {metric: value}.
results = dict()


In [13]:

# Train every candidate on the training split, predict on the held-out split,
# and store accuracy plus the precision/recall/F1 of class "0" (labelled
# "Not Cancelled" in the result keys) under the model's display name.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # output_dict=True returns nested dicts keyed by the stringified label.
    report = classification_report(y_test, y_pred, output_dict=True)
    class_zero_report = report["0"]

    summary = {"Accuracy": report["accuracy"]}
    summary["Precision (Not Cancelled)"] = class_zero_report["precision"]
    summary["Recall (Not Cancelled)"] = class_zero_report["recall"]
    summary["F1-score (Not Cancelled)"] = class_zero_report["f1-score"]
    results[name] = summary


In [14]:
# Print one banner per model followed by its metrics, each formatted to
# four decimal places.
for name in results:
    print(f"\n=== {name} ===")
    for metric, value in results[name].items():
        print(f"{metric}: {value:.4f}")


=== Logistic Regression ===
Accuracy: 0.7734
Precision (Not Cancelled): 0.8669
Recall (Not Cancelled): 0.7800
F1-score (Not Cancelled): 0.8211

=== Decision Tree ===
Accuracy: 0.8457
Precision (Not Cancelled): 0.8903
Recall (Not Cancelled): 0.8767
F1-score (Not Cancelled): 0.8834

=== Random Forest ===
Accuracy: 0.8768
Precision (Not Cancelled): 0.8951
Recall (Not Cancelled): 0.9235
F1-score (Not Cancelled): 0.9091


In [23]:
# joblib is imported with the other dependencies in the notebook's first
# cell, so this cell no longer carries its own import.

# Save the trained random forest model to the current working directory.
joblib.dump(models["Random Forest"], "booking_model.pkl")

# Save the training feature names, in column order, next to the model.
# Presumably consumers use this to arrange inference inputs the same way;
# verify against the loading code.
joblib.dump(X.columns.tolist(), "feature_columns.pkl")

['feature_columns.pkl']