In [25]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

print("✅ Python & Libraries are ready!")


✅ Python & Libraries are ready!


In [26]:
print("--- Step 1: Load and Explore Data ---")

import pandas as pd

try:
    df = pd.read_csv("customer_booking.csv", encoding="latin1")
    print("✅ File loaded successfully!")
except FileNotFoundError:
    print("❌ ERROR: 'customer_booking.csv' not found. Make sure the file is in the same folder as this notebook.")

print(f"\nRows: {df.shape[0]}, Columns: {df.shape[1]}\n")
df.head()


--- Step 1: Load and Explore Data ---
✅ File loaded successfully!

Rows: 50000, Columns: 14



Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,5.52,0


In [27]:
print("\n--- Step 2: Target Distribution (booking_complete) ---")
print(df['booking_complete'].value_counts(normalize=True) * 100)




--- Step 2: Target Distribution (booking_complete) ---
booking_complete
0    85.044
1    14.956
Name: proportion, dtype: float64


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

print("\n--- Step 3: Preprocessing and Split ---")

# Identify features and target
X = df.drop('booking_complete', axis=1)
y = df['booking_complete']

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numerical_cols)
    ]
)

# Model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Create pipeline
clf = Pipeline(steps=[('preprocessing', preprocessor),
                     ('model', model)])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("✅ Data prepared and split successfully!")



--- Step 3: Preprocessing and Split ---
✅ Data prepared and split successfully!


In [29]:
print("\n--- Step 4: Model Training ---")
clf.fit(X_train, y_train)
print("✅ Training Complete!")



--- Step 4: Model Training ---
✅ Training Complete!


In [30]:
print("\n--- Step 5: Evaluation ---")
y_pred = clf.predict(X_test)

print("Classification Report:\n")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))



--- Step 5: Evaluation ---
Classification Report:

              precision    recall  f1-score   support

           0       0.87      0.98      0.92      8520
           1       0.54      0.13      0.21      1480

    accuracy                           0.86     10000
   macro avg       0.71      0.56      0.56     10000
weighted avg       0.82      0.86      0.81     10000


Confusion Matrix:

[[8360  160]
 [1289  191]]
