In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

In [2]:
# Upload the Kaggle API token
from google.colab import files
files.upload()  # Upload kaggle.json

# Setup Kaggle API
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download dataset


!kaggle datasets download -d sukhmansaran/british-airways-datasets
# Unzip dataset
!unzip chest-xray-pneumonia.zip -d ./data


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/sukhmansaran/british-airways-datasets
License(s): CC-BY-NC-SA-4.0
Downloading british-airways-datasets.zip to /content
  0% 0.00/414k [00:00<?, ?B/s]
100% 414k/414k [00:00<00:00, 777MB/s]
unzip:  cannot find or open chest-xray-pneumonia.zip, chest-xray-pneumonia.zip.zip or chest-xray-pneumonia.zip.ZIP.


In [3]:
!unzip /content/british-airways-datasets.zip


Archive:  /content/british-airways-datasets.zip
  inflating: customer_booking.csv    


In [4]:
df = pd.read_csv('/content/customer_booking.csv', encoding='latin1')
df.head()

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,5.52,0


In [5]:
df.shape

(50000, 14)

## Handling Missing values

In [6]:
print("checking missing values: ")
print(df.isnull().sum())

checking missing values: 
num_passengers           0
sales_channel            0
trip_type                0
purchase_lead            0
length_of_stay           0
flight_hour              0
flight_day               0
route                    0
booking_origin           0
wants_extra_baggage      0
wants_preferred_seat     0
wants_in_flight_meals    0
flight_duration          0
booking_complete         0
dtype: int64


In [7]:
df.dtypes

Unnamed: 0,0
num_passengers,int64
sales_channel,object
trip_type,object
purchase_lead,int64
length_of_stay,int64
flight_hour,int64
flight_day,object
route,object
booking_origin,object
wants_extra_baggage,int64


In [8]:
# delete some columns
df = df.drop (['sales_channel', 'trip_type' ,'route'], axis = 1)

In [9]:
df.shape

(50000, 11)

In [10]:
df.head()

Unnamed: 0,num_passengers,purchase_lead,length_of_stay,flight_hour,flight_day,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,262,19,7,Sat,New Zealand,1,0,0,5.52,0
1,1,112,20,3,Sat,New Zealand,0,0,0,5.52,0
2,2,243,22,17,Wed,India,1,1,0,5.52,0
3,1,96,31,4,Sat,New Zealand,0,0,1,5.52,0
4,2,68,22,15,Wed,India,1,0,1,5.52,0


## Label Encoding (better for Decision Trees):

In [11]:
from sklearn.preprocessing import LabelEncoder
le_day = LabelEncoder()
le_origin = LabelEncoder()

df['flight_day'] = le_day.fit_transform(df['flight_day'])
df['booking_origin'] = le_origin.fit_transform(df['booking_origin'])

In [12]:
df.head()

Unnamed: 0,num_passengers,purchase_lead,length_of_stay,flight_hour,flight_day,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,262,19,7,2,61,1,0,0,5.52,0
1,1,112,20,3,2,61,0,0,0,5.52,0
2,2,243,22,17,6,36,1,1,0,5.52,0
3,1,96,31,4,2,61,0,0,1,5.52,0
4,2,68,22,15,6,36,1,0,1,5.52,0


## Seperate Features and Target variable

In [13]:
X = df.drop('booking_complete', axis=1)
y = df['booking_complete']

# Apply Standard scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# Split Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Training  a Decision Tree Classifier
print("\n Training Decision Tree Classifier")
model = DecisionTreeClassifier(random_state = 42)
model.fit(X_train, y_train)


 Training Decision Tree Classifier


## Model Evaluation (predict on test data)

In [14]:
y_pred = model.predict(X_test)

## Calculate Accuracy

In [15]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy: .2f}")

Accuracy:  0.78


## Classification report

In [16]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.86      0.87     12784
           1       0.27      0.30      0.28      2216

    accuracy                           0.78     15000
   macro avg       0.57      0.58      0.57     15000
weighted avg       0.79      0.78      0.78     15000



## Confusion Matrix

In [17]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Confusion Matrix:
[[10969  1815]
 [ 1557   659]]
