In [52]:
import pandas as pd
import pprint

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [25]:
# importing the csv file
cust_booking = pd.read_csv("C:/Users/snkri/OneDrive/Desktop/virtual_interns/task2/customer_booking.csv", encoding="ISO-8859-1")

In [26]:
cust_booking.head()

Unnamed: 0,num_passengers,sales_channel,trip_type,purchase_lead,length_of_stay,flight_hour,flight_day,route,booking_origin,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete
0,2,Internet,RoundTrip,262,19,7,Sat,AKLDEL,New Zealand,1,0,0,5.52,0
1,1,Internet,RoundTrip,112,20,3,Sat,AKLDEL,New Zealand,0,0,0,5.52,0
2,2,Internet,RoundTrip,243,22,17,Wed,AKLDEL,India,1,1,0,5.52,0
3,1,Internet,RoundTrip,96,31,4,Sat,AKLDEL,New Zealand,0,0,1,5.52,0
4,2,Internet,RoundTrip,68,22,15,Wed,AKLDEL,India,1,0,1,5.52,0


In [27]:
cust_booking.nunique()

num_passengers             9
sales_channel              2
trip_type                  3
purchase_lead            470
length_of_stay           335
flight_hour               24
flight_day                 7
route                    799
booking_origin           104
wants_extra_baggage        2
wants_preferred_seat       2
wants_in_flight_meals      2
flight_duration           21
booking_complete           2
dtype: int64

In [28]:
cust_booking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   num_passengers         50000 non-null  int64  
 1   sales_channel          50000 non-null  object 
 2   trip_type              50000 non-null  object 
 3   purchase_lead          50000 non-null  int64  
 4   length_of_stay         50000 non-null  int64  
 5   flight_hour            50000 non-null  int64  
 6   flight_day             50000 non-null  object 
 7   route                  50000 non-null  object 
 8   booking_origin         50000 non-null  object 
 9   wants_extra_baggage    50000 non-null  int64  
 10  wants_preferred_seat   50000 non-null  int64  
 11  wants_in_flight_meals  50000 non-null  int64  
 12  flight_duration        50000 non-null  float64
 13  booking_complete       50000 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 5.3+ 

In [29]:
# Remove route, booking_origin
# convert categorical data to onehotencoded data

In [30]:
# making a copy dataframe
data_df = cust_booking

# dropping the unnecessary columns
data_df.drop(['route', 'booking_origin'], inplace=True, axis=1)

In [31]:
# defining a onehot encoder
oh_enc = OneHotEncoder()

In [32]:
# one hot encoding the categorical columns
enc_cols = pd.DataFrame(oh_enc.fit_transform(data_df[['sales_channel', 'trip_type', 'flight_day']]).toarray())

In [33]:
# replacing the categorical columns with encoded columns
data_df = data_df.join(enc_cols)
data_df.drop(['sales_channel', 'trip_type', 'flight_day'], inplace = True, axis = 1)

In [34]:
data_df.head()

Unnamed: 0,num_passengers,purchase_lead,length_of_stay,flight_hour,wants_extra_baggage,wants_preferred_seat,wants_in_flight_meals,flight_duration,booking_complete,0,...,2,3,4,5,6,7,8,9,10,11
0,2,262,19,7,1,0,0,5.52,0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1,112,20,3,0,0,0,5.52,0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2,243,22,17,1,1,0,5.52,0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,96,31,4,0,0,1,5.52,0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,2,68,22,15,1,0,1,5.52,0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [35]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   num_passengers         50000 non-null  int64  
 1   purchase_lead          50000 non-null  int64  
 2   length_of_stay         50000 non-null  int64  
 3   flight_hour            50000 non-null  int64  
 4   wants_extra_baggage    50000 non-null  int64  
 5   wants_preferred_seat   50000 non-null  int64  
 6   wants_in_flight_meals  50000 non-null  int64  
 7   flight_duration        50000 non-null  float64
 8   booking_complete       50000 non-null  int64  
 9   0                      50000 non-null  float64
 10  1                      50000 non-null  float64
 11  2                      50000 non-null  float64
 12  3                      50000 non-null  float64
 13  4                      50000 non-null  float64
 14  5                      50000 non-null  float64
 15  6 

In [54]:
# splitting the data_df into X and y
y = data_df[['booking_complete']]
X = data_df.drop(['booking_complete'], axis=1)

# renaming the columns
X = X.rename(str, axis = 'columns')
y = y.rename(str, axis = 'columns') 


In [55]:
# splitting the train data and validation data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.3)

In [56]:
# defining the model
clf = RandomForestClassifier(n_estimators = 100)

# fitting the model
clf.fit(X_train, y_train)

# predicting on the validation data
y_pred = clf.predict(X_val)

# accuracy of the model
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(y_val, y_pred))

  return fit_method(estimator, *args, **kwargs)


ACCURACY OF THE MODEL:  0.8460666666666666


In [5]:
cust_booking['booking_complete'] = cust_booking['booking_complete'].astype('')