# Data Science Challenge

In [1]:
# If additional packages are needed but are not installed by default, uncomment the last two lines of this cell
# and replace <package list> with a list of additional packages.
# This will ensure the notebook has all the dependencies and works everywhere

#import sys
#!{sys.executable} -m pip install <package list>

In [2]:
# Libraries
import pandas as pd
# Allow up to 101 columns to be rendered before pandas truncates a wide DataFrame display.
pd.set_option("display.max_columns", 101)

## Data Description

Column | Description
:---|:---
`id` | Unique identifier for each booking.
`lead_time` | Time between booking date and reservation date (in days)
`arrival_week` | Week number of the arrival date.
`duration` | Booking duration (in Days)
`prev_cancel` | Number of previous bookings that were cancelled by the customer prior to the current booking.
`booking_changes` | Number of changes/amendments made to the booking from the moment the booking was entered on the PMS until the moment of check-in or cancellation. 
`waiting_period` | Waiting period for booking confirmation (in Days)	
`per_Day_price` | Per night booking price (in US $).
`parking` | Number of car parking spaces required by the customer.
`special_request` | Number of special requests made by the customer.
`segment` | Market segment designation. In categories, “TA” means “Travel Agents” and “TO” means “Tour Operators”.
`deposit` | Whether the customer made a deposit to guarantee the booking.
`cust_type` | Type of booking, assuming one of four categories.
`is_cancelled` |Value indicating if the booking was cancelled (1) or not (0).

## Data Wrangling & Visualization

In [47]:
# Read the labelled training bookings from the CSV shipped next to this notebook
data = pd.read_csv(filepath_or_buffer="train.csv")

In [48]:
# Preview the first five training rows
data.head(n=5)

Unnamed: 0,id,lead_time,arrival_week,duration,prev_cancel,booking_changes,waiting_period,per_Day_price,parking,special_request,segment,deposit,cust_type,is_cancelled
0,200,245,13,5,0,0,0,78.26,0,2,Online TA,No Deposit,Transient,0
1,201,170,25,2,0,0,0,130.5,0,0,Online TA,No Deposit,Transient,0
2,202,304,45,3,0,0,0,89.0,0,0,Offline TA/TO,Non Refund,Transient,1
3,203,19,7,1,0,0,0,82.13,0,2,Online TA,No Deposit,Transient,0
4,204,214,27,4,0,0,0,89.1,0,1,Direct,No Deposit,Transient-Party,0


In [49]:
# List the column names of the training frame
data.columns

Index(['id', 'lead_time', 'arrival_week', 'duration', 'prev_cancel',
       'booking_changes', 'waiting_period', 'per_Day_price', 'parking',
       'special_request', 'segment', 'deposit', 'cust_type', 'is_cancelled'],
      dtype='object')

In [50]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

Unnamed: 0,id,lead_time,arrival_week,duration,prev_cancel,booking_changes,waiting_period,per_Day_price,parking,special_request,is_cancelled
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,1699.5,110.867,27.215333,2.187333,0.084667,0.189,3.325,104.660693,0.022333,0.547667,0.409667
std,866.169729,112.8239,13.414008,1.449692,0.392704,0.569257,20.426611,38.304952,0.14779,0.786512,0.491854
min,200.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,949.75,23.0,17.0,1.0,0.0,0.0,0.0,79.0,0.0,0.0,0.0
50%,1699.5,75.0,28.0,2.0,0.0,0.0,0.0,99.0,0.0,0.0,0.0
75%,2449.25,166.0,38.0,3.0,0.0,0.0,0.0,125.0,0.0,1.0,1.0
max,3199.0,629.0,53.0,18.0,11.0,13.0,391.0,313.67,1.0,5.0,1.0


## Visualization, Modeling, Machine Learning

Build a classification model to determine whether a customer will cancel a booking. Please explain the findings effectively to technical and non-technical audiences using comments and visualizations, if appropriate.
- **Build an optimized model that effectively solves the business problem.**
- **The model's performance will be evaluated on the basis of accuracy.**
- **Read the test.csv file and prepare features for testing.**

In [51]:
# Read the unlabelled test bookings and preview the first five rows
test_data = pd.read_csv(filepath_or_buffer='test.csv')
test_data.head(n=5)

Unnamed: 0,id,lead_time,arrival_week,duration,prev_cancel,booking_changes,waiting_period,per_Day_price,parking,special_request,segment,deposit,cust_type
0,3200,45,9,3,0,2,0,80.3,0,1,Online TA,No Deposit,Transient
1,3201,7,3,5,0,0,0,73.9,0,0,Online TA,No Deposit,Transient
2,3202,0,32,2,0,1,0,80.0,1,1,Online TA,No Deposit,Contract
3,3203,101,32,1,0,0,0,107.1,0,0,Online TA,No Deposit,Transient
4,3204,95,6,2,0,0,0,72.0,0,0,Offline TA/TO,No Deposit,Transient


In [70]:
from sklearn.preprocessing import LabelEncoder

# Convert every non-numeric (text) column to integer codes so the models can use it.
#
# Each column gets its own encoder, fitted ONCE on the values found in train and
# test together, so a given category maps to the same code in both frames.
# Calling fit_transform separately on each frame would silently assign different
# codes whenever the two frames do not contain exactly the same set of categories.
#
# Columns are selected by dtype rather than by inspecting the first value, so the
# detection does not depend on index label 0 or on the first row being non-missing.
# Re-running this cell is a no-op: once encoded, the columns are numeric.
categorical_cols = data.select_dtypes(exclude="number").columns
encoders = {}  # column name -> fitted LabelEncoder, kept so codes can be decoded later
for col in categorical_cols:
    le = LabelEncoder()
    # astype(str) turns any missing value into an ordinary label instead of
    # letting a str/float mix break the encoder's sorting step.
    train_values = data[col].astype(str)
    if col in test_data.columns:
        test_values = test_data[col].astype(str)
        le.fit(pd.concat([train_values, test_values], ignore_index=True))
        test_data[col] = le.transform(test_values)
    else:
        # Column exists only in the training frame; nothing to keep consistent.
        le.fit(train_values)
    data[col] = le.transform(train_values)
    encoders[col] = le


In [73]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Features are every column except the target; the target is the cancellation flag.
# `id` stays in X so the feature set matches the columns of `test_data`.
X = data.drop(columns="is_cancelled")
y = data["is_cancelled"]

# Hold out 5% of the rows for validation. Because the hold-out is small,
# stratify on the target so train and validation keep the same cancellation
# rate; otherwise the reported accuracy depends on sampling luck.
# NOTE: despite the names, X_test / y_test are the VALIDATION split of train.csv,
# not the rows of test.csv.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=42, stratify=y
)

In [None]:
# Baseline model: a random forest with scikit-learn's default hyper-parameters.
# random_state pins the bootstrap and feature sampling so the accuracy printed
# below is the same on every Restart & Run All.
rfc = RandomForestClassifier(random_state=42)
# Note: scikit-learn's fit() returns the fitted estimator itself (the same object
# as `rfc`), not a Keras-style training history.
history = rfc.fit(X_train, y_train)

# Score the model on the held-out validation rows
pred_rfc = rfc.predict(X_test)

accuracy_rfc = accuracy_score(y_test, pred_rfc)

print("accuracy score :", accuracy_rfc)

In [None]:
import matplotlib.pyplot as plt

# scikit-learn estimators do not record a per-epoch training history, so there is
# no `.history['acc']` to plot. The closest equivalent for a random forest is how
# accuracy changes with the number of trees, so train a forest at several sizes
# and compare accuracy on the training rows with accuracy on the validation rows.
tree_counts = [10, 25, 50, 100, 200]
train_acc = []
val_acc = []
for n_trees in tree_counts:
    forest = RandomForestClassifier(n_estimators=n_trees, random_state=42)
    forest.fit(X_train, y_train)
    train_acc.append(accuracy_score(y_train, forest.predict(X_train)))
    val_acc.append(accuracy_score(y_test, forest.predict(X_test)))

fig, ax = plt.subplots()
ax.plot(tree_counts, train_acc, marker='o', label='train')
ax.plot(tree_counts, val_acc, marker='o', label='validation')
ax.set_title('Model Accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('number of trees')
ax.legend(loc='best')
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Hyper-parameter grid: 3 * 3 * 5 * 4 = 180 combinations.
params = {'colsample_bytree':[0.4,0.5,0.6],
          'learning_rate':[0.01,0.02,0.09],
          'max_depth':[2,3,4,5,6],
          'n_estimators':[100,200,500,2000]}

xgb = XGBClassifier()

# Exhaustive search with 10-fold cross-validation: 180 * 10 = 1,800 model fits
# plus one final refit on all of X_train, so expect this cell to be slow.
# Accuracy is the metric the challenge is judged on, so it is stated explicitly.
grid = GridSearchCV(xgb, params, scoring='accuracy', cv=10, n_jobs=-1, verbose=2)
grid.fit(X_train, y_train)

# fit() returns the GridSearchCV object itself; the tuned hyper-parameters live in
# best_params_, so store those under the `xgb_params` name.
xgb_params = grid.best_params_
print("best CV accuracy :", grid.best_score_)

In [None]:
# Show the result of the hyper-parameter search run in the previous cell
xgb_params


**Describe the most important features of the model to management.**

> #### Task:
- **Visualize the top 10 features and their feature importance.**


> #### Task:
- **Submit the predictions on the test dataset using your optimized model** <br/>
    For each record in the test set (`test.csv`), you must predict whether the customer will cancel their booking or not. You should submit a CSV file with a header row and one row per test entry. 

The file (`submissions.csv`) should have exactly 2 columns:
   - **id**
   - **is_cancelled**
   

In [68]:
# Predict on exactly the columns, in exactly the order, that the model was trained
# on, so any extra or re-ordered column in test.csv cannot break the prediction.
test_pred_rfc = rfc.predict(test_data[X.columns])

# Pair every booking id with its predicted cancellation flag
listed = list(zip(test_data['id'], test_pred_rfc))
print(listed[:5])



[(3200, 0), (3201, 0), (3202, 0), (3203, 1), (3204, 0)]


In [69]:
# Assemble the (id, prediction) pairs into the two-column submission table
submissions_df = pd.DataFrame(listed, columns=['id', 'is_cancelled'])
print(submissions_df[:5])

# index=False keeps the row index out of the file; without it to_csv writes an
# extra unnamed first column and the file no longer has exactly 2 columns.
submissions_df.to_csv('submissions.csv', index=False)

     id  is_cancelled
0  3200             0
1  3201             0
2  3202             0
3  3203             1
4  3204             0
