In [None]:
import matplotlib.pyplot
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

In [None]:


# Read the hotel-bookings CSV directly from its GitHub raw URL.
DATA_URL = 'https://raw.githubusercontent.com/Sid-darthvader/DoWhy-The-Causal-Story-Behind-Hotel-Booking-Cancellations/master/hotel_bookings.csv'
dataset = pd.read_csv(DATA_URL)


In [None]:
# Preview the first five rows of the loaded frame.
dataset.head(n=5)

In [None]:
# Print column names, non-null counts and dtypes of the freshly loaded frame.
dataset.info()

In [None]:
# Fraction of missing values per column (mean of the boolean NA mask).
dataset.isna().mean()

# Data Cleaning

# 1)Dropping column
Almost 95% of the values in the `company` column are missing, so we drop that column.

In [None]:
# Remove the mostly-empty `company` column; the result is rebound to `dataset`.
dataset = dataset.drop(columns=['company'])

# 2)Arbitrary-value imputation (missing `agent` values are filled with 0)

In [None]:
# Histogram of `agent` before imputation.
# `sns.distplot` is deprecated; `sns.histplot` is its histogram-only replacement
# (no KDE is drawn by default, matching the original `kde=False`).
# NOTE(review): the axis label says "fees" — confirm that `agent` really holds
# fees and not an agency identifier.
ax = sns.histplot(x=dataset['agent'], bins=50, color='black', alpha=1, linewidth=3)
ax.set_xlabel('agent fees');

In [None]:
# Fill every missing `agent` entry with the constant 0.
dataset['agent'] = dataset['agent'].fillna(0)

In [None]:
# Histogram of `agent` after the missing values were replaced with 0.
# `sns.distplot` is deprecated; `sns.histplot` is its histogram-only replacement
# (no KDE is drawn by default, matching the original `kde=False`).
# NOTE(review): the axis label says "fees" — confirm that `agent` really holds
# fees and not an agency identifier.
ax = sns.histplot(x=dataset['agent'], bins=50, color='black', alpha=1, linewidth=3)
ax.set_xlabel('agent fees');


## 3)Dropping null values
The `children` and `country` columns have very few missing values, so we drop the rows that contain those missing values.

In [None]:
# Drop, in place, every row that still contains at least one missing value.
dataset.dropna(axis=0, how='any', inplace=True)

In [None]:
# Persist the cleaned frame (without the index column) next to the notebook.
dataset.to_csv(path_or_buf="data1.csv", index=False)

In [None]:
# Count of missing values per column after cleaning.
dataset.isna().sum()

In [None]:
# Party size per booking: adults + babies + children, added element-wise.
dataset['total_guests'] = dataset['adults'].add(dataset['babies']).add(dataset['children'])

In [None]:
# The three component columns are now summarised by `total_guests`; remove them in place.
dataset.drop(columns=['adults', 'babies', 'children'], inplace=True)

In [None]:
# Length of stay: week nights plus weekend nights, then drop the two source columns in place.
night_columns = ['stays_in_week_nights', 'stays_in_weekend_nights']
dataset['total_days'] = dataset[night_columns[0]].add(dataset[night_columns[1]])
dataset.drop(columns=night_columns, inplace=True)

In [None]:
# Turn three numeric columns into readable labels for the EDA plots:
# a value of 0 gets the first label, anything else gets the second.
eda_labels = {
    'is_canceled': ('Booking not canceled', 'Booking canceled'),
    'is_repeated_guest': ("First time guest", "Repeated Guest"),
    'previous_cancellations': ("No previous cancellation", "Previously Cancelled"),
}
for column, (zero_label, nonzero_label) in eda_labels.items():
    dataset[column] = np.where(dataset[column] == 0, zero_label, nonzero_label)

In [None]:
# 1 when the assigned room type differs from the reserved one, 0 when they match;
# the two source columns are then removed in place.
room_mismatch = dataset["reserved_room_type"] != dataset["assigned_room_type"]
dataset["different_room_assigned"] = room_mismatch.astype(int)
dataset.drop(columns=['assigned_room_type', "reserved_room_type"], inplace=True)

# EDA

In [None]:
# Bookings per hotel type, split by cancellation status.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a positional Series is no longer read as `x`.
sns.countplot(data=dataset, x='hotel', hue='is_canceled', palette='mako')
plt.savefig('Type_of_hotel-is_canceled_1.png')

##### We can see that bookings at City hotels have a higher chance of being cancelled than bookings at Resort hotels


In [None]:
# Print column names, non-null counts and dtypes again, now that columns were dropped and derived.
dataset.info()

In [None]:
# Bookings with / without a previous cancellation, split by cancellation status.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a positional Series is no longer read as `x`.
sns.countplot(data=dataset, x='previous_cancellations', hue='is_canceled', palette="Set1")
plt.savefig("prev_cancellation_is_cancelled_2.png")

##### We can see that when a person has previously cancelled then there is a high chance that they will cancel again

In [None]:
# Summary statistics for the frame's columns at this stage of the notebook.
dataset.describe()

In [None]:
# First-time vs repeated guests, split by cancellation status.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a positional Series is no longer read as `x`.
sns.countplot(data=dataset, x='is_repeated_guest', hue='is_canceled', palette='rocket')
plt.savefig("repeated_guest_is_cancelled_3.png")

##### It is obvious that when a person is a repeated guest he likes the hotel/resort and hence there is less chance of him or her cancelling the booking

In [None]:

# Histogram of party size, restricted to bookings with fewer than 10 guests.
# `sns.distplot` is deprecated; `sns.histplot` is its histogram-only replacement
# (no KDE is drawn by default, matching the original `kde=False`).
small_parties = dataset.loc[dataset['total_guests'] < 10, 'total_guests']
sns.histplot(x=small_parties, bins=7, color='black', alpha=1, linewidth=3)
plt.savefig("histogram_total_guests_4.png")

##### The most common number of guests per booking is two

In [None]:
# Bookings per deposit type, split by cancellation status.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a positional Series is no longer read as `x`.
sns.countplot(data=dataset, x='deposit_type', hue='is_canceled', palette='Set1')
plt.savefig("deposit_type_is_canceled_5.png")

##### This visualisation is pretty interesting: when the deposit type is non-refundable, the booking is cancelled most of the time. Intuitively, one would expect a non-refundable booking to be less likely to be cancelled

In [None]:
# Split the columns into numerical and categorical features, then draw the
# correlation heat map of the numerical ones.
#
# Numeric columns are selected with `select_dtypes(include='number')` instead of
# "anything that is not object": a category, string-dtype or datetime column is
# not object either, and would otherwise end up in the `.corr()` call.
numerical_features = dataset.select_dtypes(include='number').columns.tolist()
categorical_features = [col for col in dataset.columns if col not in numerical_features]
print(categorical_features)

# seaborn is already imported in the first cell, so it is not re-imported here.
plt.figure(figsize=(12,7))
sns.heatmap(dataset[numerical_features].corr(),linewidths=2,linecolor='black',annot=True)
plt.savefig("heatmap_6.png")

##### Shows correlation heat map of all the numerical features

In [None]:
# Split the bookings by cancellation label and count rows per arrival month.
# `.count()` yields one non-null count per remaining column for every month.
kept_mask = dataset["is_canceled"] == 'Booking not canceled'
canceled_mask = dataset["is_canceled"] == 'Booking canceled'

not_canceled = dataset[kept_mask]
canceled = dataset[canceled_mask]

nc_month = not_canceled.groupby('arrival_date_month').count()
c_month = canceled.groupby('arrival_date_month').count()

In [None]:
# Display the per-month counts computed from the canceled bookings.
c_month

In [None]:

# Side-by-side bar charts of bookings per arrival month: canceled on the left,
# not canceled on the right, sharing both axes so the bars are comparable.
# `x` and `y` are passed by keyword: recent seaborn releases no longer accept
# them positionally in `barplot`.
f, axes = plt.subplots(1, 2, figsize=(15, 8), sharex=True, sharey=True)
panels = [(c_month, "Booking canceled"), (nc_month, "Booking not canceled")]
for ax, (monthly_counts, title) in zip(axes, panels):
    sns.barplot(x=monthly_counts.index, y=monthly_counts['is_canceled'], ax=ax, color='darkblue')
    ax.tick_params(axis='x', rotation=45)
    ax.set_title(title)
plt.savefig("is_cancelled_Acctomonth_7.png")

##### The month of August has the most hotel cancellations and the month of January has the least

In [None]:
# Bookings per arrival year, split by cancellation status, on a 10x6 inch figure.
year_fig, year_ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=dataset, x='arrival_date_year', hue='is_canceled', palette='Set1', ax=year_ax)
year_fig.savefig("is_cancelled_year_8.png")

# Model

In [None]:
# Undo the EDA labels: the "zero" label of each column maps back to 0 and
# every other value maps to 1.
zero_labels = {
    'is_canceled': 'Booking not canceled',
    'is_repeated_guest': "First time guest",
    'previous_cancellations': "No previous cancellation",
}
for column, zero_label in zero_labels.items():
    dataset[column] = np.where(dataset[column] == zero_label, 0, 1)

In [None]:
# Convert the treatment and outcome columns from 0/1 integers to booleans.
# `astype(bool)` is used instead of chained `.replace(1, True)` / `.replace(0, False)`:
# replacing ints with bools leaves a dtype that depends on the pandas version
# (object vs bool), whereas `astype` always yields a real bool column.
# Both columns hold only 0 and 1 here, so the mapping is identical.
for flag_column in ('different_room_assigned', 'is_canceled'):
    dataset[flag_column] = dataset[flag_column].astype(bool)

In [None]:
import dowhy

In [None]:
from graphviz import Source

In [None]:
from IPython.display import Image, display

In [None]:
from dowhy import CausalModel

In [None]:
from causalgraphicalmodels import CausalGraphicalModel

# Illustrative drawing of the assumed causal structure around `is_canceled`.
# The node for previous cancellations is named `previous_cancellations` so that
# it matches the dataset column of the same name (it was spelled without the
# trailing "s" before).
graph = CausalGraphicalModel(
    nodes=[
        'is_canceled', 'lead_time', "unobserved_confounder", 'total_of_special_requests',
        'meal', 'country', 'market_segment',
        'is_repeated_guest',
        'previous_bookings_not_canceled', 'booking_changes', 'previous_cancellations',
        'required_car_parking_spaces',
        'days_in_waiting_list',
        'total_guests',
        'total_days', 'different_room_assigned', 'agent',
    ],
    edges=[
        ("market_segment", "lead_time"),
        ("lead_time", "is_canceled"),
        ("country", "lead_time"),
        ("different_room_assigned", "is_canceled"),
        ("unobserved_confounder", "is_canceled"),
        ("unobserved_confounder", "lead_time"),
        ("unobserved_confounder", "different_room_assigned"),
        ("country", "meal"),
        ("lead_time", 'days_in_waiting_list'),
        ('days_in_waiting_list', "is_canceled"),
        ('previous_bookings_not_canceled', 'is_canceled'),
        ('previous_bookings_not_canceled', 'is_repeated_guest'),
        ('is_repeated_guest', 'is_canceled'),
        ('total_days', "is_canceled"),
        ('total_days', "agent"),
        ('total_guests', 'is_canceled'),
        ('previous_cancellations', 'is_canceled'),
        ('previous_cancellations', 'is_repeated_guest'),
        ('required_car_parking_spaces', 'is_canceled'),
        ('total_guests', 'required_car_parking_spaces'),
        ('total_days', 'required_car_parking_spaces'),
        ('total_of_special_requests', 'is_canceled'),
        ('booking_changes', 'different_room_assigned'),
        ('booking_changes', 'is_canceled'),
    ]
)
# `draw()` returns the graph object that the next cell renders to disk.
G = graph.draw()
G

In [None]:
# Write the drawn graph to disk as a PNG under test-output/.
G.render(filename='test-output/causal', format='png')

In [None]:
# DOT-format description of the assumed causal graph; a later cell passes it
# (with newlines replaced by spaces) to dowhy.CausalModel as `graph`.
# Node `U` is labelled "Unobserved Confounders".
# NOTE(review): this graph differs from the CausalGraphicalModel drawing earlier
# in the notebook — it has no node for previous cancellations, and U points at
# required_car_parking_spaces rather than lead_time. Confirm which structure is intended.
causal_graph = """digraph {
different_room_assigned[label="Different Room Assigned"];
is_canceled[label="Booking Cancelled"];
booking_changes[label="Booking Changes"];
previous_bookings_not_canceled[label="Previous Booking Retentions"];
days_in_waiting_list[label="Days in Waitlist"];
lead_time[label="Lead Time"];
market_segment[label="Market Segment"];
country[label="Country"];
U[label="Unobserved Confounders"];
is_repeated_guest;
agent;
total_days;
total_guests;
meal;
hotel;
U->different_room_assigned; U->is_canceled;U->required_car_parking_spaces;
market_segment -> lead_time;
lead_time->is_canceled; country -> lead_time;
different_room_assigned -> is_canceled;
country->meal;
lead_time -> days_in_waiting_list;
days_in_waiting_list ->is_canceled;
previous_bookings_not_canceled -> is_canceled;
previous_bookings_not_canceled -> is_repeated_guest;
is_repeated_guest -> is_canceled;
total_days -> is_canceled;
total_days->agent;
total_guests -> is_canceled;
booking_changes -> different_room_assigned; booking_changes -> is_canceled; 
hotel -> is_canceled;
required_car_parking_spaces -> is_canceled;
total_of_special_requests -> is_canceled;
country->{hotel, required_car_parking_spaces,total_of_special_requests,is_canceled};
market_segment->{hotel, required_car_parking_spaces,total_of_special_requests,is_canceled};
}"""

In [None]:
import statsmodels

# Newlines in the DOT text are replaced by spaces before it is handed to DoWhy.
single_line_graph = causal_graph.replace("\n", " ")

# Causal model: effect of being assigned a different room on cancellation.
model = dowhy.CausalModel(
    data=dataset,
    graph=single_line_graph,
    treatment="different_room_assigned",
    outcome='is_canceled',
)

# Identify the causal effect, continuing even if it is flagged as unidentifiable.
identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
print(identified_estimand)

In [None]:
# How many bookings fall into each value of the treatment column.
treatment_column = dataset['different_room_assigned']
treatment_column.value_counts()

In [None]:

# Estimate the identified effect with propensity-score stratification.
# Target-unit options:
#   ATE = Average Treatment Effect
#   ATT = Average Treatment Effect on Treated (i.e. those who were assigned a different room)
#   ATC = Average Treatment Effect on Control (i.e. those who were not assigned a different room)
estimation_method = "backdoor.propensity_score_stratification"
estimate = model.estimate_effect(
    identified_estimand,
    method_name=estimation_method,
    target_units="ate",
)
print(estimate)