In [None]:
# Suppressing Warnings
# NOTE(review): filterwarnings('ignore') without a category silences every
# warning for the rest of the session, including deprecation notices raised
# by the pandas / seaborn calls in later cells.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Importing necessary libraries.

import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
pd.set_option('display.max_columns', 500)

In [None]:
#Provide the full path where the csv file is located
df= pd.read_csv("hotel_bookings.csv") 
df.head()

In [None]:
print(df.shape)

In [None]:
# To describe the data
df.describe()

In [None]:
#Datatypes used
df.info()

In [None]:
# Make the default figure a square 8x8 so the pie is large enough to read
plt.rcParams['figure.figsize'] = (8, 8)

# Count the rows per hotel once; the index gives the wedge labels and the
# values give the wedge sizes.
hotel_counts = df['hotel'].value_counts()
labels = hotel_counts.index.tolist()
sizes = hotel_counts.tolist()

# Radial offset of each wedge: only the second wedge is pulled out
explode = (0, 0.1)

# Wedge colours, in the same order as the labels
colors = ['blue', 'red']

plt.pie(
    sizes,
    labels=labels,
    colors=colors,
    explode=explode,
    autopct='%1.1f%%',
    startangle=90,
    textprops={'fontsize': 14},
)
plt.show()


In [None]:
#Checking the missing values in the columns

df.isnull().sum()

In [None]:
#Column-wise null percentage

print(round(100*(df.isnull().sum()/len(df.index)),2))

In [None]:
df=df.drop(['agent','company'],axis=1)

In [None]:
df = df.dropna(axis = 0)

In [None]:
df.isnull().sum()

In [None]:
# Replace all column labels positionally with short CamelCase names. The list
# needs exactly one entry per existing column, in the existing column order.
df.columns = [
    'Hotel', 'Canceled', 'LeadTime',
    'ArrivingYear', 'ArrivingMonth', 'ArrivingWeek', 'ArrivingDate',
    'WeekendStay', 'WeekStay',
    'Adults', 'Children', 'Babies',
    'Meal', 'Country', 'Segment', 'DistChannel',
    'RepeatGuest', 'PrevCancel', 'PrevBook',
    'BookRoomType', 'AssignRoomType', 'ChangeBooking',
    'DepositType', 'WaitingDays', 'CustomerType',
    'ADR', 'ParkSpace', 'SpecialRequest',
    'Reservation', 'ReservationDate',
]

In [None]:
def var(df, max_uniques=12, force_categorical=('Country', 'Agent')):
    """Split the columns of ``df`` into categorical and continuous sets.

    A column is categorical when it has at most ``max_uniques`` distinct
    values (a missing value counts as one distinct value) or when its name is
    listed in ``force_categorical``. Every other column is continuous.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified. A frame without columns yields
        two empty sets.
    max_uniques : int, default 12
        Largest number of distinct values a column may have and still be
        treated as categorical.
    force_categorical : iterable of str, default ('Country', 'Agent')
        Column names that are always categorical, whatever their cardinality.
        Names that are not columns of ``df`` are ignored.

    Returns
    -------
    tuple of (set, set)
        ``(cat_var, con_var)``: the categorical and continuous column names.
    """
    forced = set(force_categorical)

    # dropna=False keeps NaN as its own value, matching len(Series.unique()).
    cat_var = {
        column
        for column in df.columns
        if column in forced or df[column].nunique(dropna=False) <= max_uniques
    }
    con_var = set(df.columns) - cat_var

    return cat_var, con_var


# Classify the columns, then print both groups together with their sizes
cat_var, con_var = var(df)

print(
    "Continuous Variables (", len(con_var), ")\n", con_var,
    '\n\nCategorical Variables(', len(cat_var), ")\n", cat_var,
)

In [None]:
# Creating a boxplot for Outlier detection: one panel per feature
features = ['LeadTime', 'WeekendStay', 'WeekStay', 'Adults','Children', 'Babies','ADR']
plt.figure(figsize=(16,18))
for n, feature in enumerate(features, start=1):
    plt.subplot(4,4,n)
    # Plot the single column for this panel. The previous code passed the
    # whole `features` list, so every panel showed the same data. The data is
    # passed by keyword because newer seaborn releases no longer read the
    # first positional argument as `x`.
    sns.boxplot(x=df[feature])
# Laying out once after all panels exist gives the same final figure
plt.tight_layout()

In [None]:
#Checking outliers in continuous variables
# con_var is a set. pandas >= 2.0 rejects a set as a column indexer, so it is
# converted to a sorted list, which also gives a stable column order.
df[sorted(con_var)].describe()

In [None]:
# Cap extreme values in place: each line overwrites the rows above the
# threshold with the value on the right-hand side. WaitingDays, PrevBook and
# PrevCancel are reduced to 1 for any positive value.
df.loc[df.LeadTime      > 500,'LeadTime'     ]=500
df.loc[df.WaitingDays   >   0,'WaitingDays'  ]=  1
df.loc[df.WeekendStay   >=  5,'WeekendStay'  ]=  5
df.loc[df.Adults        >   4,'Adults'       ]=  4
df.loc[df.PrevBook      >   0,'PrevBook'     ]=  1
df.loc[df.PrevCancel    >   0,'PrevCancel'   ]=  1
df.loc[df.WeekStay      >  10,'WeekStay'     ]= 10
df.loc[df.ChangeBooking >   5,'ChangeBooking']=  5

# PrevBook and PrevCancel are added to the categorical group here, and the
# continuous group is rebuilt as every remaining column.
cat_var = set(cat_var) | {'PrevBook', 'PrevCancel'}
con_var = set(df.columns) - cat_var

# A set is not a valid column indexer on pandas >= 2.0; use a sorted list.
df[sorted(con_var)].describe()

In [None]:
#outliers in categorical variables
# cat_var is a set. pandas >= 2.0 rejects a set as a column indexer, so it is
# converted to a sorted list, which also gives a stable column order.
df[sorted(cat_var)].describe()

In [None]:
# Counts above these thresholds are reset to 0
# NOTE(review): presumably they are treated as data-entry errors; confirm.
df.loc[df.Babies    > 8,'Babies']    = 0
df.loc[df.ParkSpace > 5,'ParkSpace'] = 0
df.loc[df.Children  > 8,'Children']  = 0

# A set is not a valid column indexer on pandas >= 2.0; use a sorted list.
df[sorted(cat_var)].describe()

In [None]:
# Kids = children + babies
df['Kids'] = df['Children'] + df['Babies']

# Total members of the booking = kids + adults
df['total_members'] = df['Kids'] + df['Adults']

In [None]:
# Cast the three arrival-date parts, plus Canceled and RepeatGuest, to strings
for column in ('ArrivingYear', 'ArrivingMonth', 'ArrivingDate', 'Canceled', 'RepeatGuest'):
    df[column] = df[column].astype('str')


In [None]:
# Build a "<date>-<month>-<year>" string per row and parse it as a datetime.
# errors='coerce' turns any string that cannot be parsed into NaT.
df['Arrival Date'] = pd.to_datetime(
    df['ArrivingDate'] + '-' + df['ArrivingMonth'] + '-' + df['ArrivingYear'],
    errors='coerce',
)

In [None]:
confirmed_bookings = df[df.Canceled=='0']

In [None]:
import datetime as dt
confirmed_bookings['ArrivingMonth'] = df['Arrival Date'].dt.month
final=confirmed_bookings['ArrivingMonth'].value_counts().sort_index()
final

In [None]:
# Absolute counts of each Canceled value, then the same as proportions
print('Total Bookings canceled', '-'*50, sep='\n')
print(df['Canceled'].value_counts())
print('-'*50, '*'*75, 'Cancelation percentage in both hotels ', '-'*50, sep='\n')
print(df['Canceled'].value_counts(normalize=True))


In [None]:
df.Country.value_counts(normalize=True)

In [None]:
df.ArrivingMonth.value_counts(normalize=True)

In [None]:
df.Segment.value_counts(normalize=True)

In [None]:
df.ArrivingYear.value_counts(normalize=True)

In [None]:
df.Meal.value_counts(normalize=True)

In [None]:
df.CustomerType.value_counts(normalize=True)

In [None]:
df.Reservation.value_counts(normalize=True)

In [None]:
# LeadTime per arrival year, with one bar per Canceled value
plt.figure(figsize=(12, 6))
sns.barplot(data=df, x='ArrivingYear', y='LeadTime', hue='Canceled', palette='vlag')
plt.title('Arriving year, Leadtime and Cancelations')

In [None]:
#Lets see the correlation
plt.figure(figsize=(12,8))
# Several columns hold strings by this point, and DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0, so only numeric columns are correlated.
sns.heatmap(df.select_dtypes(include='number').corr(),annot=True,cmap='RdYlGn')

In [None]:
#Canceled=1, Not canceled= 0
canceled_data = df['Canceled']
# Pass the series as `x` by keyword: seaborn >= 0.12 no longer reads the
# first positional argument as `x`.
sns.countplot(x=canceled_data, palette='husl')

plt.show()

In [None]:
# Pie chart of the Canceled value counts, with a percentage label per wedge
cols = ['gold', 'lightcoral']
df['Canceled'].value_counts().plot.pie(
    colors=cols,
    autopct='%1.1f%%',
    shadow=True,
)

In [None]:
# Booking counts per hotel, split by the Canceled value
plt.figure(figsize=(12, 6))
sns.countplot(data=df, x='Hotel', hue="Canceled", palette='Pastel1').set_title(
    "Cancelation rates in City hotel and Resort hotel", fontweight="bold", size=20
)
plt.show()

In [None]:
plt.figure(figsize=(15, 8))

# Left panel: counts per arrival year, split by hotel
sns.countplot(
    data=df, x='ArrivingYear', hue='Hotel', palette='husl', ax=plt.subplot(1, 2, 1)
).set_title("Arrivals per year in Both hotels ", fontweight="bold", size=30)

# Right panel: counts per arrival month
sns.countplot(
    data=df, x='ArrivingMonth', ax=plt.subplot(1, 2, 2)
).set_title('Arrivals per month', fontweight="bold", size=30)

# Widen the subplot area beyond the nominal figure width
plt.subplots_adjust(right=1.7)
plt.show()

In [None]:
# Counts per ArrivingDate value, split by hotel
plt.figure(figsize=(15, 6))
sns.countplot(x='ArrivingDate', hue='Hotel', palette='Paired', data=df)
plt.show()

In [None]:
plt.figure(figsize=(15, 8))

# Left panel: WeekendStay counts split by hotel
sns.countplot(
    data=df, x='WeekendStay', hue='Hotel', palette='cool', ax=plt.subplot(1, 2, 1)
).set_title("Number of stays on weekend nights", fontweight="bold", size=20)

# Right panel: WeekendStay counts split by the Canceled value
sns.countplot(
    data=df, x='WeekendStay', hue='Canceled', palette='rocket', ax=plt.subplot(1, 2, 2)
).set_title('WeekendStay vs Cancelation', fontweight="bold", size=20)

# Widen the subplot area beyond the nominal figure width
plt.subplots_adjust(right=1.7)
plt.show()

In [None]:
plt.figure(figsize=(15, 8))

# Left panel: WeekStay counts split by hotel
sns.countplot(
    data=df, x='WeekStay', hue='Hotel', palette='rainbow_r', ax=plt.subplot(1, 2, 1)
).set_title("Number of stays on weekday nights", fontweight="bold", size=20)

# Right panel: WeekStay counts split by the Canceled value
sns.countplot(
    data=df, x='WeekStay', hue='Canceled', palette='magma_r', ax=plt.subplot(1, 2, 2)
).set_title('WeekStay vs Cancelations', fontweight="bold", size=20)

# Widen the subplot area beyond the nominal figure width
plt.subplots_adjust(right=1.7)
plt.show()

In [None]:
plt.figure(figsize=(15, 8))

# Left panel: Adults counts split by hotel
sns.countplot(
    data=df, x='Adults', hue='Hotel', palette='pastel', ax=plt.subplot(1, 2, 1)
).set_title("Number of adults in both hotels", fontweight="bold", size=20)

# Right panel: Adults counts split by the Canceled value
sns.countplot(
    data=df, x='Adults', hue='Canceled', palette='husl', ax=plt.subplot(1, 2, 2)
).set_title('Adults vs Cancelations', fontweight="bold", size=20)

# Widen the subplot area beyond the nominal figure width
plt.subplots_adjust(right=1.7)
plt.show()

In [None]:
plt.figure(figsize=(15, 8))

# Left panel: Children counts split by hotel
sns.countplot(
    data=df, x='Children', hue='Hotel', palette='cool', ax=plt.subplot(1, 2, 1)
).set_title("Number of Children in both hotels", fontweight="bold", size=20)

# Right panel: Children counts split by the Canceled value
sns.countplot(
    data=df, x='Children', hue='Canceled', palette='Set2', ax=plt.subplot(1, 2, 2)
).set_title('Children vs Cancelations', fontweight="bold", size=20)

# Widen the subplot area beyond the nominal figure width
plt.subplots_adjust(right=1.7)
plt.show()

In [None]:
plt.figure(figsize=(15, 8))

# Left panel: Babies counts split by hotel
sns.countplot(
    data=df, x='Babies', hue='Hotel', palette='coolwarm', ax=plt.subplot(1, 2, 1)
).set_title("Number of Babies in both hotels", fontweight="bold", size=20)

# Right panel: Babies counts split by the Canceled value
sns.countplot(
    data=df, x='Babies', hue='Canceled', palette='Set1_r', ax=plt.subplot(1, 2, 2)
).set_title('Babies vs Cancelations', fontweight="bold", size=20)

# Widen the subplot area beyond the nominal figure width
plt.subplots_adjust(right=1.7)
plt.show()

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.colors import n_colors
from plotly.subplots import make_subplots
# Minmax scaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Number of rows per country among bookings whose Canceled flag is '0'
country_visitors = (
    df.loc[df['Canceled'] == '0']
    .groupby(['Country'])
    .size()
    .reset_index(name='count')
)

import plotly.express as px

# World map coloured by the per-country count
px.choropleth(
    country_visitors,
    locations="Country",
    color="count",
    hover_name="Country",  # column to add to hover information
    color_continuous_scale="Viridis",
    title="Home country of visitors",
)

In [None]:
plt.figure(figsize=(15, 8))

# Left panel: counts per Segment value
sns.countplot(
    data=df, x='Segment', palette='rocket', ax=plt.subplot(1, 2, 1)
).set_title('Types of market segment', fontweight="bold", size=20)

# Right panel: counts per DistChannel value
sns.countplot(
    data=df, x='DistChannel', palette='Set1_r', ax=plt.subplot(1, 2, 2)
).set_title('Types of distribution channels', fontweight="bold", size=20)

# Widen the subplot area beyond the nominal figure width
plt.subplots_adjust(right=1.7)
plt.show()

In [None]:
# Counts per DepositType value, split by hotel
plt.figure(figsize=(12, 6))
sns.countplot(data=df, x='DepositType', hue='Hotel', palette='cool').set_title(
    'Types of Deposit type', fontweight="bold", size=20
)
plt.show()

In [None]:
# Counts per RepeatGuest value
plt.figure(figsize=(8, 6))
sns.countplot(x='RepeatGuest', data=df)
plt.title('Graph showing whether guest is repeated guest', fontsize=20)
plt.show()

In [None]:
# Resizing plot
plt.figure(figsize=(12,5))

# Calculating average daily rate per person.
# A booking with zero adults and zero children has no per-person rate, so the
# zero divisor is replaced with NaN rather than dividing by zero (which gives
# inf).
df['adr_pp'] = df['ADR'] / (df['Adults'] + df['Children']).replace(0, np.nan)

# .copy() makes actual_guests an independent frame, so adding 'price' below
# does not write to a slice of df (SettingWithCopyWarning).
actual_guests = df.loc[df["Canceled"] == '0'].copy()

# Price of a stay = daily rate * total nights (weekend + week nights)
actual_guests['price'] = actual_guests['ADR'] * (actual_guests['WeekendStay'] + actual_guests['WeekStay'])

# NOTE(review): ArrivingMonth is a string column in df by this point, so the
# x-axis order is probably not calendar order; confirm.
sns.lineplot(data = actual_guests, x = 'ArrivingMonth', y = 'price', hue = 'Hotel')
plt.show()

In [None]:
# ADR per ArrivingMonth value, with one line per hotel
plt.figure(figsize=(12, 6))
sns.lineplot(data=df, x='ArrivingMonth', y='ADR', hue='Hotel')
plt.show()