In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [4]:
# Read the raw hotel bookings CSV into a DataFrame and preview its first five rows.
df_hotels = pd.read_csv(filepath_or_buffer='hotel_bookings.csv')
df_hotels.head(n=5)


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [None]:
# --- Missing values: null count per column, keeping only columns that have at least one ---
nan_counts = df_hotels.isna().sum().to_frame(name='Count')
nan_counts = nan_counts.loc[nan_counts['Count'].gt(0)]
display(nan_counts)

# --- Data type of every column, shown as a one-column table ---
dtypes_df = df_hotels.dtypes.to_frame(name='Data Type')
display(dtypes_df)

# --- Bookings where both 'country' and 'agent' are missing (first 10 rows shown) ---
missing_data = df_hotels.loc[df_hotels['country'].isna() & df_hotels['agent'].isna()]
display(
    missing_data.loc[
        :,
        ['hotel', 'is_canceled', 'country', 'agent', 'reservation_status', 'reservation_status_date'],
    ].head(10)
)

# Observations:
# - Several columns contain missing values: 'country', 'agent' and 'company'
#   (the count table also lists a handful in 'children').
# - 'country' has dtype object (string), while 'agent' and 'company' are float64.
# - 'reservation_status_date' is stored as object; it should be a datetime.
# - 'agent' and 'company' have many missing values, probably because not every
#   reservation is made through an agent or a company.

Unnamed: 0,Count
children,4
country,488
agent,16340
company,112593


Unnamed: 0,Data Type
hotel,object
is_canceled,int64
lead_time,int64
arrival_date_year,int64
arrival_date_month,object
arrival_date_week_number,int64
arrival_date_day_of_month,int64
stays_in_weekend_nights,int64
stays_in_week_nights,int64
adults,int64


Unnamed: 0,hotel,is_canceled,country,agent,reservation_status,reservation_status_date
30,Resort Hotel,0,,,Check-Out,2015-07-15
4127,Resort Hotel,1,,,Canceled,2016-02-15
7092,Resort Hotel,1,,,Canceled,2016-07-20
7860,Resort Hotel,1,,,Canceled,2016-07-22
8779,Resort Hotel,1,,,Canceled,2016-10-13
9376,Resort Hotel,1,,,Canceled,2016-11-21
9610,Resort Hotel,1,,,Canceled,2016-12-19
9612,Resort Hotel,1,,,Canceled,2016-12-19
13804,Resort Hotel,0,,,Check-Out,2016-01-22
13805,Resort Hotel,1,,,Canceled,2016-10-10


In [7]:
# Odd outlier: some bookings have zero in the 'adults' column.
adults_zero = df_hotels.loc[df_hotels['adults'].eq(0)]
display(
    adults_zero.loc[:, ['hotel', 'adults', 'children', 'babies', 'reservation_status']]
)

Unnamed: 0,hotel,adults,children,babies,reservation_status
2224,Resort Hotel,0,0.0,0,Check-Out
2409,Resort Hotel,0,0.0,0,Check-Out
3181,Resort Hotel,0,0.0,0,Check-Out
3684,Resort Hotel,0,0.0,0,Check-Out
3708,Resort Hotel,0,0.0,0,Check-Out
...,...,...,...,...,...
117204,City Hotel,0,2.0,0,Check-Out
117274,City Hotel,0,2.0,0,Check-Out
117303,City Hotel,0,2.0,0,Check-Out
117453,City Hotel,0,2.0,0,Check-Out


In [None]:
# Bookings whose average daily rate (ADR) is below zero.
negative_adr = df_hotels.loc[df_hotels['adr'].lt(0)]
display(
    negative_adr.loc[:, ['hotel', 'adr', 'reservation_status']]
)

# ADR should be >= 0: revenue cannot be negative except for a refund or a very
# large discount, yet the reservation_status of this row is Check-Out.

Unnamed: 0,hotel,adr,reservation_status
14969,Resort Hotel,-6.38,Check-Out


In [None]:
# Bookings with an ADR above 600, listing the ten highest first.
high_adr = df_hotels.loc[df_hotels['adr'].gt(600)]
display(
    high_adr.loc[:, ['hotel', 'adr', 'reservation_status']]
    .sort_values(by='adr', ascending=False)
    .head(10)
)

# Extreme outlier (5400): needs checking whether it is an input error or a special
# case, because ADR elsewhere in the data tops out at about 600 while this one
# reaches 5400.

Unnamed: 0,hotel,adr,reservation_status
48515,City Hotel,5400.0,Canceled


In [None]:
# Bookings with more than 30 week nights, listing the ten longest stays first.
long_weekday = df_hotels.loc[df_hotels['stays_in_week_nights'].gt(30)]
display(
    long_weekday.loc[
        :,
        ['hotel', 'stays_in_week_nights', 'stays_in_weekend_nights', 'reservation_status'],
    ]
    .sort_values(by='stays_in_week_nights', ascending=False)
    .head(10)
)

# Extreme outliers: more than 30 week nights together with 13 or more weekend
# nights. Needs checking whether these are input errors or special cases, since
# most stays in the data are under 10 nights, which already makes these anomalous.

Unnamed: 0,hotel,stays_in_week_nights,stays_in_weekend_nights,reservation_status
14038,Resort Hotel,50,19,Check-Out
14037,Resort Hotel,42,18,Check-Out
101794,City Hotel,41,16,Check-Out
33924,Resort Hotel,40,16,Check-Out
9839,Resort Hotel,40,16,Canceled
88017,City Hotel,35,14,Check-Out
54704,City Hotel,34,14,Check-Out
1655,Resort Hotel,33,13,Check-Out
32589,Resort Hotel,32,13,Check-Out
