https://www.kaggle.com/datasets/bohnacker/country-longitude-latitude?resource=download


In [43]:
import json

import numpy as np
import pandas as pd

In [44]:
# Minimum number of bookings a country needs to be kept in the analysis.
MIN_BOOKS_THRESHOLD: int = 500

In [45]:
# Read the hotel bookings table from the local data folder.
file_path = './data/hotel_bookings.csv'
bookings_df = pd.read_csv(filepath_or_buffer=file_path)

# List the available columns, then preview the first rows as the cell output.
print(bookings_df.columns)

bookings_df.head(n=5)

Index(['Unnamed: 0', 'hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date', 'dia', 'tipo'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,...,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,dia,tipo
0,3,Resort Hotel,0,7,2015,July,27,1,0,1,...,,0,Transient,75.0,0,0,Check-Out,2015-07-02,2015-07-01,work
1,4,Resort Hotel,0,13,2015,July,27,1,0,1,...,,0,Transient,75.0,0,0,Check-Out,2015-07-02,2015-07-01,work
2,5,Resort Hotel,0,14,2015,July,27,1,0,2,...,,0,Transient,98.0,0,1,Check-Out,2015-07-03,2015-07-01,work
3,6,Resort Hotel,0,14,2015,July,27,1,0,2,...,,0,Transient,98.0,0,1,Check-Out,2015-07-03,2015-07-01,work
4,7,Resort Hotel,0,0,2015,July,27,1,0,2,...,,0,Transient,107.0,0,0,Check-Out,2015-07-03,2015-07-01,work


In [46]:
# Count bookings per country (non-null `is_canceled` entries) and keep only
# the countries that reach MIN_BOOKS_THRESHOLD.
grouped_countries = bookings_df.groupby(['country']).agg(
    total_books = ('is_canceled', 'count')
).reset_index()

grouped_countries = grouped_countries[grouped_countries['total_books'] >= MIN_BOOKS_THRESHOLD]

relevant_countries = grouped_countries['country']
num_books_before_clean = bookings_df.shape[0]

# Take an explicit copy of the filtered rows: a later cell adds a
# `lead_months` column to `bookings_df`, and assigning into a boolean-mask
# slice is the pattern that triggers pandas' SettingWithCopyWarning.
bookings_df = bookings_df[bookings_df['country'].isin(relevant_countries)].copy()

num_books_after_clean = bookings_df.shape[0]

# Report how many rows were dropped by the country filter.
print(f'A total {num_books_before_clean-num_books_after_clean} of books have been cleaned')

A total 6812 of books have been cleaned


In [47]:
# Load the country reference table; the 'Country' and 'ISO-ALPHA-3' columns
# are the ones used below.
file_path = './data/countries.csv'
countries_df = pd.read_csv(file_path)

# ISO-ALPHA-3 codes that appear on more than one row of the reference table.
# value_counts() is computed once and reused for both the values and the mask.
iso_counts = countries_df['ISO-ALPHA-3'].value_counts()
repeated = iso_counts[iso_counts > 1].index

# Only duplicated codes that actually occur in the bookings matter. The set of
# booked codes is loop-invariant, so it is built once outside the loop.
booked_codes = set(bookings_df['country'].unique())

for country in repeated.to_list():
    if country in booked_codes:
        # Show the competing names for this code so one can be chosen by hand.
        print(countries_df[countries_df['ISO-ALPHA-3'] == country][['Country', 'ISO-ALPHA-3']])

# Alternative names dropped so that each duplicated code printed above is left
# with a single 'Country' entry.
unused_countries = ["Russian Federation", "China, People's Republic of", "United States of America", 
                    "United Kingdom of Great Britain and Northern Ireland", "West Germany"]

countries_df = countries_df[~countries_df['Country'].isin(unused_countries)]

                        Country ISO-ALPHA-3
53                        China         CHN
54  China, People's Republic of         CHN
                Country ISO-ALPHA-3
233  Russian Federation         RUS
234              Russia         RUS
                                               Country ISO-ALPHA-3
107                                     United Kingdom         GBR
108  United Kingdom of Great Britain and Northern I...         GBR
                      Country ISO-ALPHA-3
286  United States of America         USA
287             United States         USA
         Country ISO-ALPHA-3
78       Germany         DEU
79  West Germany         DEU


In [48]:
# Two-way lookup between the country codes used in the bookings and the
# country names from the reference table.
country_to_iso = {}
iso_to_country = {}


for booking_code in bookings_df['country'].unique():
    code_mask = countries_df['ISO-ALPHA-3'] == booking_code
    matching_names = countries_df.loc[code_mask, 'Country'].values
    if len(matching_names) > 0:
        # Use the first matching name when the code is listed more than once.
        country_name = matching_names[0]
        country_to_iso[country_name] = booking_code
        iso_to_country[booking_code] = country_name
    else:
        # No reference row for this code: print it so it can be handled by hand.
        print(booking_code)

# 'CN' has no match in the reference table (it is printed by the loop above),
# so it is mapped to China manually.
iso_to_country['CN'] = 'China'
print(country_to_iso)
print(iso_to_country)

CN
{'United Kingdom': 'GBR', 'Portugal': 'PRT', 'United States': 'USA', 'Spain': 'ESP', 'Ireland': 'IRL', 'France': 'FRA', 'Romania': 'ROU', 'Norway': 'NOR', 'Poland': 'POL', 'Germany': 'DEU', 'Belgium': 'BEL', 'Switzerland': 'CHE', 'Italy': 'ITA', 'Netherlands': 'NLD', 'Russia': 'RUS', 'Sweden': 'SWE', 'Brazil': 'BRA', 'China': 'CHN', 'Austria': 'AUT', 'Israel': 'ISR'}
{'GBR': 'United Kingdom', 'PRT': 'Portugal', 'USA': 'United States', 'ESP': 'Spain', 'IRL': 'Ireland', 'FRA': 'France', 'ROU': 'Romania', 'NOR': 'Norway', 'POL': 'Poland', 'DEU': 'Germany', 'BEL': 'Belgium', 'CHE': 'Switzerland', 'ITA': 'Italy', 'NLD': 'Netherlands', 'RUS': 'Russia', 'SWE': 'Sweden', 'BRA': 'Brazil', 'CHN': 'China', 'AUT': 'Austria', 'ISR': 'Israel', 'CN': 'China'}


In [49]:
# Preview the first rows of the filtered bookings.
bookings_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,...,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,dia,tipo
0,3,Resort Hotel,0,7,2015,July,27,1,0,1,...,,0,Transient,75.0,0,0,Check-Out,2015-07-02,2015-07-01,work
1,4,Resort Hotel,0,13,2015,July,27,1,0,1,...,,0,Transient,75.0,0,0,Check-Out,2015-07-02,2015-07-01,work
2,5,Resort Hotel,0,14,2015,July,27,1,0,2,...,,0,Transient,98.0,0,1,Check-Out,2015-07-03,2015-07-01,work
3,6,Resort Hotel,0,14,2015,July,27,1,0,2,...,,0,Transient,98.0,0,1,Check-Out,2015-07-03,2015-07-01,work
4,7,Resort Hotel,0,0,2015,July,27,1,0,2,...,,0,Transient,107.0,0,0,Check-Out,2015-07-03,2015-07-01,work


In [50]:
# Accumulates the data of every chart (one key per chart) before it is exported as JSON.
charts_info_dict = dict()

In [51]:
# Chart 1: overall number of canceled vs. confirmed reservations.
# The counts use the vectorised Series.sum() instead of the builtin sum(),
# which would iterate the Series element by element in Python. The results are
# cast to plain int so they remain serialisable by the json module.
num_canceled = int(bookings_df['is_canceled'].eq(1).sum())
num_confirmed = int(bookings_df['is_canceled'].eq(0).sum())

# Share of canceled reservations, as a percentage.
cancel_rate = num_canceled/(num_canceled+num_confirmed) * 100
print(f'There is a total of {num_canceled} of canceled reservations')
print(f'There is a total of {num_confirmed} of confirmed reservations')
print(f'The cancel rate is: {cancel_rate}')

charts_info_dict['chart1'] = {'confirmed': num_confirmed, 'canceled': num_canceled}

There is a total of 41991 of canceled reservations
There is a total of 68588 of confirmed reservations
The cancel rate is: 37.97375631901175


# Chart 2: Mapa mundial

Per cada país cal tenir:
<ul>
  <li> localització
  <li> total reserves (confirmades/cancel·lades)
  <li> total reserves per tipus de grup (família, parella, sol)
  <li> total reserves per lloc de reserva (online, no-online)
  <li> total reserves per antelació en mesos (1, 2, 3, 4, 5, 6, 7+)
</ul>

In [52]:
# Per-country booking totals and cancellations, restricted to countries that
# reach MIN_BOOKS_THRESHOLD, with derived confirmations and cancellation rate
# (percentage), ordered from highest to lowest rate.
cancel_summary = (
    bookings_df
    .groupby(['country'])
    .agg(
        total_books=('is_canceled', 'count'),
        cancelations=('is_canceled', 'sum'),
    )
    .reset_index()
    .loc[lambda per_country: per_country['total_books'] >= MIN_BOOKS_THRESHOLD]
    .assign(
        confirmations=lambda per_country: per_country['total_books'] - per_country['cancelations'],
        cancelations_rate=lambda per_country: per_country['cancelations'] / per_country['total_books'] * 100,
    )
    .sort_values(by='cancelations_rate', ascending=False)
)

print(cancel_summary)


   country  total_books  cancelations  confirmations  cancelations_rate
16     PRT        47020         27326          19694          58.115695
4      CHN          997           462            535          46.339017
18     RUS          623           239            384          38.362761
2      BRA         2210           829           1381          37.511312
12     ITA         3749          1332           2417          35.529475
14     NOR          606           181            425          29.867987
17     ROU          500           134            366          26.800000
7      ESP         8488          2176           6312          25.636192
11     ISR          666           169            497          25.375375
3      CHE         1721           428           1293          24.869262
10     IRL         3369           832           2537          24.695755
20     USA         2085           500           1585          23.980815
15     POL          909           213            696          23

In [53]:
# Chart 2: per-country booking figures keyed by country display name.
#
# Several codes can map to the same display name (iso_to_country maps both
# 'CHN' and 'CN' to 'China'). Writing one dict entry per row would let the
# later row overwrite the earlier one, so the counts are accumulated per name
# and the rate is recomputed from the accumulated totals.
charts_info_dict['chart2'] = {}

for _, country_data in cancel_summary.iterrows():
    country = country_data['country']
    country_entry = charts_info_dict['chart2'].setdefault(
        iso_to_country[country],
        {'total_books': 0, 'cancelations': 0, 'confirmations': 0, 'cancelations_rate': 0.0},
    )
    # int() keeps the stored values plain Python ints for the JSON export.
    country_entry['total_books'] += int(country_data['total_books'])
    country_entry['cancelations'] += int(country_data['cancelations'])
    country_entry['confirmations'] += int(country_data['confirmations'])

# Cancellation rate as a percentage of the (possibly merged) totals.
for country_entry in charts_info_dict['chart2'].values():
    country_entry['cancelations_rate'] = (
        country_entry['cancelations'] / country_entry['total_books'] * 100
    )

In [54]:
# Bucket `lead_time` into 30-wide bands labelled 1..7: values up to 30 get 1,
# each following band of 30 gets the next label, and anything above 180 gets 7.
lead_time_values = bookings_df['lead_time']
band_edges = [180, 150, 120, 90, 60, 30]

# Conditions go from the highest band down to the lowest, matching `values`.
conditions = (
    [lead_time_values > band_edges[0]]
    + [
        (lead_time_values <= upper_edge) & (lead_time_values > lower_edge)
        for upper_edge, lower_edge in zip(band_edges, band_edges[1:])
    ]
    + [lead_time_values <= band_edges[-1]]
)

values = list(range(len(conditions), 0, -1))

bookings_df['lead_months'] = np.select(conditions, values)
bookings_df['lead_months'].value_counts()

lead_months
1    34243
7    23731
2    15756
3    11813
4     9804
5     7704
6     7528
Name: count, dtype: int64

In [55]:
# Number of bookings per distribution channel.
bookings_df.loc[:, 'distribution_channel'].value_counts()

distribution_channel
TA/TO        91361
Direct       12938
Corporate     6104
GDS            175
Undefined        1
Name: count, dtype: int64

In [56]:
# Export the collected chart data as JSON for the web application.
output_json_path = '../hotel-booking/public/data/hotel_bookings.json'

# Serialise before opening the file: opening with 'w' truncates it, so if
# serialisation raised while streaming into the open file, the previous export
# would be replaced by an empty or partial JSON document.
charts_json = json.dumps(charts_info_dict, ensure_ascii=False, indent=4)

with open(output_json_path, 'w', encoding='utf-8') as file:
    file.write(charts_json)