https://www.kaggle.com/datasets/bohnacker/country-longitude-latitude?resource=download


In [38]:
import json

import numpy as np
import pandas as pd

In [39]:
# Minimum number of bookings a country must have to be kept in the analysis;
# used below to filter the per-country booking counts.
MIN_BOOKS_THRESHOLD = 100

In [40]:
# Read the raw bookings table; the path is relative to the notebook's working directory.
file_path = './data/hotel_bookings.csv'
bookings_df = pd.read_csv(filepath_or_buffer=file_path)

# Print the full column list first, because the rich preview below elides the
# middle columns of this wide frame.
print(bookings_df.columns)

bookings_df.head(n=5)

Index(['Unnamed: 0', 'hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_month', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'adults', 'children', 'babies', 'meal',
       'country', 'market_segment', 'distribution_channel',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'reserved_room_type',
       'assigned_room_type', 'booking_changes', 'deposit_type', 'agent',
       'company', 'days_in_waiting_list', 'customer_type', 'adr',
       'required_car_parking_spaces', 'total_of_special_requests',
       'reservation_status', 'reservation_status_date', 'dia', 'tipo'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,...,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,dia,tipo
0,3,Resort Hotel,0,7,2015,July,27,1,0,1,...,,0,Transient,75.0,0,0,Check-Out,2015-07-02,2015-07-01,work
1,4,Resort Hotel,0,13,2015,July,27,1,0,1,...,,0,Transient,75.0,0,0,Check-Out,2015-07-02,2015-07-01,work
2,5,Resort Hotel,0,14,2015,July,27,1,0,2,...,,0,Transient,98.0,0,1,Check-Out,2015-07-03,2015-07-01,work
3,6,Resort Hotel,0,14,2015,July,27,1,0,2,...,,0,Transient,98.0,0,1,Check-Out,2015-07-03,2015-07-01,work
4,7,Resort Hotel,0,0,2015,July,27,1,0,2,...,,0,Transient,107.0,0,0,Check-Out,2015-07-03,2015-07-01,work


In [41]:
# Count bookings per country (non-null is_canceled entries). Rows whose country
# is missing are not counted because groupby drops NaN keys by default.
grouped_countries = (
    bookings_df
    .groupby(['country'])
    .agg(total_books=('is_canceled', 'count'))
    .reset_index()
)

# Keep only the countries that reach the minimum number of bookings.
grouped_countries = grouped_countries[grouped_countries['total_books'] >= MIN_BOOKS_THRESHOLD]

relevant_countries = grouped_countries['country']
num_books_before_clean = len(bookings_df)

# .copy() detaches the filtered frame from the one read from disk, so columns
# added to it later (e.g. lead_months) neither raise SettingWithCopyWarning nor
# depend on pandas' view-versus-copy behaviour.
bookings_df = bookings_df[bookings_df['country'].isin(relevant_countries)].copy()

num_books_after_clean = len(bookings_df)

print(f'A total {num_books_before_clean-num_books_after_clean} of books have been cleaned')

A total 2841 of books have been cleaned


In [42]:
# Reference table of countries; the columns used here are 'Country' and 'ISO-ALPHA-3'.
file_path = './data/countries.csv'
countries_df = pd.read_csv(file_path)

# ISO codes that appear on more than one row of the countries table
# (value_counts is computed once and reused for the filter).
iso_counts = countries_df['ISO-ALPHA-3'].value_counts()
repeated = iso_counts[iso_counts > 1].index

# Codes present in the bookings, computed once instead of on every iteration.
booked_codes = set(bookings_df['country'].unique())

# Show the competing names only for duplicated codes that actually occur in the bookings.
for country in repeated.to_list():
    if country in booked_codes:
        print(countries_df[countries_df['ISO-ALPHA-3'] == country][['Country', 'ISO-ALPHA-3']])

# Alternative names to discard so that these codes resolve to a single name.
# Codes whose duplicate names are not listed here keep several rows in the table.
unused_countries = ["Russian Federation", "China, People's Republic of", "United States",
                    "United Kingdom of Great Britain and Northern Ireland", "West Germany"]

countries_df = countries_df[~countries_df['Country'].isin(unused_countries)]

                        Country ISO-ALPHA-3
154  Korea, Republic of (South)         KOR
155                 South Korea         KOR
156          Korea, Republic of         KOR
                        Country ISO-ALPHA-3
53                        China         CHN
54  China, People's Republic of         CHN
                Country ISO-ALPHA-3
233  Russian Federation         RUS
234              Russia         RUS
           Country ISO-ALPHA-3
74  Czech Republic         CZE
75         Czechia         CZE
                                               Country ISO-ALPHA-3
107                                     United Kingdom         GBR
108  United Kingdom of Great Britain and Northern I...         GBR
                      Country ISO-ALPHA-3
286  United States of America         USA
287             United States         USA
         Country ISO-ALPHA-3
78       Germany         DEU
79  West Germany         DEU


In [43]:
# Resolve every country code found in the bookings to a display name taken
# from the countries table.
iso_to_country = {}

for code in bookings_df['country'].unique():
    names = countries_df.loc[countries_df['ISO-ALPHA-3'] == code, 'Country']
    if names.empty:
        # Report codes that the countries table does not contain.
        print(code)
    else:
        # Several rows may share a code; the first one in table order wins.
        iso_to_country[code] = names.iloc[0]

# Reverse lookup (name -> code), built before the manual entries are added so
# that it only contains names resolved from the countries table.
country_to_iso = {name: code for code, name in iso_to_country.items()}

# Manual entries. 'CN' is the code reported as unmatched by the loop above;
# the other two are presumably codes missing from the table as well.
# NOTE(review): 'CN' receives the name 'China', which can coincide with the
# name already resolved for 'CHN'.
iso_to_country.update({
    'CN': 'China',
    'TMP': 'Timor-Leste',
    'UMI': 'United States Minor Outlying Islands',
})
print(country_to_iso)
print(iso_to_country)

CN
{'United Kingdom': 'GBR', 'Portugal': 'PRT', 'United States of America': 'USA', 'Spain': 'ESP', 'Ireland': 'IRL', 'France': 'FRA', 'Romania': 'ROU', 'Norway': 'NOR', 'Argentina': 'ARG', 'Poland': 'POL', 'Germany': 'DEU', 'Belgium': 'BEL', 'Switzerland': 'CHE', 'Greece': 'GRC', 'Italy': 'ITA', 'Netherlands': 'NLD', 'Denmark': 'DNK', 'Russia': 'RUS', 'Sweden': 'SWE', 'Australia': 'AUS', 'Czech Republic': 'CZE', 'Brazil': 'BRA', 'Finland': 'FIN', 'Luxembourg': 'LUX', 'India': 'IND', 'China': 'CHN', 'Morocco': 'MAR', 'Serbia': 'SRB', 'Austria': 'AUT', 'Turkey': 'TUR', 'Israel': 'ISR', 'Algeria': 'DZA', 'Korea, Republic of (South)': 'KOR', 'Hungary': 'HUN', 'Croatia': 'HRV', 'Angola': 'AGO', 'Japan': 'JPN'}
{'GBR': 'United Kingdom', 'PRT': 'Portugal', 'USA': 'United States of America', 'ESP': 'Spain', 'IRL': 'Ireland', 'FRA': 'France', 'ROU': 'Romania', 'NOR': 'Norway', 'ARG': 'Argentina', 'POL': 'Poland', 'DEU': 'Germany', 'BEL': 'Belgium', 'CHE': 'Switzerland', 'GRC': 'Greece', 'ITA': 

In [44]:
# Preview the bookings again now that low-volume countries have been filtered out.
bookings_df.head()

Unnamed: 0.1,Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,...,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,dia,tipo
0,3,Resort Hotel,0,7,2015,July,27,1,0,1,...,,0,Transient,75.0,0,0,Check-Out,2015-07-02,2015-07-01,work
1,4,Resort Hotel,0,13,2015,July,27,1,0,1,...,,0,Transient,75.0,0,0,Check-Out,2015-07-02,2015-07-01,work
2,5,Resort Hotel,0,14,2015,July,27,1,0,2,...,,0,Transient,98.0,0,1,Check-Out,2015-07-03,2015-07-01,work
3,6,Resort Hotel,0,14,2015,July,27,1,0,2,...,,0,Transient,98.0,0,1,Check-Out,2015-07-03,2015-07-01,work
4,7,Resort Hotel,0,0,2015,July,27,1,0,2,...,,0,Transient,107.0,0,0,Check-Out,2015-07-03,2015-07-01,work


In [45]:
# Collects the data for each chart under its own key ('chart1', 'chart2', ...);
# the whole dict is written to a JSON file at the end of the notebook.
charts_info_dict = {}

In [46]:
# Overall cancellation figures for chart 1.
# The comparisons are summed with the vectorised Series.sum() rather than the
# Python builtin sum(), which would iterate the column element by element.
# int() turns the numpy result into a plain Python int so it can be JSON-serialised.
is_canceled = bookings_df['is_canceled']
num_canceled = int((is_canceled == 1).sum())
num_confirmed = int((is_canceled == 0).sum())

# Percentage of bookings (canceled + confirmed) that were canceled.
cancel_rate = num_canceled/(num_canceled+num_confirmed) * 100
print(f'There is a total of {num_canceled} of canceled reservations')
print(f'There is a total of {num_confirmed} of confirmed reservations')
print(f'The cancel rate is: {cancel_rate}')

charts_info_dict['chart1'] = {'confirmations': num_confirmed, 'cancelations': num_canceled}

There is a total of 43169 of canceled reservations
There is a total of 71381 of confirmed reservations
The cancel rate is: 37.68572675687473


# Chart 2: Mapa mundial

Per cada país cal tenir:
<ul>
  <li> localització
  <li> total reserves (confirmades/cancel·lades)
  <li> total reserves per tipus de grup (família, parella, sol)
  <li> total reserves per lloc de reserva (online, no-online)
  <li> total reserves per antelació en mesos (1, 2, 3, 4, 5, 6, 7+)
</ul>

In [47]:
# Per-country totals: total_books counts the non-null is_canceled entries and
# summing the 0/1 is_canceled flag gives the number of cancellations.
per_country = (
    bookings_df
    .groupby(['country'])
    .agg(
        total_books=('is_canceled', 'count'),
        cancelations=('is_canceled', 'sum'),
    )
    .reset_index()
)

# Keep countries that reach the booking threshold, derive the confirmed count
# and the cancellation percentage, and list the highest rates first.
has_enough_books = per_country['total_books'] >= MIN_BOOKS_THRESHOLD
cancel_summary = (
    per_country
    .loc[has_enough_books]
    .assign(
        confirmations=lambda df: df['total_books'] - df['cancelations'],
        cancelations_rate=lambda df: df['cancelations'] / df['total_books'] * 100,
    )
    .sort_values(by='cancelations_rate', ascending=False)
)

cancel_summary.head()

Unnamed: 0,country,total_books,cancelations,confirmations,cancelations_rate
31,PRT,47020,27326,19694,58.115695
0,AGO,355,205,150,57.746479
7,CHN,997,462,535,46.339017
27,MAR,256,109,147,42.578125
25,KOR,133,55,78,41.353383


In [48]:
# Build the per-country entries for chart 2, keyed by display name.
# More than one ISO code can resolve to the same display name (both 'CHN' and
# the manually mapped 'CN' resolve to 'China'). Assigning one entry per code
# would let the later code silently overwrite the earlier one, so the counts
# are accumulated per name and the rate is computed from the merged totals.
chart2 = {}

for row in cancel_summary.itertuples(index=False):
    # A code without a display name still raises KeyError here.
    entry = chart2.setdefault(
        iso_to_country[row.country],
        {'total_books': 0, 'cancelations': 0, 'confirmations': 0},
    )
    # int() keeps the values as plain Python ints so they are JSON-serialisable.
    entry['total_books'] += int(row.total_books)
    entry['cancelations'] += int(row.cancelations)
    entry['confirmations'] += int(row.confirmations)

for entry in chart2.values():
    total = entry['total_books']
    # Cancellation percentage over the (possibly merged) totals; NaN when there
    # are no bookings, instead of raising ZeroDivisionError.
    entry['cancelations_rate'] = entry['cancelations'] / total * 100 if total else float('nan')

charts_info_dict['chart2'] = chart2

In [49]:
# Bucket lead_time (presumably expressed in days) into 30-unit "months":
# 1 for values up to 30, 2 for 31-60, ... 6 for 151-180, and 7 for anything above 180.
lead_time = bookings_df['lead_time']
month_upper_bounds = (30, 60, 90, 120, 150, 180)

# np.select keeps the first condition that holds, so cumulative "<=" tests are
# equivalent to the disjoint ranges; the final condition is the open-ended bucket.
conditions = [lead_time <= upper_bound for upper_bound in month_upper_bounds]
conditions.append(lead_time > month_upper_bounds[-1])

values = list(range(1, len(conditions) + 1))

# Rows matching no condition get np.select's default of 0.
bookings_df['lead_months'] = np.select(conditions, values)
bookings_df['lead_months'].value_counts()

lead_months
1    35863
7    24233
2    16397
3    12239
4    10137
5     7950
6     7731
Name: count, dtype: int64

In [50]:
# Booking totals, cancellations, confirmations and cancellation percentage for
# each lead_months bucket, listed from the largest bucket to the smallest.
cancel_summary = (
    bookings_df
    .groupby(['lead_months'])
    .agg(
        total_books=('is_canceled', 'count'),
        cancelations=('is_canceled', 'sum'),
    )
    .reset_index()
    .assign(
        confirmations=lambda df: df['total_books'] - df['cancelations'],
        cancelations_rate=lambda df: df['cancelations'] / df['total_books'] * 100,
    )
    .sort_values(by='lead_months', ascending=False)
)

print(cancel_summary)

   lead_months  total_books  cancelations  confirmations  cancelations_rate
6            7        24233         13911          10322          57.405191
5            6         7731          3562           4169          46.074247
4            5         7950          3523           4427          44.314465
3            4        10137          4575           5562          45.131696
2            3        12239          4865           7374          39.749980
1            2        16397          5987          10410          36.512777
0            1        35863          6746          29117          18.810473


In [51]:
# Build the per-lead-month entries for chart 3.
# itertuples() is used instead of iterrows(): iterrows() returns each row as a
# single-dtype Series, which upcasts this all-numeric row to float and would
# produce keys such as 7.0 and counts such as 24233.0 in the exported JSON.
# The explicit int()/float() casts keep plain Python numbers in the dict.
# NOTE(review): the exported keys become "7", "6", ... instead of "7.0",
# "6.0", ... - confirm that the chart code reading the JSON does not rely on
# the float-formatted keys.
charts_info_dict['chart3'] = {
    int(row.lead_months): {
        'total_books': int(row.total_books),
        'cancelations': int(row.cancelations),
        'confirmations': int(row.confirmations),
        'cancelations_rate': float(row.cancelations_rate),
    }
    for row in cancel_summary.itertuples(index=False)
}

In [52]:
# Number of bookings per distribution channel.
# NOTE(review): presumably a first look at the data for the online / non-online
# breakdown listed in the Chart 2 notes; nothing is stored in charts_info_dict here.
bookings_df['distribution_channel'].value_counts()

distribution_channel
TA/TO        94690
Direct       13443
Corporate     6231
GDS            185
Undefined        1
Name: count, dtype: int64

In [53]:
# Destination of the exported chart data, relative to the notebook's working directory.
output_json_path = '../public/data/hotel_bookings.json'

# Serialise before opening the file: if some value is not JSON-serialisable,
# the error is raised here and an existing output file is left intact rather
# than truncated halfway through the dump.
charts_json = json.dumps(charts_info_dict, ensure_ascii=False, indent=4)

with open(output_json_path, 'w', encoding='utf-8') as file:
    file.write(charts_json)