In [1]:
import pandas as pd
import os
import re
import datetime
from names_dataset import NameDataset

Combining booking files from 2021 and 2022

In [2]:
# Folder holding the raw exports; file names are appended directly to this string,
# so the trailing slash must stay.
path = "./Raw data/"

# Read one workbook per year and stack them into a single frame with a fresh 0..n-1 index.
df = pd.concat(
    [pd.read_excel(path + workbook) for workbook in ("Booking_2021.xlsx", "Booking_2022.xlsx")],
    ignore_index = True,
)
df.head()

Unnamed: 0,guest,arrival_date,departure_date,room,booking_date,status,price,commision,booking_id,occypancy,number_of_guests
0,Richard Young,2021-07-13,2021-07-18,Studio with Patio and Sea View,2020-01-07,OK,310.0,55.8,2177211152,2 guests,2.0
1,Eileen Quinn,2021-09-18,2021-09-21,One-Bedroom Apartment with Balcony and Sea View,2020-01-09,OK,186.0,33.48,3595829045,2 guests,2.0
2,Horváth Zsuzsanna,2021-07-26,2021-08-02,One-Bedroom Apartment with Patio and Sea View,2021-05-06,OK,518.0,93.24,2184747030,4 adults,4.0
3,Sirok Gábor,2021-07-16,2021-07-30,One-Bedroom Apartment with Balcony and Sea View,2021-05-07,OK,1016.0,182.88,3216185971,4 adults,4.0
4,Serg Kostrukov,2021-06-28,2021-06-30,One-Bedroom Apartment with Balcony and Sea View,2021-05-14,OK,108.0,19.44,2149858814,4 adults,4.0


Extracting the total number of guests from the "occypancy" (occupancy) column

In [3]:
# helper: first number that appears in a string
def firstNumFromString(string):
    """Return the first run of decimal digits found in ``string`` as an int.

    Raises IndexError when ``string`` contains no digits at all.
    """
    return int(re.findall(r'\d+', string)[0])

# function to extract second number from string
def secondNumFromString(string):
    """Return the second run of decimal digits in ``string`` as an int.

    Returns 0 when ``string`` holds fewer than two numbers. This covers both
    a single number (e.g. "2 guests") and no number at all, which previously
    raised IndexError.
    """
    numbers = re.findall(r'\d+', string)
    # Anything short of two matches means there is no second value to report.
    if len(numbers) < 2:
        return 0
    return int(numbers[1])

In [4]:
# First number in the occupancy text is stored as "adults", the second (0 when absent) as "kids".
df["adults"] = df['occypancy'].apply(firstNumFromString)
df["kids"] = df['occypancy'].apply(secondNumFromString)

# Total head count per reservation.
df["# of guests"] = df["adults"].add(df["kids"])

Dropping insignificant columns

In [5]:
# Columns removed from the frame: the raw occupancy text, the source's own guest count
# (the computed "# of guests" is kept instead), the intermediate adults/kids split,
# and the booking id.
drop_columns = [
    "occypancy",
    "number_of_guests",
    "adults",
    "kids",
    "booking_id",
]

df.drop(columns = drop_columns, inplace = True)
df.head()

Unnamed: 0,guest,arrival_date,departure_date,room,booking_date,status,price,commision,# of guests
0,Richard Young,2021-07-13,2021-07-18,Studio with Patio and Sea View,2020-01-07,OK,310.0,55.8,2
1,Eileen Quinn,2021-09-18,2021-09-21,One-Bedroom Apartment with Balcony and Sea View,2020-01-09,OK,186.0,33.48,2
2,Horváth Zsuzsanna,2021-07-26,2021-08-02,One-Bedroom Apartment with Patio and Sea View,2021-05-06,OK,518.0,93.24,4
3,Sirok Gábor,2021-07-16,2021-07-30,One-Bedroom Apartment with Balcony and Sea View,2021-05-07,OK,1016.0,182.88,4
4,Serg Kostrukov,2021-06-28,2021-06-30,One-Bedroom Apartment with Balcony and Sea View,2021-05-14,OK,108.0,19.44,4


Formatting dates

In [6]:
# Convert the three date columns to pandas datetimes so they can be subtracted from each other.
for date_column in ('departure_date', 'arrival_date', 'booking_date'):
    df[date_column] = pd.to_datetime(df[date_column])

Calculating the number of days between the booking date and the arrival date, the number of nights stayed, and the earnings before taxes

In [7]:
# Lead time: whole days from the booking date to the arrival date.
df['# of days pre booked'] = df['arrival_date'].sub(df['booking_date']).dt.days
# Length of stay: whole days from arrival to departure.
df['# of nights'] = df['departure_date'].sub(df['arrival_date']).dt.days

# "Earnings" is the price plus the commission.
df["Earnings"] = df["price"].add(df["commision"])

Renaming columns

In [8]:
# Map the raw column names onto the report's column names, in place.
df.rename(
    columns = {
        "guest": "Guest name",
        "arrival_date": "Start date",
        "departure_date": "End date",
        "room": "Listing name",
        "booking_date": "Booked",
        "status": "Status",
        "price": "Earnings after Tax",
        "commision": "Tax",
    },
    inplace = True,
)
df.head()

Unnamed: 0,Guest name,Start date,End date,Listing name,Booked,Status,Earnings after Tax,Tax,# of guests,# of days pre booked,# of nights,Earnings
0,Richard Young,2021-07-13,2021-07-18,Studio with Patio and Sea View,2020-01-07,OK,310.0,55.8,2,553,5,365.8
1,Eileen Quinn,2021-09-18,2021-09-21,One-Bedroom Apartment with Balcony and Sea View,2020-01-09,OK,186.0,33.48,2,618,3,219.48
2,Horváth Zsuzsanna,2021-07-26,2021-08-02,One-Bedroom Apartment with Patio and Sea View,2021-05-06,OK,518.0,93.24,4,81,7,611.24
3,Sirok Gábor,2021-07-16,2021-07-30,One-Bedroom Apartment with Balcony and Sea View,2021-05-07,OK,1016.0,182.88,4,70,14,1198.88
4,Serg Kostrukov,2021-06-28,2021-06-30,One-Bedroom Apartment with Balcony and Sea View,2021-05-14,OK,108.0,19.44,4,45,2,127.44


Replacing the 'OK' status with 'Confirmed' to be compatible with the Airbnb data

In [9]:
# Only the 'OK' label is rewritten; every other status value is left untouched.
df['Status'] = df['Status'].replace({'OK': 'Confirmed'})
df["Status"].value_counts()

Confirmed     123
Smart Flex     13
Risk-Free      10
Name: Status, dtype: int64

Splitting reservations that contain multiple rooms into one row per room

In [10]:
# List the distinct listing names; an entry that contains a comma is a multi-room reservation.
print(pd.unique(df["Listing name"]))

['Studio with Patio and Sea View'
 'One-Bedroom Apartment with Balcony and Sea View'
 'One-Bedroom Apartment with Patio and Sea View'
 '1 x One-Bedroom Apartment with Balcony and Sea View, 1 x Studio with Patio and Sea View'
 'Studio with Patio'
 '1 x One-Bedroom Apartment with Balcony and Sea View, 1 x One-Bedroom Apartment with Patio and Sea View'
 '1 x Studio with Patio and Sea View, 1 x One-Bedroom Apartment with Patio and Sea View']


In [11]:
# Index labels of the reservations whose listing name holds several comma-separated rooms.
# A boolean mask replaces the hand-maintained row counter: the result is used with df.loc,
# so it has to be index labels rather than positional offsets. The two only coincide
# because the frame was built with a fresh 0..n-1 index.
# na=False treats a missing listing name as "not multi-room" instead of raising.
is_multi_room = df['Listing name'].str.contains(',', regex=False, na=False)
positions = df.index[is_multi_room].tolist()

print('Multiple room reservations are at indexes:', positions)

Multiple room reservations are at indexes: [8, 37, 58]


In [12]:
# Show the multi-room reservations (rows selected by index label, all columns).
df.loc[positions, :]

Unnamed: 0,Guest name,Start date,End date,Listing name,Booked,Status,Earnings after Tax,Tax,# of guests,# of days pre booked,# of nights,Earnings
8,Edyta Frączek,2021-07-01,2021-07-11,1 x One-Bedroom Apartment with Balcony and Sea...,2021-06-01,Confirmed,1080.0,194.4,2,30,10,1274.4
37,Yuliya Slobodyan,2021-08-15,2021-08-16,1 x One-Bedroom Apartment with Balcony and Sea...,2021-07-02,Confirmed,118.0,21.24,2,44,1,139.24
58,Radoslav Raykov,2021-08-07,2021-08-08,"1 x Studio with Patio and Sea View, 1 x One-Be...",2021-07-20,Risk-Free,128.0,23.04,6,18,1,151.04


In [13]:
# Turn every multi-room reservation into one row per room.
#
# Built in one vectorised pass instead of growing a frame with DataFrame.append inside a
# loop: append was removed in pandas 2.0 and copied the whole frame on every iteration.

# Totals that are shared equally between the rooms of one reservation.
split_columns = ['Earnings after Tax', 'Tax', '# of guests', 'Earnings']

# Rooms are comma-separated inside "Listing name", so rooms = commas + 1.
# Dividing by the real room count (rather than a fixed 2) keeps the totals correct for
# reservations with three or more rooms; single-room rows are divided by 1 and unchanged.
rooms_per_booking = df['Listing name'].str.count(',') + 1

new_df = (
    df.assign(**{column: df[column] / rooms_per_booking for column in split_columns})
      # Each listing name becomes a list of room names. The pieces keep their original
      # text, including any leading space and "1 x" prefix.
      .assign(**{'Listing name': df['Listing name'].str.split(',')})
      # One output row per room, in the original order; all other columns are repeated.
      .explode('Listing name')
      .reset_index(drop=True)
)
# Note: "# of guests" is a float column after the division, since a split can be fractional.

In [14]:
# Locate the split rows in new_df. This assumes every multi-room reservation became exactly
# two consecutive rows: each original position is pushed down by the number of split
# reservations that precede it, and its second row follows immediately.
new_positions = []

for shift, p in enumerate(positions):
    new_positions += [p + shift, p + shift + 1]

print('New index positions are:', new_positions)

New index positions are: [8, 9, 38, 39, 60, 61]


In [15]:
# Show the rows produced by the split (selected by index label, all columns).
new_df.loc[new_positions, :]

Unnamed: 0,Guest name,Start date,End date,Listing name,Booked,Status,Earnings after Tax,Tax,# of guests,# of days pre booked,# of nights,Earnings
8,Edyta Frączek,2021-07-01,2021-07-11,1 x One-Bedroom Apartment with Balcony and Sea...,2021-06-01,Confirmed,540.0,97.2,1.0,30,10,637.2
9,Edyta Frączek,2021-07-01,2021-07-11,1 x Studio with Patio and Sea View,2021-06-01,Confirmed,540.0,97.2,1.0,30,10,637.2
38,Yuliya Slobodyan,2021-08-15,2021-08-16,1 x One-Bedroom Apartment with Balcony and Sea...,2021-07-02,Confirmed,59.0,10.62,1.0,44,1,69.62
39,Yuliya Slobodyan,2021-08-15,2021-08-16,1 x One-Bedroom Apartment with Patio and Sea ...,2021-07-02,Confirmed,59.0,10.62,1.0,44,1,69.62
60,Radoslav Raykov,2021-08-07,2021-08-08,1 x Studio with Patio and Sea View,2021-07-20,Risk-Free,64.0,11.52,3.0,18,1,75.52
61,Radoslav Raykov,2021-08-07,2021-08-08,1 x One-Bedroom Apartment with Patio and Sea ...,2021-07-20,Risk-Free,64.0,11.52,3.0,18,1,75.52


Removing the "1 x " prefix from the listing names

In [16]:
# helper: strip a leading "1 x " (optionally preceded by one space) from a string
def remove_prefix(string):
    """Return ``string`` without a leading "1 x " or " 1 x " marker.

    Only a match at the very start of the string is removed; any other
    occurrence of "1 x " is left in place.
    """
    leading_marker = re.compile("^1 x |^ 1 x ")
    return leading_marker.sub("", string)

In [17]:
# Clean every listing name, then re-display the split rows to confirm the prefix is gone.
new_df["Listing name"] = new_df['Listing name'].apply(remove_prefix)
new_df.loc[new_positions]

Unnamed: 0,Guest name,Start date,End date,Listing name,Booked,Status,Earnings after Tax,Tax,# of guests,# of days pre booked,# of nights,Earnings
8,Edyta Frączek,2021-07-01,2021-07-11,One-Bedroom Apartment with Balcony and Sea View,2021-06-01,Confirmed,540.0,97.2,1.0,30,10,637.2
9,Edyta Frączek,2021-07-01,2021-07-11,Studio with Patio and Sea View,2021-06-01,Confirmed,540.0,97.2,1.0,30,10,637.2
38,Yuliya Slobodyan,2021-08-15,2021-08-16,One-Bedroom Apartment with Balcony and Sea View,2021-07-02,Confirmed,59.0,10.62,1.0,44,1,69.62
39,Yuliya Slobodyan,2021-08-15,2021-08-16,One-Bedroom Apartment with Patio and Sea View,2021-07-02,Confirmed,59.0,10.62,1.0,44,1,69.62
60,Radoslav Raykov,2021-08-07,2021-08-08,Studio with Patio and Sea View,2021-07-20,Risk-Free,64.0,11.52,3.0,18,1,75.52
61,Radoslav Raykov,2021-08-07,2021-08-08,One-Bedroom Apartment with Patio and Sea View,2021-07-20,Risk-Free,64.0,11.52,3.0,18,1,75.52


Combining data frame with ApartmentTypes.txt

In [18]:
# Apartment reference table, indexed by its "Naziv" column so rows can be looked up by listing name.
df_listing = pd.read_csv(path + "ApartmentTypes.txt", index_col = "Naziv")

# Look up "BrojKreveta" for each reservation's listing name. to_numpy() discards the
# listing-name index so the values are assigned row by row instead of being aligned on index.
new_df["# of beds"] = df_listing["BrojKreveta"].loc[new_df["Listing name"]].to_numpy()

new_df.head()

Unnamed: 0,Guest name,Start date,End date,Listing name,Booked,Status,Earnings after Tax,Tax,# of guests,# of days pre booked,# of nights,Earnings,# of beds
0,Richard Young,2021-07-13,2021-07-18,Studio with Patio and Sea View,2020-01-07,Confirmed,310.0,55.8,2,553,5,365.8,2
1,Eileen Quinn,2021-09-18,2021-09-21,One-Bedroom Apartment with Balcony and Sea View,2020-01-09,Confirmed,186.0,33.48,2,618,3,219.48,4
2,Horváth Zsuzsanna,2021-07-26,2021-08-02,One-Bedroom Apartment with Patio and Sea View,2021-05-06,Confirmed,518.0,93.24,4,81,7,611.24,4
3,Sirok Gábor,2021-07-16,2021-07-30,One-Bedroom Apartment with Balcony and Sea View,2021-05-07,Confirmed,1016.0,182.88,4,70,14,1198.88,4
4,Serg Kostrukov,2021-06-28,2021-06-30,One-Bedroom Apartment with Balcony and Sea View,2021-05-14,Confirmed,108.0,19.44,4,45,2,127.44,4


Adding each guest's likely country of origin based on their first and last name (requires `pip install names-dataset`)

In [19]:
# Build the names-dataset lookup object once; later cells query it through nd.search().
nd = NameDataset()

In [20]:
# Sample lookup showing the structure nd.search() returns: a dict with "first_name" and
# "last_name" entries, each holding "country", "gender" and "rank" mappings.
print(nd.search("Walter"))

{'first_name': {'country': {'Argentina': 0.062, 'Austria': 0.037, 'Bolivia, Plurinational State of': 0.042, 'Colombia': 0.096, 'Germany': 0.044, 'Italy': 0.295, 'Peru': 0.185, 'United States': 0.159, 'Uruguay': 0.036, 'South Africa': 0.043}, 'gender': {'Female': 0.007, 'Male': 0.993}, 'rank': {'Argentina': 37, 'Austria': 34, 'Bolivia, Plurinational State of': 67, 'Colombia': 250, 'Germany': 214, 'Italy': 193, 'Peru': 27, 'United States': 317, 'Uruguay': 44, 'South Africa': 388}}, 'last_name': {'country': {'Austria': 0.036, 'Brazil': 0.039, 'Switzerland': 0.032, 'Germany': 0.299, 'France': 0.121, 'United Kingdom': 0.048, 'Italy': 0.09, 'Nigeria': 0.078, 'United States': 0.172, 'South Africa': 0.085}, 'gender': {}, 'rank': {'Austria': 106, 'Brazil': 805, 'Switzerland': 140, 'Germany': 39, 'France': 625, 'United Kingdom': 1823, 'Italy': 3564, 'Nigeria': 926, 'United States': 1210, 'South Africa': 1169}}}


In [21]:
def name_origin(name, lastname):
    """Guess a country of origin from a first name and a last name.

    ``name`` is looked up as a first name and ``lastname`` as a last name via
    ``nd.search``. Each lookup contributes its most probable country; the
    country with the higher probability is returned. On a tie the last name's
    country wins, because the first name only replaces it when strictly more
    probable. Returns "Unknown" when neither lookup yields any country.
    """
    # country -> probability mapping for the first name (None when the name is not found)
    first_entry = nd.search(name)["first_name"]
    first_countries = first_entry["country"] if first_entry is not None else None

    # country -> probability mapping for the last name (None when the name is not found)
    last_entry = nd.search(lastname)["last_name"]
    last_countries = last_entry["country"] if last_entry is not None else None

    best_country, best_prob = "Unknown", 0

    # The last name is considered first; the first name overrides it only with a
    # strictly higher probability.
    for countries in (last_countries, first_countries):
        if not countries:
            continue
        top_country = max(countries, key=countries.get)
        if countries[top_country] > best_prob:
            best_country, best_prob = top_country, countries[top_country]

    return best_country

In [22]:
def _guest_origin(full_name):
    """Split a guest's full name into two tokens and return name_origin() for them.

    The first two whitespace-separated tokens are passed as (name, lastname).
    Splitting on any whitespace means a double space no longer yields an empty
    last name. A single-word name is looked up both as a first and as a last
    name instead of raising TypeError for a missing argument. An empty name
    gives "Unknown", the same fallback name_origin() uses.
    """
    tokens = full_name.split()
    if not tokens:
        return "Unknown"
    first_token = tokens[0]
    second_token = tokens[1] if len(tokens) > 1 else tokens[0]
    return name_origin(first_token, second_token)

new_df["Origin"] = new_df["Guest name"].apply(_guest_origin)

# Every row in this notebook comes from the Booking export.
new_df["Advertiser"] = "Booking"

Rearranging columns

In [23]:
# Keep exactly these columns, in this order, for the exported report.
new_df = new_df[
    [
        "Status",
        "Guest name",
        "Origin",
        "# of guests",
        "Booked",
        "Start date",
        "End date",
        "# of nights",
        "# of days pre booked",
        "Listing name",
        "# of beds",
        "Earnings",
        "Tax",
        "Earnings after Tax",
        "Advertiser",
    ]
]

Exporting the data frame to an Excel file

In [24]:
# Output folder; the file name is appended directly, so the trailing slash must stay.
exportPath = "./Processed data/"

# Write the final frame (index column included, as by default) and preview it.
new_df.to_excel(f"{exportPath}Final_Booking.xlsx")
new_df.head()

Unnamed: 0,Status,Guest name,Origin,# of guests,Booked,Start date,End date,# of nights,# of days pre booked,Listing name,# of beds,Earnings,Tax,Earnings after Tax,Advertiser
0,Confirmed,Richard Young,United States,2,2020-01-07,2021-07-13,2021-07-18,5,553,Studio with Patio and Sea View,2,365.8,55.8,310.0,Booking
1,Confirmed,Eileen Quinn,United Kingdom,2,2020-01-09,2021-09-18,2021-09-21,3,618,One-Bedroom Apartment with Balcony and Sea View,4,219.48,33.48,186.0,Booking
2,Confirmed,Horváth Zsuzsanna,Hungary,4,2021-05-06,2021-07-26,2021-08-02,7,81,One-Bedroom Apartment with Patio and Sea View,4,611.24,93.24,518.0,Booking
3,Confirmed,Sirok Gábor,Hungary,4,2021-05-07,2021-07-16,2021-07-30,14,70,One-Bedroom Apartment with Balcony and Sea View,4,1198.88,182.88,1016.0,Booking
4,Confirmed,Serg Kostrukov,Russian Federation,4,2021-05-14,2021-06-28,2021-06-30,2,45,One-Bedroom Apartment with Balcony and Sea View,4,127.44,19.44,108.0,Booking
