In [1]:
import pandas as pd
import os
import re
import datetime
from names_dataset import NameDataset

Getting all Airbnb file names

In [2]:
# Directory holding the raw Airbnb exports. Later cells build file paths by
# plain string concatenation, so the trailing slash is required.
path = "./original files/"

# os.listdir returns entries in arbitrary order; sorting keeps the processing
# order (and the list displayed below) stable across runs and platforms.
excel_files = sorted(
    entry
    for entry in os.listdir(path)
    if entry.startswith("Airbnb") and entry.endswith(".xlsx")
)

excel_files

['Airbnb_2019.xlsx',
 'Airbnb_2020.xlsx',
 'Airbnb_2021.xlsx',
 'Airbnb_2022.xlsx']

Formatting Guest name and Earnings columns

In [3]:
def cp2utf(name):
    """Repair text whose UTF-8 bytes were mis-decoded as cp1252 (mojibake).

    The value is re-encoded as cp1252 and the resulting bytes are decoded
    as UTF-8.

    Parameters
    ----------
    name : object
        Value to repair. Anything that is not a ``str`` (for example NaN
        from an empty spreadsheet cell) is returned unchanged.

    Returns
    -------
    object
        The repaired string, or ``name`` itself when it is not a string or
        when the cp1252 -> UTF-8 round trip is not possible.
    """
    # Missing cells arrive as non-string values; there is nothing to repair.
    if not isinstance(name, str):
        return name
    try:
        return name.encode("cp1252").decode("UTF-8")
    except UnicodeError as e:
        # Only encoding/decoding failures are expected here, e.g. text that
        # was never mojibake (such as Cyrillic names). Report it and keep
        # the value as it is; any other exception type is a real bug and
        # is allowed to propagate.
        print(f'error: {e} for name {name}')
        return name

In [4]:
# Clean each yearly export and save the result next to it as "codec_<file>".
for excel_name in excel_files:
    bookings = pd.read_excel(path + excel_name)

    # Undo the cp1252/UTF-8 mix-up in the guest names.
    bookings["Guest name"] = bookings["Guest name"].apply(cp2utf)

    # Drop the first three characters, trim whitespace and parse as a float.
    # NOTE(review): the three-character prefix is presumably the currency
    # marker -- confirm against the raw export.
    bookings["Earnings"] = bookings["Earnings"].apply(
        lambda raw: float(raw[3:].strip())
    )

    bookings.to_excel(path + f"codec_{excel_name}")

error: 'charmap' codec can't encode characters in position 0-5: character maps to <undefined> for name Андрій Щур


Combining formatted Airbnb Excel files into one

In [5]:
# Collect the cleaned per-year files in a deterministic order (os.listdir
# makes no ordering promise) so the combined row order is reproducible.
codec_files = sorted(
    entry
    for entry in os.listdir(path)
    if entry.startswith("codec_") and entry.endswith(".xlsx")
)

# Read every file once and concatenate in a single call, rather than growing
# a DataFrame inside the loop (which re-copies all accumulated rows on each
# iteration). index_col=0 consumes the index column written by to_excel.
yearly_frames = [pd.read_excel(path + entry, index_col=0) for entry in codec_files]

# pd.concat raises on an empty list; keep the previous outcome (an empty
# frame) when no cleaned files are present.
df = pd.concat(yearly_frames, ignore_index=True) if yearly_frames else pd.DataFrame()

df.to_excel(path + "combinedAirbnb.xlsx")


Combining data frame with ApartmentTypes.txt

In [6]:
# Reload the combined bookings and give the two listing columns clearer names.
df = pd.read_excel(path + "combinedAirbnb.xlsx", index_col=0).rename(
    columns={"Listing": "Listing description", "Listing name": "Listing code"}
)

# Apartment lookup table, indexed by the code stored in "Listing code".
df_listing = pd.read_csv(path + "ApartmentTypes.txt", index_col="InterniNaziv")

# Look up bed count and display name for each booking's listing code.
# to_numpy() discards the lookup's index so values are assigned by position.
listing_codes = df["Listing code"]
df["# of beds"] = df_listing.loc[listing_codes, "BrojKreveta"].to_numpy()
df["Listing name"] = df_listing.loc[listing_codes, "Naziv"].to_numpy()

df.head()

Unnamed: 0,Confirmation code,Status,Guest name,# of adults,# of children,# of infants,Start date,End date,# of nights,Booked,Listing description,Earnings,Listing code,Contact,# of beds,Listing name
0,HMANM3BZNK,Confirmed,Egor Zhidkov,2,0,0,2019-06-20,2019-06-22,2,2019-03-28,Beautiful studio with patio 5 min. to old town,67.9,B,,2,Studio with Patio
1,HMAMP59PS4,Confirmed,Youstina Daoud,2,0,0,2019-06-03,2019-06-05,2,2019-03-30,Beautiful studio with patio 5 min. to old town,67.9,B,,2,Studio with Patio
2,HMAC8EWZ3P,Confirmed,Öznur Balaban,2,0,0,2019-06-08,2019-06-09,1,2019-04-09,Beautiful studio with patio 5 min. to old town,33.95,B,,2,Studio with Patio
3,HMAANWWMXF,Confirmed,Михаил Кодолов,2,0,0,2019-06-16,2019-06-20,4,2019-05-01,Beautiful studio with patio 5 min. to old town,135.8,B,,2,Studio with Patio
4,HMAQTTMMNJ,Confirmed,Janie Macpherson,1,0,0,2019-09-18,2019-09-23,5,2019-05-02,Beautiful studio with patio 5 min. to old town,169.75,B,,2,Studio with Patio


Dropping insignificant columns and columns with one unique value

In [7]:
# Columns holding a single distinct value carry no information; collect them
# for removal and report the value each one holds.
drop_columns = []

for col_name in df.columns:
    distinct_values = df[col_name].unique()
    if len(distinct_values) != 1:
        continue
    drop_columns.append(col_name)
    print("Column name:", col_name, "| Value:", *distinct_values)

Column name: # of children | Value: 0
Column name: # of infants | Value: 0


In [8]:
# Rename the adults column to a guest count.
df = df.rename(columns={"# of adults": "# of guests"})

In [9]:
# In addition to the single-valued columns collected earlier, these columns
# are not used by the rest of the notebook.
drop_columns.extend(
    ["Listing description", "Listing code", "Confirmation code", "Contact"]
)

df = df.drop(columns=drop_columns)

df.head()

Unnamed: 0,Status,Guest name,# of guests,Start date,End date,# of nights,Booked,Earnings,# of beds,Listing name
0,Confirmed,Egor Zhidkov,2,2019-06-20,2019-06-22,2,2019-03-28,67.9,2,Studio with Patio
1,Confirmed,Youstina Daoud,2,2019-06-03,2019-06-05,2,2019-03-30,67.9,2,Studio with Patio
2,Confirmed,Öznur Balaban,2,2019-06-08,2019-06-09,1,2019-04-09,33.95,2,Studio with Patio
3,Confirmed,Михаил Кодолов,2,2019-06-16,2019-06-20,4,2019-05-01,135.8,2,Studio with Patio
4,Confirmed,Janie Macpherson,1,2019-09-18,2019-09-23,5,2019-05-02,169.75,2,Studio with Patio


Calculating the number of days booked in advance and the taxes, then deducting the taxes from earnings

In [10]:
# Taxes are 1 dollar per adult per night
tax = df["# of guests"] * df["# of nights"]

# Time between the booking date and the first night of the stay.
lead_time = df["Start date"] - df["Booked"]

df["Tax"] = tax
df["Earnings after Tax"] = df["Earnings"] - tax
df["# of days pre booked"] = lead_time.dt.days

df.head()

Unnamed: 0,Status,Guest name,# of guests,Start date,End date,# of nights,Booked,Earnings,# of beds,Listing name,Tax,Earnings after Tax,# of days pre booked
0,Confirmed,Egor Zhidkov,2,2019-06-20,2019-06-22,2,2019-03-28,67.9,2,Studio with Patio,4,63.9,84
1,Confirmed,Youstina Daoud,2,2019-06-03,2019-06-05,2,2019-03-30,67.9,2,Studio with Patio,4,63.9,65
2,Confirmed,Öznur Balaban,2,2019-06-08,2019-06-09,1,2019-04-09,33.95,2,Studio with Patio,2,31.95,60
3,Confirmed,Михаил Кодолов,2,2019-06-16,2019-06-20,4,2019-05-01,135.8,2,Studio with Patio,8,127.8,46
4,Confirmed,Janie Macpherson,1,2019-09-18,2019-09-23,5,2019-05-02,169.75,2,Studio with Patio,5,164.75,139


Adding guests' origin based on first and last name (requires `pip install names-dataset`)

In [11]:
# Create the names-dataset lookup object; its search() method is used in the
# cells below to estimate a guest's country from their first and last name.
nd = NameDataset()

In [12]:
# Sample lookup to show the structure that nd.search() returns.
walter_lookup = nd.search("Walter")
print(walter_lookup)

{'first_name': {'country': {'Argentina': 0.062, 'Austria': 0.037, 'Bolivia, Plurinational State of': 0.042, 'Colombia': 0.096, 'Germany': 0.044, 'Italy': 0.295, 'Peru': 0.185, 'United States': 0.159, 'Uruguay': 0.036, 'South Africa': 0.043}, 'gender': {'Female': 0.007, 'Male': 0.993}, 'rank': {'Argentina': 37, 'Austria': 34, 'Bolivia, Plurinational State of': 67, 'Colombia': 250, 'Germany': 214, 'Italy': 193, 'Peru': 27, 'United States': 317, 'Uruguay': 44, 'South Africa': 388}}, 'last_name': {'country': {'Austria': 0.036, 'Brazil': 0.039, 'Switzerland': 0.032, 'Germany': 0.299, 'France': 0.121, 'United Kingdom': 0.048, 'Italy': 0.09, 'Nigeria': 0.078, 'United States': 0.172, 'South Africa': 0.085}, 'gender': {}, 'rank': {'Austria': 106, 'Brazil': 805, 'Switzerland': 140, 'Germany': 39, 'France': 625, 'United Kingdom': 1823, 'Italy': 3564, 'Nigeria': 926, 'United States': 1210, 'South Africa': 1169}}}


In [13]:
def _country_probabilities(dataset, term, kind):
    """Return the country -> probability mapping for ``term``, or None.

    ``kind`` selects which part of the search result to read
    ("first_name" or "last_name").
    """
    # Nothing to look up for a missing or empty name part.
    if not term:
        return None
    entry = dataset.search(term).get(kind)
    if not entry:
        return None
    return entry.get("country")


def name_origin(name, lastname=None, dataset=None):
    """Guess a person's country of origin from their first and last name.

    The most probable country for the last name and the most probable
    country for the first name are compared, and the one with the higher
    probability is returned. On a tie the last name's country wins.

    Parameters
    ----------
    name : str
        First name.
    lastname : str, optional
        Last name. When omitted, only the first name is considered.
    dataset : object, optional
        Object exposing ``search(term)`` that returns a dict with
        "first_name" and "last_name" entries, each either None or a dict
        containing a "country" mapping. Defaults to the notebook-level
        ``nd`` instance.

    Returns
    -------
    str
        The most probable country, or "Unknown" when neither name part
        yields any country probabilities.
    """
    if dataset is None:
        dataset = nd

    country = "Unknown"
    prob = 0

    # The last name is evaluated first and the comparison is strict, so the
    # first name only overrides it with a strictly higher probability.
    for term, kind in ((lastname, "last_name"), (name, "first_name")):
        probabilities = _country_probabilities(dataset, term, kind)
        if not probabilities:
            continue
        best = max(probabilities, key=probabilities.get)
        if probabilities[best] > prob:
            country = best
            prob = probabilities[best]

    return country

In [14]:
def _guest_origin(guest_name):
    """Return the estimated origin for a full guest name.

    The first two whitespace-separated tokens are treated as first and last
    name. Values that are not strings (e.g. NaN for a missing name) and
    blank strings give "Unknown".
    """
    if not isinstance(guest_name, str):
        return "Unknown"

    # split() without an argument ignores repeated/leading/trailing
    # whitespace, so a double space cannot produce an empty last name.
    tokens = guest_name.split()
    if not tokens:
        return "Unknown"

    first = tokens[0]
    # A lone token may be either a given name or a surname, so it is looked
    # up as both instead of failing on a missing argument.
    last = tokens[1] if len(tokens) > 1 else first
    return name_origin(first, last)


df["Origin"] = df["Guest name"].apply(_guest_origin)

df["Advertiser"] = "Airbnb"

In [15]:
# How many bookings fall under each status value.
status_counts = df["Status"].value_counts()
print(status_counts)

Confirmed              105
Arriving in 49 days      1
Arriving in 82 days      1
Arriving in 13 days      1
Arriving tomorrow        1
Name: Status, dtype: int64


Rearranging columns

In [16]:
# Final column layout: guest details, dates, listing details, then money.
column_order = [
    "Status",
    "Guest name",
    "Origin",
    "# of guests",
    "Booked",
    "Start date",
    "End date",
    "# of nights",
    "# of days pre booked",
    "Listing name",
    "# of beds",
    "Earnings",
    "Tax",
    "Earnings after Tax",
    "Advertiser",
]
df = df[column_order]

Exporting data frame to excel file

In [17]:
# Output directory; the trailing slash is required because the file path is
# built by string concatenation.
exportPath = "./preprocessed files/"

# Make sure the export directory exists so the write below does not fail
# when the notebook is run on a fresh checkout.
os.makedirs(exportPath, exist_ok=True)

df.to_excel(exportPath + "Final_Airbnb.xlsx")
df.head()

Unnamed: 0,Status,Guest name,Origin,# of guests,Booked,Start date,End date,# of nights,# of days pre booked,Listing name,# of beds,Earnings,Tax,Earnings after Tax,Advertiser
0,Confirmed,Egor Zhidkov,Russian Federation,2,2019-03-28,2019-06-20,2019-06-22,2,84,Studio with Patio,2,67.9,4,63.9,Airbnb
1,Confirmed,Youstina Daoud,Egypt,2,2019-03-30,2019-06-03,2019-06-05,2,65,Studio with Patio,2,67.9,4,63.9,Airbnb
2,Confirmed,Öznur Balaban,Turkey,2,2019-04-09,2019-06-08,2019-06-09,1,60,Studio with Patio,2,33.95,2,31.95,Airbnb
3,Confirmed,Михаил Кодолов,Russian Federation,2,2019-05-01,2019-06-16,2019-06-20,4,46,Studio with Patio,2,135.8,8,127.8,Airbnb
4,Confirmed,Janie Macpherson,United States,1,2019-05-02,2019-09-18,2019-09-23,5,139,Studio with Patio,2,169.75,5,164.75,Airbnb
