# Edinburgh AirBnB Property Analysis
#### Author: Pavel Khudov


In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.decomposition import PCA
from kneed import KneeLocator
import statsmodels.formula.api as smf

Import the dirty data and show the first 5 rows:

In [88]:
# Download the raw (uncleaned) AirBnB listings and preview the first rows
LISTINGS_URL = 'https://www.inf.ed.ac.uk/teaching/courses/fds/data/project-2022-2023/airbnb/listings.csv.gz'
dirty_listings_df = pd.read_csv(LISTINGS_URL)
dirty_listings_df.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,15420,https://www.airbnb.com/rooms/15420,20221216161317,2022-12-16,city scrape,Georgian Boutique Apt City Centre,"Stunning, spacious ground floor apartment minu...","The neighbourhood is in the historic New Town,...",https://a0.muscache.com/pictures/cf69631f-4194...,60423,...,4.98,4.98,4.91,,f,1,1,0,0,3.1
1,707097,https://www.airbnb.com/rooms/707097,20221216161317,2022-12-16,city scrape,Centre Royal Mile Apartment 3 bedrooms 2 bathr...,,"The location is the perfect for tourism , shor...",https://a0.muscache.com/pictures/6e2ded8d-f20b...,3092851,...,4.72,4.94,4.09,,t,8,6,2,0,0.55
2,728199,https://www.airbnb.com/rooms/728199,20221216161317,2022-12-16,city scrape,"Private room in central, spacious and comfy flat",Fantastic main door flat over two levels withi...,"Great location for access to the city centre, ...",https://a0.muscache.com/pictures/11315577/0091...,3776412,...,4.97,4.73,4.79,,f,1,0,1,0,2.86
3,732008,https://www.airbnb.com/rooms/732008,20221216161317,2022-12-16,city scrape,51 18 Caledonian Crescent,This beautiful third floor apartment is set in...,,https://a0.muscache.com/pictures/prohost-api/H...,3810279,...,4.81,4.71,4.7,,t,2,2,0,0,2.27
4,744710,https://www.airbnb.com/rooms/744710,20221216161317,2022-12-16,city scrape,Refurbished Flat in a Georgian Era Building in...,A stunning apartment in the heart of Edinburgh...,The apartment is in a Central Edinburgh neighb...,https://a0.muscache.com/pictures/monet/Select-...,3737047,...,4.96,4.96,4.86,,t,1,1,0,0,1.73


We start cleaning the data by deleting the columns that appear irrelevant, such as the URLs, ids, scrape data, etc. We also delete the neighbourhood column, because it is the same for all properties - Edinburgh. Because our research question is based on the features of the property, we will not take into account the information about the host, even though it might be relevant. Nor will we consider information about the property that is not a feature, such as maximum nights, availability, etc. Nevertheless, we will keep the ratings, as they are indicative of cleanliness, location etc.

In [89]:
# Links, ids and scrape metadata carry no information about the property itself.
identifier_columns = [
    "id", "listing_url", "scrape_id", "last_scraped", "source", "picture_url",
    "host_id", "host_url", "host_thumbnail_url", "host_picture_url", "calendar_last_scraped",
]

# Neighbourhood is the same everywhere, so it can go.
constant_columns = ["neighbourhood"]

# Description and neighbourhood overview are free text.
free_text_columns = ["description", "neighborhood_overview"]

# Host information is out of scope. The host location could be relevant, but without data
# about the relative position of those locations it would not give accurate results.
host_columns = [
    "host_name", "host_about", "host_location", "host_since", "host_response_time",
    "host_acceptance_rate", "host_is_superhost", "host_neighbourhood", "host_response_rate",
    "host_listings_count", "host_total_listings_count", "host_verifications",
    "host_has_profile_pic", "host_identity_verified",
    "calculated_host_listings_count", "calculated_host_listings_count_entire_homes",
    "calculated_host_listings_count_private_rooms", "calculated_host_listings_count_shared_rooms",
]

# Booking rules, availability and review activity are not features of the property.
booking_columns = [
    "minimum_nights", "minimum_minimum_nights", "minimum_maximum_nights",
    "maximum_nights", "maximum_minimum_nights", "maximum_maximum_nights",
    "minimum_nights_avg_ntm", "maximum_nights_avg_ntm",
    "has_availability", "availability_365", "availability_30", "availability_60", "availability_90",
    "number_of_reviews", "number_of_reviews_l30d", "number_of_reviews_ltm",
    "first_review", "last_review", "instant_bookable", "reviews_per_month",
]

# The property name is not relevant to the analysis; the index still identifies each property.
name_columns = ["name"]

columns_to_drop = (
    identifier_columns
    + constant_columns
    + free_text_columns
    + host_columns
    + booking_columns
    + name_columns
)
listings_df = dirty_listings_df.drop(columns=columns_to_drop)

listings_df.head()

Unnamed: 0,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,...,price,calendar_updated,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license
0,"Old Town, Princes Street and Leith Street",,55.95759,-3.18805,Entire rental unit,Entire home/apt,2,,1 bath,1.0,...,$114.00,,4.97,4.98,4.96,4.97,4.98,4.98,4.91,
1,"Old Town, Princes Street and Leith Street",,55.9498,-3.18354,Entire condo,Entire home/apt,9,,2 baths,3.0,...,$598.00,,4.38,4.32,4.4,4.69,4.72,4.94,4.09,
2,Polwarth,,55.93692,-3.22414,Private room in rental unit,Private room,3,,1 private bath,1.0,...,$60.00,,4.82,4.86,4.81,4.95,4.97,4.73,4.79,
3,Dalry and Fountainbridge,,55.94192,-3.21878,Entire rental unit,Entire home/apt,10,,2 baths,4.0,...,$162.00,,4.74,4.81,4.8,4.84,4.81,4.71,4.7,
4,Deans Village,,55.95324,-3.20368,Entire rental unit,Entire home/apt,4,,1 bath,2.0,...,$138.00,,4.96,4.95,4.96,4.93,4.96,4.96,4.86,


There are a lot of missing values in the dataset. We can delete all columns with more than 20% of missing values.

In [90]:
# Keep only the columns in which at least 80% of the rows hold a value
MIN_FILLED_FRACTION = 0.8
min_non_missing = len(listings_df) * MIN_FILLED_FRACTION
listings_df = listings_df.dropna(axis="columns", thresh=min_non_missing)
# Count the missing values that remain in each column
listings_df.isnull().sum()

neighbourhood_cleansed           0
latitude                         0
longitude                        0
property_type                    0
room_type                        0
accommodates                     0
bathrooms_text                  13
bedrooms                       138
beds                           105
amenities                        0
price                            0
review_scores_rating           665
review_scores_accuracy         678
review_scores_cleanliness      678
review_scores_checkin          678
review_scores_communication    678
review_scores_location         678
review_scores_value            678
dtype: int64

We convert the columns associated with time to the corresponding datetime data type.

In [None]:
# Convert the date-like columns to datetime.
# These columns are among the ones removed by the column-dropping cell above, so
# indexing them unconditionally raises KeyError; only columns that are still
# present in the frame are converted.
date_columns = ["host_since", "first_review", "last_review"]
for date_column in date_columns:
    if date_column in listings_df.columns:
        listings_df[date_column] = pd.to_datetime(listings_df[date_column])
listings_df.info()

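The variables that we want to predict are the ones related to the score. Rows where these values are missing could be deleted, since there are not that many of them; however, that deletion is currently disabled (commented out) in the code below, and the missing scores are instead imputed in the next step.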

In [None]:
# Optional step, currently disabled: drop every row whose review scores are missing
# (the review scores are the variables we want to predict).
#listings_df = listings_df.dropna(subset=["review_scores_rating", "review_scores_accuracy", "review_scores_cleanliness", "review_scores_checkin", "review_scores_communication", "review_scores_location", "review_scores_value"])

# Renumber the rows from 0 and recount the missing values per column
listings_df = listings_df.reset_index(drop=True)
listings_df.isnull().sum()

Because there are still some values missing, we replace them with the mean in the case of dates and review scores, and with the most frequent value in the case of the other variables.  TODO: TO JUSTIFY

In [None]:
# Columns whose missing values are replaced by the column mean: dates and review scores
mean_fill_candidates = ["host_since", "first_review", "last_review", "review_scores_rating", "review_scores_accuracy", "review_scores_cleanliness", "review_scores_checkin", "review_scores_communication", "review_scores_location", "review_scores_value"]
# The date columns are removed by the column-dropping cell earlier in the notebook;
# selecting a label that no longer exists raises KeyError, so keep only the present ones.
columns_to_fill = [column for column in mean_fill_candidates if column in listings_df.columns]
column_means = listings_df[columns_to_fill].mean()
listings_df[columns_to_fill] = listings_df[columns_to_fill].fillna(column_means)

# Replace every remaining missing value with the most frequent value of its column
# (mode() can return several rows when there are ties; the first row is used)
listings_df = listings_df.fillna(listings_df.mode().iloc[0])

listings_df.isna().sum()

To be able to analyse the data, we have to convert some of the variables to the correct type. We also convert the string representation of a list to a string of comma-separated values.

In [None]:
# Convert the "t"/"f" flags to True and False
listings_df = listings_df.replace({"t": True, "f": False})

# Convert percentages such as "95%" to a fraction.
# The column is one of the host columns dropped earlier, so it is converted only when present.
if "host_acceptance_rate" in listings_df.columns:
    listings_df["host_acceptance_rate"] = (
        listings_df["host_acceptance_rate"].str.replace('%', '', regex=False).astype('float') / 100.0
    )

# Convert prices such as "$1,234.00" to float.
# regex=False is explicit because "$" is a regular-expression metacharacter (end-of-string
# anchor) and the default of the `regex` argument differs between pandas versions.
listings_df["price"] = (
    listings_df["price"]
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype('float')
)

# Convert number of beds and bedrooms to int (missing values were imputed beforehand)
listings_df["beds"] = listings_df["beds"].astype('int')
listings_df["bedrooms"] = listings_df["bedrooms"].astype('int')

# Convert the string representation of a list ("['a', 'b']") to comma-separated values ("a, b").
# Also a dropped host column, hence the guard; the brackets and quotes are removed with an
# explicit character class so the behaviour does not depend on the pandas version.
if "host_verifications" in listings_df.columns:
    listings_df["host_verifications"] = listings_df["host_verifications"].str.replace(r"[\[\]']", '', regex=True)


listings_df.head()

We look at all the possible values of `bathrooms_text` and see that it can be split into two columns, one indicating whether the bathroom is shared or not, and the other one indicating the number of bathrooms.

In [None]:
# List every distinct bathroom description together with how often it occurs
bathroom_descriptions = listings_df["bathrooms_text"]
bathroom_descriptions.value_counts()

In [None]:

# Create a column indicating whether a bathroom is shared or private.
# The match is case-insensitive so that descriptions starting with a capital "Shared"
# are flagged as well.
is_shared = listings_df["bathrooms_text"].str.contains("shared", case=False)
# Insert the flag right after the column it is derived from, rather than at a hard-coded
# position that silently depends on how many columns survived the earlier cleaning.
shared_position = listings_df.columns.get_loc("bathrooms_text") + 1
listings_df.insert(shared_position, "shared_bathroom", is_shared)

# Extract the number of bathrooms using a regular expression.
# The optional decimal part keeps values such as "1.5 baths" from being truncated to 1.
pattern = re.compile(r'\d+(?:\.\d+)?')


def _bathroom_count(text):
    """Return the first number found in `text` as a float; assume 1 bathroom if there is none."""
    match = pattern.search(text)
    return float(match.group()) if match else 1.0


listings_df["bathrooms_text"] = listings_df["bathrooms_text"].apply(_bathroom_count)
listings_df = listings_df.rename(columns={"bathrooms_text": "bathrooms"})

listings_df.head()

Initially, the idea was to separate the amenities column into dummy variables and include them in the analysis. Nevertheless, there are so many distinct values that grouping them with regular expressions seemed impossible in the available time. Therefore, the decision has been made to drop the column entirely and not include it in the analysis.

In [None]:
#Temporary exploration: how many distinct amenities are there?
# Strip the list brackets and double quotes with an explicit character class; a bare "["
# is a regular-expression metacharacter and the default of `regex` differs between
# pandas versions.
listings_df["amenities"] = listings_df["amenities"].str.replace(r'[\[\]"]', '', regex=True)
amenities_expanded = listings_df['amenities'].str.get_dummies(sep=', ')
amenity_names = amenities_expanded.columns.values.tolist()
print(len(amenity_names))
amenity_names


In [None]:
# Amenities are excluded from the analysis; list the columns that remain
listings_df = listings_df.drop(columns=["amenities"])

list(listings_df.columns)

With the rest of the columns, including the ones with comma separated values, we can create dummy variables for each of the values, because they are relatively clean and small in number.

In [None]:
# One dummy (0/1) column per distinct value of each categorical column
neighborhood_expanded = listings_df['neighbourhood_cleansed'].str.get_dummies()
property_type_expanded = listings_df['property_type'].str.get_dummies()
room_type_expanded = listings_df['room_type'].str.get_dummies()

dummy_frames = [neighborhood_expanded, property_type_expanded, room_type_expanded]
encoded_columns = ["neighbourhood_cleansed", "property_type", "room_type"]

# host_verifications holds comma-separated values, but it is one of the host columns
# dropped earlier in the notebook; referencing it unconditionally raises KeyError,
# so it is only encoded when it is still present.
if "host_verifications" in listings_df.columns:
    host_verifications_expanded = listings_df['host_verifications'].str.get_dummies(sep=', ')
    dummy_frames.insert(0, host_verifications_expanded)
    encoded_columns.insert(0, "host_verifications")

#Once separated, we drop the original columns
listings_df = listings_df.drop(columns=encoded_columns)

#Join the dummy variable columns with the original dataframe
listings_df = pd.concat([listings_df, *dummy_frames], axis=1)


listings_df.head()

There are two columns that have similar information: "property_type" and "room_type". In some of them, the information is repeated, for example in both columns the value is "Private room". We group them, maintaining the largest value so that it is true if any of the values is true.

In [None]:
#Merge columns with the same name keeping the largest value, so that a merged dummy is
#set if any of its duplicates is set.
# DataFrame.groupby(..., axis=1) is deprecated in recent pandas releases, so the columns
# are merged explicitly. Names are sorted to keep the column order groupby produced.
merged_columns = {
    column_name: listings_df.loc[:, listings_df.columns == column_name].max(axis=1)
    for column_name in sorted(listings_df.columns.unique())
}
listings_df = pd.DataFrame(merged_columns)
listings_df.info()


In [None]:

#listings_df.drop(["host_since", "first_review", "last_review"], axis=1, inplace=True)

### PCA Analysis
Once the data is all cleaned, we see that there are too many dimensions to the dataset, so we perform Principal Component Analysis.

The first step is to standardize the data. The library function did not work on date data type, so we create our own function.

In [None]:

def standardize(df):
    """Return a z-scored copy of `df`.

    Every column has its mean subtracted and is divided by its sample standard
    deviation (pandas' default `std`). The computation is done column by column
    with plain pandas arithmetic because, per the original author's note, the
    built-in scaler did not work with date columns.

    A column with at most one distinct value has zero (or undefined) spread, and
    the plain formula would turn it into NaN (0/0), which PCA cannot handle. Such
    columns are mapped to 0.0 instead; entries that were missing stay missing.
    """
    def _zscore(column):
        if column.nunique() <= 1:
            return pd.Series(0.0, index=column.index).where(column.notna())
        return (column - column.mean()) / column.std()

    return df.apply(_zscore)



# Z-score every column of the cleaned listings and preview the result
standardized = listings_df.pipe(standardize)
standardized.head()

We apply PCA to standardized data

In [None]:

# Fit a PCA that keeps every component of the standardized data
pca = PCA()
pca.fit(standardized.values)
print(pca.explained_variance_)  # Eigenvalues
print(pca.components_)  # Eigenvectors

We create a separate function that applies PCA and orders the PCs in the descending order.

In [None]:
def sorted_PCs(df):
    """Run PCA on the standardized `df` and return its components in descending order.

    Prints the percentage of variance explained by each principal component.

    Returns:
        (eigenvalues, eigenvectors): `eigenvalues` is the explained variance of each
        component, largest first; `eigenvectors` is the matching `pca.components_`
        array, i.e. row k holds the feature weights of component k.
    """
    scaled = standardize(df)
    pca = PCA().fit(scaled.values)  # keep every principal component
    eigenvalues = pca.explained_variance_
    eigenvectors = pca.components_

    # Stable descending order, so components with equal variance keep their relative order
    order = np.argsort(-eigenvalues, kind="stable")
    eigenvalues = eigenvalues[order]
    # components_ stores one component per ROW, so the rows are reordered;
    # indexing the columns would shuffle the features inside every component.
    eigenvectors = eigenvectors[order]

    total_variance = np.sum(eigenvalues)
    for i, eigenvalue in enumerate(eigenvalues):
        print("PC" + str(i+1) + " explains " + str((eigenvalue / total_variance)*100) + "% of the variance.")
    return eigenvalues, eigenvectors

sorted_PCs(listings_df)

Now we visualise the contribution of each of the PCs to the variance in order to later find the elbow.

In [None]:
# Scree plot: percentage of variance explained by each principal component
fig, ax = plt.subplots(figsize=(10,10))
eigenvalues, eigenvectors = sorted_PCs(listings_df)
percentExplained = eigenvalues / np.sum(eigenvalues)*100
# Number the components from 1 so the x-axis matches the "PC1, PC2, ..." naming
component_numbers = range(1, len(eigenvalues) + 1)
ax.plot(component_numbers, percentExplained)
ax.set_xlabel("Principal component")
ax.set_ylabel("Variance explained (%)")
ax.set_title("Variance explained by each principal component");

Using a function from "kneed" library, we find the knee in the plot. That will indicate the number of principal components that is optimal.

In [None]:

# Locate the knee of the decreasing, convex explained-variance curve
component_numbers = range(1, len(eigenvalues)+1)
kneedle = KneeLocator(component_numbers, percentExplained, curve="convex", direction="decreasing")
print(f"The optimal number of principal components is {kneedle.knee}.")
kneedle.plot_knee_normalized()


It is 24, so we visualise the weights of each of the features in the table:

In [None]:
# Weight of every feature (rows) in each of the first 24 principal components (columns)
selected_components = eigenvectors[:24]
pc_names = [f"PC{k}" for k in range(1, len(eigenvalues[:24]) + 1)]
feature_names = list(listings_df.columns)
pc_table = pd.DataFrame(selected_components.T, columns=pc_names, index=feature_names)
pc_table

In [None]:
from IPython.display import display, HTML

# Stylesheet intended to lay the outputs of this cell out in a row
css = """
.output {
    flex-direction: row;
}
"""

# Only the last expression of a cell is rendered automatically, so the stylesheet has to be
# displayed explicitly; a bare HTML(...) in the middle of the cell is silently discarded.
display(HTML('<style>{}</style>'.format(css)))

# For every principal component in the table, show the features with the
# largest absolute weight in that component.
TOP_N_FEATURES = 10
for pc_name in pc_table.columns:
    strongest_features = pc_table[pc_name].abs().nlargest(TOP_N_FEATURES).index
    display(pc_table.loc[strongest_features, pc_name])

Associated with rating

Associated with the features of the property

In [None]:
#Apply PCA to the data
pca = PCA(n_components=2)
pca.fit(standardized.values)
pca_data = pca.transform(standardized.values)
pca_df = pd.DataFrame(pca_data, columns=["PC1", "PC2"])
pca_df.head()


In [None]:
# Scatter the listings in the plane of the first two principal components
pc1_scores, pc2_scores = pca_data[:, 0], pca_data[:, 1]
ax = sns.scatterplot(x=pc2_scores, y=pc1_scores)
ax.set_xlabel('PC2')
ax.set_ylabel('PC1')

In [None]:
# Ordinary least squares regression of PC1 on PC2
# NOTE(review): principal components are uncorrelated by construction, so a slope close
# to zero is expected here - confirm this regression is the intended analysis.
model = smf.ols(formula='PC1 ~ PC2', data=pca_df)
results = model.fit()

In [None]:
# Overlay the fitted regression line on the PC2-vs-PC1 scatter
ax = sns.scatterplot(x='PC2', y='PC1', data=pca_df)
y_hat = results.predict(pca_df["PC2"])
sns.lineplot(x=pca_df['PC2'], y=y_hat, ax=ax)

In [None]:
# Full report of the fitted regression
ols_summary = results.summary()
ols_summary