In [571]:
import sys
!{sys.executable} -m pip install sklearn
!{sys.executable} -m pip install lime
# requiered packages for modelling
!{sys.executable} -m pip install xgboost

In [577]:
# DATA
import pandas as pd
import numpy as np
# DATA VISUALIZATION / PLOTTING
import matplotlib.pyplot as plt
import seaborn as sns
# STATISTICAL TESTS
import scipy
from scipy import stats
import statsmodels.formula.api as smf
import statsmodels.api as sm
# MACHINE LEARNING
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import model_selection, preprocessing, feature_selection, ensemble, linear_model, metrics, decomposition
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score
import xgboost as xgb
from xgboost import plot_importance
import time
# MODEL EXPLAINABILITY
from lime import lime_tabular

%matplotlib inline


### Importing data

In [2]:
# Reading data
# The listings file location can be overridden with the AIRBNB_LISTINGS_CSV
# environment variable so the notebook is not tied to one machine; the default
# is the path used originally.
import os  # local to this cell: only needed to resolve the data path

LISTINGS_CSV = os.environ.get('AIRBNB_LISTINGS_CSV',
                              '/Users/marina/Desktop/airbnb/data/listings 2.csv')
df_raw = pd.read_csv(LISTINGS_CSV)

# Show every column/row when a frame is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# To display the top 5 rows
df_raw.head(5)

In [3]:
df_raw.tail(5)  

In [4]:
df_raw.dtypes

In [5]:
# REMOVING IRRELEVANT COLUMNS
# URLs, free-text fields and host identity details that are dropped before
# any cleaning or modelling.
to_drop = [
    "listing_url", "scrape_id", "name", "description", "neighborhood_overview",
    "picture_url",
    "host_id", "host_url", "host_name", "host_location", "host_about",
    "host_thumbnail_url", "host_picture_url", "host_neighbourhood",
    "host_verifications",
]

df = df_raw.drop(columns=to_drop)
df.shape

#### Checking Duplicate Rows

In [6]:
# Rows that are exact copies of an earlier row.
duplicate_rows_df = df[df.duplicated()]
# Report the row count itself rather than the (rows, columns) shape tuple.
print('number of duplicate rows: ', len(duplicate_rows_df))

### Cleaning Data

In [7]:
print(df.isnull().sum())

Removing neighbourhood because it has too many missing values, while neighbourhood_cleansed has none. Other columns with missing data are removed as well.

In [8]:
#DROPPING MISSING COLUMNS
df = df.drop(['neighbourhood','neighbourhood_group_cleansed', 'bathrooms', 'calendar_updated', 'license'], axis=1)
df.shape

host_listings_count and host_total_listings_count are the same and have same NaN cases. So one of these columns can be dropped.

In [9]:
print(sum((df.host_listings_count == df.host_total_listings_count) == False))
df.loc[((df.host_listings_count == df.host_total_listings_count) == False)][:5]

last_scraped and calendar_last_scraped are equal, so one of these columns can be dropped.

In [12]:
#DUPPLICATED COLUMN 
print(sum((df.last_scraped == df.calendar_last_scraped) == False))

In [13]:
df = df.drop(['calendar_last_scraped', 'host_total_listings_count'], axis=1)
df.shape

In [14]:
# Replacing columns with f/t with 0/1
df.replace({'f': 0, 't': 1}, inplace=True)
df.hist(figsize=(20,20));

# Descriptive table
df.describe()

### Cleaning individual columns

#### Bathrooms
In earlier versions of the site, Airbnb provided the number of bathrooms directly; now it provides bathrooms_text (a textual description of the number of bathrooms).
The field bathrooms_text is parsed to get the number of bathrooms, and a new field shared_bathroom is created (1 if the bathroom is shared).

In [460]:
# Removing Null values in bathroom_text
# .copy() gives abnb its own data, so the column assignments made in the
# following cells modify abnb itself instead of a slice of df (this avoids
# pandas' SettingWithCopyWarning and its ambiguous write semantics).
abnb = df.dropna(subset=['bathrooms_text']).copy()
abnb['bathrooms_text'].isnull().sum()

In [461]:
word = abnb['bathrooms_text'].map(lambda x: x.split(' ')[0])
bathrooms = pd.to_numeric(word, errors='coerce')
bathrooms = bathrooms.fillna(0.5)

In [462]:
# Numeric bathroom count parsed in the previous cell.
abnb['bathrooms'] = bathrooms
# 1 when the (lower-cased) description mentions a shared bathroom, else 0.
# Computed directly from the boolean match, so no temporary column or
# NaN-then-fill round trip is needed.
abnb['shared_bathroom'] = (
    abnb['bathrooms_text'].str.lower().str.contains('shared').astype('int64')
)
abnb.head()

In [None]:
# abnb['bathrooms'].value_counts(dropna=False)
# df = df.drop(['ar_condicionado', 'stove', 'tv'], axis=1)
# df.head(5)
abnb.shape

In [463]:
#pd.crosstab(abnb.bathrooms, abnb.shared_bathroom)
abnb.hist(column= ['bathrooms', 'shared_bathroom'], figsize=(15,5), bins=5);

##### Price

In [464]:
# Convert price text such as "$1,234.00" to a whole number.
# The currency symbol and thousands separators are removed by pattern rather
# than by fixed character positions, and the value goes through str first, so
# the cell can be re-run on an already converted column without failing.
price_text = abnb['price'].astype(str).str.replace(r'[$,]', '', regex=True)
# astype('int64') truncates any fractional part, matching the original intent
# of discarding the cents.
abnb['price'] = pd.to_numeric(price_text).astype('int64')

#### Amenities

In [None]:
# abnb.amenities[:1].values

In [398]:
# List of all amenities
# Builds the set of distinct amenity names across all listings so the
# keyword patterns used in the flag cell below can be chosen by inspection.
amenities_lst = list(abnb.amenities.str.lower())
# Concatenate every listing's amenities text into one string ...
amenities_lst_str = " ".join(amenities_lst)
# ... strip the list syntax: '[' is removed, ']' becomes a separator so the
# last item of one listing does not fuse with the first item of the next,
# and double quotes are removed.
amenities_lst_str = amenities_lst_str.replace('[', '')
amenities_lst_str = amenities_lst_str.replace(']', ',')
amenities_lst_str = amenities_lst_str.replace('"', '')

# Split on commas and trim whitespace; the set removes repeats.
# NOTE(review): an amenity whose own text contains a comma would be split
# into several entries here.
amenities_set = [x.strip() for x in amenities_lst_str.split(',')]
amenities_set = set(amenities_set)
amenities_set

In [23]:
abnb.shape

In [465]:
# Derived amenity flags: flag column -> regex searched in the lower-cased
# amenities text. A row gets 1 when the pattern matches and stays NaN
# otherwise (the NaNs are filled in a later cell).
# All patterns must be lower case because they are matched against
# lower-cased text; 'suitable for events' was previously capitalised and
# therefore could never match.
# NOTE(review): several patterns are plain substrings and may over-match
# (e.g. 'tv' also matches 'cable tv', 'lux' matches 'luxury', 'iron' matches
# 'environment') - confirm this is intended.
AMENITY_PATTERNS = {
    'air_conditioning': 'ar condicionado|air conditioning|heating',
    'tv': 'tv',
    'streaming': 'netflix|prime video|apple tv|chromecast|alexa|game console',
    'cable_tv': 'premium cable|cable tv|hbo max',
    'stove_oven': 'stove|oven|microwave',
    'sound_system': 'sound system',
    'bathtub': 'bathtub|hot tub',
    'refrigerator': 'refrigerator|consul duo|freezer|fridge|refrigerador',
    'building_staff': 'building staff',
    'child_friendly': 'baby|0-2 years|crib|children',
    'closet': 'closet',
    'wardrobe': 'wardrobe',
    'bbq': 'bbq',
    'beachfront': 'beachfront|waterfront',
    'bikes': 'bikes',
    'breakfast': 'breakfast',
    'beach_essentials': 'beach essentials',
    'cleaning_before_checkout': 'cleaning before checkout',
    'coffee': 'coffee|nespresso',
    'workspace': 'workspace|desk|monitor',
    'bath_essentials': 'bathroom essentials|soap|conditioner|sabonete|shampoo|dove|francis|johnson|lux|pantene|shower gel',
    'kitchen': 'kitchen|dishwasher|full kitchen',
    'laundry': 'dryer|washer|iron',
    'elevator': 'elevator',
    'bed_comforts': 'bed linens|bedroom comforts|extra pillows',
    'ev_charger': 'ev charger',
    'fire_equip': 'fire|smoke alarm',
    'parking': 'parking|garage',
    'garden': 'garden|backyard|patio',
    'gym': 'gym',
    'wifi': 'wifi',
    'pool': 'pool|sauna',
    'self_check_in': 'self check-in',
    'event_suitable': 'suitable for events',
    'security': 'safe|guards|smart lock',
}

# Lower-case the amenities text once instead of once per flag.
amenities_lower = abnb['amenities'].str.lower()
for flag_col, pattern in AMENITY_PATTERNS.items():
    abnb.loc[amenities_lower.str.contains(pattern), flag_col] = 1

abnb.shape # created 35 new columns

In [466]:
#Columns created starts at column 54
abnb.iloc[:5, 54:] 

In [467]:
# The amenity flag columns are the ones appended after 'shared_bathroom'
# (the last column created before the amenity flags). Locating them by name
# avoids hard-coded positions: the previous code filled NaNs from position 44
# but scanned from position 54, so the fill also overwrote missing values in
# unrelated columns that sit before the amenity flags.
first_amenity_pos = abnb.columns.get_loc('shared_bathroom') + 1
amenity_cols = list(abnb.columns[first_amenity_pos:])

# Replacing nulls with zeros for the amenity flag columns only
abnb[amenity_cols] = abnb[amenity_cols].fillna(0)

# Produces a list of amenity features that are present in fewer than 10% of
# listings (only the "has amenity" side is checked; flags present in more
# than 90% of listings are kept).
min_listings = len(abnb) / 10
infrequent_amenities = [col for col in amenity_cols if abnb[col].sum() < min_listings]
print(infrequent_amenities)

# Dropping infrequent amenity features
abnb = abnb.drop(columns=infrequent_amenities)

In [468]:
# Dropping the original amenity feature
abnb.drop('amenities', axis=1, inplace=True)

In [469]:
abnb.iloc[:5, 53:]

#### host_has_profile_pic

In [404]:
# Bar chart of how many hosts do / do not have a profile picture.
abnb['host_has_profile_pic'].value_counts().plot(kind='bar')
plt.title('Host has profile pic')
plt.xlabel('Profile pic')
plt.ylabel('Count')
# despine must be called to take effect; the bare name only displayed the
# function object and left the top/right spines in place.
sns.despine();

Removing host_has_profile_pic because it contains only one category.

In [405]:
abnb.drop('host_has_profile_pic', axis=1, inplace=True)
abnb.shape

#### Host_since
Calculating how long each host has been a host (difference between host_since and the last_scraped date).

In [471]:
# Converting to datetime
abnb['last_scraped'] = pd.to_datetime(abnb['last_scraped'])
abnb['host_since'] = pd.to_datetime(abnb['host_since'])

# Calculating how many days each host has been a host.
# `.dt.days` extracts whole days from the timedelta; casting with
# astype('timedelta64[D]') is not supported by pandas 2.x.
host_age_days = (abnb['last_scraped'] - abnb['host_since']).dt.days.astype('float64')

# Replacing the null values (23 in the original run) with the median.
# The result is assigned back explicitly: fillna(inplace=True) on an attribute
# access may operate on a temporary and leave the frame unchanged.
abnb['host_age_days'] = host_age_days.fillna(host_age_days.median())

In [472]:
abnb.host_age_days.hist(figsize=(10,5), bins=20);
print("Mean host_age_days:", round(abnb['host_age_days'].mean(),0))
print("Median host_age_days:", abnb['host_age_days'].median())

#### Host_response_time

In [412]:
# Share of each response-time category before filling, NaN included.
# Printed explicitly: only a cell's last expression is displayed automatically.
print(abnb['host_response_time'].value_counts(dropna=False, normalize=True))
# Treat missing response time as its own category. Assigned back explicitly
# because fillna(inplace=True) on an attribute access may not update the frame.
abnb['host_response_time'] = abnb['host_response_time'].fillna("missing")
abnb['host_response_time'].value_counts(normalize=True)

In [413]:
abnb.host_response_time.hist(figsize=(10,5), bins=9);

#### host_response_rate

In [513]:
# Removing '%' from host_response_rate and converting to a number.
# The conversion was commented out, which left the column as text for the
# histogram, describe() and the model. Going through str + to_numeric makes
# the cell safe to re-run on an already converted column.
abnb['host_response_rate'] = pd.to_numeric(
    abnb['host_response_rate'].astype(str).str.rstrip('%'), errors='coerce')
abnb['host_response_rate'].hist(figsize=(10,5), bins=9);
round(abnb['host_response_rate'].describe())

#### host_acceptance_rate

In [476]:
# Removing '%' from host_acceptance_rate and converting to a number.
# rstrip('%') (instead of dropping the last character) together with
# to_numeric makes the cell safe to re-run on an already converted column.
abnb['host_acceptance_rate'] = pd.to_numeric(
    abnb['host_acceptance_rate'].astype(str).str.rstrip('%'), errors='coerce')
round(abnb['host_acceptance_rate'].describe())

#### host_is_superhost

In [477]:
# Number of listings whose host_is_superhost value is missing
superhost_missing = abnb['host_is_superhost'].isna()
int(superhost_missing.sum())

In [155]:
abnb[abnb['host_is_superhost'].isna()]

In [478]:
#Deleting NaN rows
abnb.dropna(subset=['host_is_superhost'], inplace=True)
abnb.shape

#### host_listings_count

In [325]:
# Distribution of host_listings_count, restricted to 0-100 for readability.
abnb.host_listings_count.hist(figsize=(10,5), bins=10, range=(0,100));

# Central tendency and upper-tail quantiles (the 0.99 quantile was
# previously printed twice).
print("mean: ", round(abnb['host_listings_count'].mean(),0))
print("median: ",round(abnb.host_listings_count.quantile(q=0.5)))
for q in (0.97, 0.98, 0.99):
    print(f"quantile {q}: ", round(abnb.host_listings_count.quantile(q=q)))

#### host_identity_verified

In [158]:
abnb.host_identity_verified.hist(figsize=(10,5), bins=5);

#### neighbourhood_cleansed

In [421]:
abnb['neighbourhood_cleansed'].value_counts(dropna=False)

#### property_type

In [422]:
abnb['property_type'].value_counts(dropna=False)

In [479]:
# Collapse property_type into three groups: house, apartment, other.
# Rules are applied in order, so a listing matching both patterns ends up
# with the later label ('apartment').
property_lower = abnb['property_type'].str.lower()
property_groups = (
    ('house', 'house|bungalow|lodge|cottage|villa|chalet|cabin|condominium|farm'),
    ('apartment', 'apartment|loft'),
)
for group_label, group_pattern in property_groups:
    abnb.loc[property_lower.str.contains(group_pattern), 'property_type_new'] = group_label

# Anything that matched neither pattern
abnb.loc[:,'property_type_new'] = abnb.property_type_new.fillna('other')

abnb['property_type_new'].value_counts(dropna=False)


In [549]:
# Preview of the frame without property_type. The result is intentionally not
# assigned: property_type stays in abnb and is dropped by name later, when
# model_df is built. Only the first rows are shown; with display.max_rows
# set to None the bare drop() rendered every row of the frame.
abnb.drop(columns=['property_type']).head()

#### room_type

In [480]:
abnb['room_type'].value_counts(dropna=False)

#### accommodates

In [163]:
#abnb['accommodates'].value_counts(dropna=False)
abnb.hist(column= ['accommodates'], figsize=(10,5), bins=10);

#### bedrooms and beds

In [426]:
abnb.hist(column= ['bedrooms'], figsize=(10,5), bins=10, range=(0,20));
print(round(abnb['bedrooms'].describe()))
#print(abnb['bedrooms'].value_counts(dropna=False) # NaN=1735)

In [429]:
abnb.hist(column= ['beds'], figsize=(10,5), bins=10, range=(0,30));
print("Missings: ", abnb['beds'].isnull().sum())
print(round(abnb['beds'].describe()))
#abnb['beds'].value_counts(dropna=False) # NaN=224

#### minimum and maximum nights columns 

In [166]:
abnb2 = abnb.filter(like='nights')
# Heatmap
sns.heatmap(abnb2.corr(), vmin=-1., vmax=1., annot=True, fmt='.2f', cmap="magma", cbar=True, linewidths=0.5)

#plt.savefig('heatmap.png')
plt.show()

Because these features are highly correlated, and minimum_nights and maximum_nights are the easiest to explain, only those two will be retained.

In [481]:
abnb.drop(['minimum_minimum_nights',
           'maximum_minimum_nights', 
           'minimum_maximum_nights',
           'maximum_maximum_nights',
           'minimum_nights_avg_ntm',
           'maximum_nights_avg_ntm'], axis=1, inplace=True)

#### has_availability
Approximately 97% of all cases fall in a single category, so this feature is removed.

In [439]:
abnb.hist(column= ['has_availability'], figsize=(10,5), bins=5);
print(abnb['has_availability'].value_counts(normalize=True, dropna=False))

In [482]:
abnb.drop(['has_availability'], axis=1, inplace=True)

#### availability

In [169]:
abnb.hist(column= ['availability_30', 
                   'availability_60', 
                   'availability_90', 
                   'availability_365'], figsize=(10,5), bins=10);

In [170]:
abnb2 = abnb.filter(like='availability_')
# Heatmap
sns.heatmap(abnb2.corr(), vmin=-1., vmax=1., annot=True, fmt='.2f', cmap='magma', cbar=True, linewidths=0.5)

#plt.savefig('heatmap.png')
plt.show()

Availability measures are highly correlated. Only availability_365 will be retained.

In [483]:
abnb.drop(['availability_30', 
           'availability_60',
           'availability_90'], axis=1, inplace=True)

#### number_of_reviews

In [172]:
abnb.hist(column= ['number_of_reviews', 
                   'number_of_reviews_ltm',
                   'number_of_reviews_l30d'], figsize=(10,5), bins=10);

#### first_review

In [484]:
abnb['first_review'] = pd.to_datetime(abnb['first_review'])

# Days between the scrape date and the first review (NaN when there is no
# review). `.dt.days` replaces astype('timedelta64[D]'), which pandas 2.x
# does not support.
abnb['days_first_review'] = (abnb['last_scraped'] - abnb['first_review']).dt.days
abnb['days_first_review'].hist(figsize=(15,5), bins=30);

In [485]:
# Upper edge of the last bin. Series.max() skips NaN; the builtin max() only
# works if the first value happens not to be NaN, because every comparison
# with NaN is False.
oldest_first_review = abnb['days_first_review'].max()

# Bin the day counts into age bands; listings without a review become the
# string 'nan' after astype('str') and are relabelled below.
abnb['days_first_review'] = pd.cut(abnb['days_first_review'],
                                   bins=[0, 360, 720, 1440, oldest_first_review],
                                   include_lowest=True,
                                   labels=['0-12 months',
                                           '12-24 months',
                                           '24-48 months',
                                           '48+ months']).astype('str')

replace_values = {'nan': 'no reviews'}

abnb = abnb.replace({'days_first_review': replace_values})
abnb.days_first_review.value_counts(normalize=True)

#### last_review

In [487]:
abnb['last_review'] = pd.to_datetime(abnb['last_review'])

# Days between the scrape date and the most recent review (NaN when there is
# no review). `.dt.days` replaces astype('timedelta64[D]'), which pandas 2.x
# does not support.
abnb['days_last_review'] = (abnb['last_scraped'] - abnb['last_review']).dt.days
abnb['days_last_review'].hist(figsize=(15,5), bins=30);

In [488]:
# Upper edge of the last bin. Series.max() skips NaN; the builtin max() only
# works if the first value happens not to be NaN, because every comparison
# with NaN is False.
oldest_last_review = abnb['days_last_review'].max()

# Bin the day counts into recency bands; listings without a review become the
# string 'nan' after astype('str') and are relabelled below.
abnb['days_last_review'] = pd.cut(abnb['days_last_review'],
                                  bins=[0, 90, 360, 720, oldest_last_review],
                                  include_lowest=True,
                                  labels=['0-3 months',
                                          '3-12 months',
                                          '12-24 months',
                                          '24+ months']).astype('str')

replace_values = {'nan': 'no reviews'}

abnb = abnb.replace({'days_last_review': replace_values})
abnb.days_last_review.value_counts(normalize=True)

#### reviews_per_month

In [452]:
abnb.hist(column= ['reviews_per_month'], figsize=(10,5), bins=10, range=(0,10));

### review_scores

In [180]:
abnb.hist(column= ['review_scores_rating', 
                   'review_scores_accuracy',
                   'review_scores_cleanliness', 
                   'review_scores_checkin', 
                   'review_scores_communication', 
                   'review_scores_location', 
                   'review_scores_value'], figsize=(15,10), bins=10);

In [612]:
#review_scores = list(abnb.columns[abnb.columns.str.startswith('review_scores') == True])
#review_scores.pop(0)
#print(review_scores)

#abnb.review_scores_rating = abnb.review_scores_rating.astype('float64')
print(abnb.review_scores_rating.value_counts(dropna=False, normalize=True))
abnb.review_scores_rating.hist(figsize=(10,5), bins=10);

In [489]:
# Sub-score columns (accuracy, cleanliness, ...) that share the same bins.
# review_scores_rating is excluded by name because it is binned separately
# with its own cut points; the previous pop(0) relied on it being the first
# matching column.
review_scores = [col for col in abnb.columns
                 if col.startswith('review_scores') and col != 'review_scores_rating']

def bin_column(col, bins, labels, nan_label='missing'):
    """Replace column `col` of the global frame `abnb` with a binned, string-valued version.

    Parameters
    ----------
    col : str
        Name of the column in `abnb` to bin in place.
    bins : sequence of numbers
        Cut points passed to `pd.cut` (the lowest edge is included).
    labels : sequence of str
        One label per bin, passed to `pd.cut`.
    nan_label : str, default 'missing'
        Label given to values that fall in no bin, including missing values.

    Returns
    -------
    None
        `abnb[col]` is overwritten.
    """
    binned = pd.cut(abnb[col], bins=bins, labels=labels, include_lowest=True)
    # Label the missing values BEFORE converting to str: once converted, NaN
    # is the literal text 'nan' and fillna no longer finds anything to fill.
    labelled = binned.astype(object).where(binned.notna(), nan_label)
    abnb[col] = labelled.astype('str')
    
# Sub-scores: intervals are right-closed, so [0, 8] -> '0-8', (8, 9] -> '9',
# (9, 10] -> '10'.
for col in review_scores:
    bin_column(col,
               bins=[0, 8, 9, 10],
               labels=['0-8', '9', '10'],
               nan_label='no reviews')

# Overall rating: because intervals are right-closed, the cut points must be
# 89 and 94 for the labels to be true. With edges at 90 and 95, a rating of 90
# was labelled '0-89' and a rating of 95 was labelled '90-94'.
bin_column('review_scores_rating',
           bins=[0, 89, 94, 100],
           labels=['0-89', '90-94', '95-100'],
           nan_label='no reviews')



#### instant_bookable

In [189]:
abnb.instant_bookable.value_counts(dropna=False, normalize=True)

In [364]:
sns.catplot(x="instant_bookable", y="price",
            kind="bar", data=abnb)

#### calculated host_listings_count

In [190]:
abnb.hist(column= ['calculated_host_listings_count', 
                   'calculated_host_listings_count_entire_homes',
                   'calculated_host_listings_count_private_rooms', 
                   'calculated_host_listings_count_shared_rooms'], figsize=(10,5), bins=10);

In [326]:
abnb2 = abnb.filter(like='host_listings_count')
# Heatmap
sns.heatmap(abnb2.corr(), vmin=-1., vmax=1., annot=True, fmt='.2f', cmap='magma', cbar=True, linewidths=0.5)

#plt.savefig('heatmap.png')
plt.show()

In [490]:
# Removing columns because of high correlation in host_listings columns
abnb.drop(['calculated_host_listings_count_entire_homes', 
           'calculated_host_listings_count_private_rooms'], axis=1, inplace=True)

## Exploratory Data Analysis


Price has a large range of values (high variance, $\sigma^2$) and many outliers, as the boxplot shows.

In [245]:
round(abnb['price'].describe())

In [201]:
sns.boxplot(x=abnb['price'])

In [280]:
# Distribution plot
plt.figure(figsize=(20,4))
abnb.price.hist(bins=100, range=(0,5000))
plt.margins(x=0)
plt.axvline(100, color='red', linestyle='--')
plt.axvline(300, color='red', linestyle='--')
plt.axvline(550, color='red', linestyle='--')
plt.title("Price Distribution", fontsize=16)
plt.xlabel("Price (R$)", fontsize = 14)
plt.ylabel("Number of listings", fontsize = 14)
plt.show()

In [276]:
plt.figure(figsize=(20,4))
abnb.price.hist(bins=100, range=(20,100))
plt.margins(x=0)
plt.title("Price Distribution", fontsize=16)
plt.xlabel("Price (R$)", fontsize = 14)
plt.ylabel("Number of listings", fontsize = 14)
plt.show()

In [294]:
print("quantile 0.97: ",round(abnb.price.quantile(q=0.97)))
print("quantile 0.98: ",round(abnb.price.quantile(q=0.98)))
print("quantile 0.99: ",round(abnb.price.quantile(q=0.99)))

In [295]:
plt.figure(figsize=(20,4))
abnb.price.hist(bins=100, range=(1000,20000))
plt.margins(x=0)
plt.axvline(round(abnb.price.quantile(q=0.97)), color='red', linestyle='--')
plt.axvline(round(abnb.price.quantile(q=0.98)), color='red', linestyle='--')
plt.axvline(round(abnb.price.quantile(q=0.99)), color='red', linestyle='--')
plt.title("Price Distribution", fontsize=16)
plt.xlabel("Price (R$)", fontsize = 14)
plt.ylabel("Number of listings", fontsize = 14)
plt.show()

In [491]:
# Cap prices: anything at or above the 0.99 quantile value is set to it
PRICE_CAP = 5680
abnb['price'] = abnb['price'].clip(upper=PRICE_CAP)

In [302]:
sns.relplot(x="number_of_reviews", y="price", data=abnb);

#### Latitude and Longitude

In [296]:
sns.jointplot(data=abnb, x="latitude", y="longitude")

## Amenities features

Amenities that add value to properties:
- Air conditioning
- TV
- Coffee machines
- Fire equipment
- Gym
- Pool

Amenities that devalue properties:
- Stove and microwave oven
- Coffee machine
- Refrigerator
- Bed comforts

In [361]:
def _category_label(value):
    """Return a short tick label for a category value (1.0 -> '1', 'a' -> 'a')."""
    try:
        return str(int(value)) if float(value) == int(value) else str(value)
    except (TypeError, ValueError, OverflowError):
        return str(value)


def binary_count_and_price_plot(col, figsize=(8,3)):
    """Plot category counts next to the median price per category for `col`.

    Reads the global frame `abnb`. The left chart shows how many listings fall
    in each category of `col`; the right chart shows the median `price` of
    each category.

    Parameters
    ----------
    col : str
        Column of `abnb` to group by (intended for 0/1 flag columns).
    figsize : tuple, default (8, 3)
        Size of the two-panel figure.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)
    fig.suptitle(col, fontsize=16, y=1)
    plt.subplots_adjust(top=0.80) # So that the suptitle does not overlap with the ax plot titles

    grouped = abnb.groupby(col)
    counts = grouped.size()
    median_price = grouped.price.median()
    # Both panels are labelled with the actual category values. The price
    # panel used to read 'lower'/'upper', which did not say which bar was
    # which, and two hard-coded labels are wrong for any other bar count.
    tick_labels = [_category_label(value) for value in counts.index]

    counts.plot(kind='bar', ax=ax1, color=['crimson', 'g'])
    ax1.set_xticklabels(labels=tick_labels, rotation=0)
    ax1.set_title('Category count')
    ax1.set_xlabel('')

    median_price.plot(kind='bar', ax=ax2, color=['crimson', 'g'])
    ax2.set_xticklabels(labels=tick_labels, rotation=0)
    ax2.set_title('Median price')
    ax2.set_xlabel('')

    plt.show()
    
# Draw the count / median-price charts for every column from position 43 on.
# NOTE(review): 43 is a hard-coded column position; presumably it marked the
# first amenity flag column when this cell was run - confirm against the
# current column order before relying on it.
for col in abnb.iloc[:,43:].columns:
    binary_count_and_price_plot(col, figsize=(6,2))

In [493]:
# Absolute pairwise correlations between the numeric/boolean columns.
# The numeric columns are selected explicitly: abnb also holds text and
# datetime columns, and recent pandas versions raise on those instead of
# silently skipping them.
numeric_abnb = abnb.select_dtypes(include=['number', 'bool'])
corr_matrix = numeric_abnb.corr().abs()
f, ax = plt.subplots(figsize=(20, 9))
cmap = sns.color_palette("coolwarm", as_cmap=True)
# vmax is the largest off-diagonal value, so the self-correlations of 1.0 do
# not flatten the colour scale.
sns.heatmap(corr_matrix, cmap=cmap, center=0, linewidths=.5, vmin=0.05,
            vmax=corr_matrix[corr_matrix != 1.0].max().max(), cbar_kws={"shrink": .5});

In [494]:
corr_matrix['price'].sort_values(ascending=False)

In [496]:
# Most correlated with price features, just the first amenities features selected 
# because of high correlation between then
abnb2 = abnb[['bathrooms',
              'bedrooms',
              'accommodates',
              'beds',
              'pool',
              'stove_oven',
              'refrigerator',
              'shared_bathroom',
              'latitude',
              'number_of_reviews_ltm',
              'bed_comforts',
              'reviews_per_month',
              'number_of_reviews',
              'coffee',
              'number_of_reviews_l30d',
              'parking',
              'longitude',
              'availability_365',
              'host_identity_verified',
              'price']]

corr_matrix = abnb2.corr()
corr_matrix = abs(corr_matrix)
f, ax = plt.subplots(figsize=(20, 9))
cmap = sns.color_palette("coolwarm", as_cmap=True)
sns.heatmap(corr_matrix, cmap=cmap, center=0, linewidths=.5, vmin=0.05, 
            vmax=corr_matrix[corr_matrix != 1.0].max().max(), cbar_kws={"shrink": .5}, annot=True);

High correlation between bathrooms, bedrooms, accommodates and beds. Removing:
- bedrooms: has more high correlations with other features.
- beds: has less correlation with price than the other features.
- bathrooms: has higher correlations with other features than accommodates.

High correlation between the amenity features stove_oven, refrigerator, coffee and bed_comforts. Removing:
- stove_oven: slightly more correlated with other features.
- refrigerator: slightly more correlated with other features.
- coffee: slightly more correlated with other features.

High correlation between the features number_of_reviews_ltm, reviews_per_month, number_of_reviews and number_of_reviews_l30d. Removing:
- number_of_reviews_ltm: slightly more correlated with other features.
- number_of_reviews_l30d: slightly more correlated with other features.
- number_of_reviews: slightly more correlated with other features.

Medium correlation between the amenity features pool and parking.

### Normalizing and Standardizing data


In [497]:
abnb.set_index('id', inplace=True) 

In [604]:
#Removing unselected features
model_df = abnb.drop(['bedrooms', 
                      'beds',
                      'bathrooms', 
                      'stove_oven',
                      'refrigerator',
                      'coffee',
                      'number_of_reviews_ltm',
                      'number_of_reviews_l30d',
                      'number_of_reviews',
                      'air_conditioning',
                      'gym',
                      'tv',
                      'cable_tv',
                      'wifi',
                      'garden',
                      'beachfront',
                      'workspace',
                      'elevator',
                      'kitchen',
                      'fire_equip',
                      'laundry',
                      'bath_essentials',
                      'neighbourhood_cleansed', 
                      'bathrooms_text', 
                      'property_type', 
                      'last_scraped', 'host_since', 'first_review', 'last_review', 
                      'calculated_host_listings_count',
                      'calculated_host_listings_count_shared_rooms'], axis=1)


Applying one-hot encoding: each categorical column with n unique values is transformed into n dummy columns (`pd.get_dummies` is called without `drop_first`, so no category is dropped).

In [605]:
model_df = pd.get_dummies(model_df)

In [563]:
corr_matrix = model_df.corr()
corr_matrix = abs(corr_matrix)
f, ax = plt.subplots(figsize=(20, 9))
cmap = sns.color_palette("coolwarm", as_cmap=True)
sns.heatmap(corr_matrix, cmap=cmap, center=0, linewidths=.5, vmin=0.05, 
            vmax=corr_matrix[corr_matrix != 1.0].max().max(), cbar_kws={"shrink": .5});


In [607]:
num_cols = ['accommodates',
            'availability_365',
            'host_age_days',
            'host_listings_count',
            'maximum_nights',
            'minimum_nights',
            'reviews_per_month',
            'price']

# Log transforming columns
num_cols = [i for i in num_cols if i not in ['availability_365', 'host_age_days']] # Removing items not to be transformed

# The transform is applied exactly once per column. The loop body was
# duplicated, which produced log(log(x)): that is NaN for every value whose
# first log is negative (x < 1, including the 0.01 placeholder) and -4.6 for
# x == 1.
for col in num_cols:
    model_df[col] = model_df[col].astype('float64').replace(0.0, 0.01) # Replacing 0s with 0.01
    model_df[col] = np.log(model_df[col])
    

In [599]:
model_df[num_cols].hist(figsize=(10,11));

In [608]:
# Features and target (price was log-transformed in the cell above).
X = model_df.drop('price', axis=1)
y = model_df.price

# Scaling
# The scaled frame keeps X's original index so that X and y stay aligned by
# label; without index= the result gets a fresh 0..n-1 index while y keeps
# the listing ids.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into the scaling - consider fitting on the
# training rows only.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=list(X.columns), index=X.index)

## Modelling

I wish I had more time to test more models, but as it was very close to the deadline, I was able to test only one model:

- Gradient Boosting method, with the XGBRegressor from the XGBoost library

Gradient-boosted decision trees are a machine learning technique that builds a predictive model as a sequence of small decision trees. Each new tree is fitted to the errors of the ensemble built so far, more precisely to the negative gradient of the loss function (the measure of difference between the predicted and actual target values), and its scaled-down prediction is added to the ensemble. The gradient gives the direction of the incremental adjustment made in each step; boosting is the process of combining many weak learners into a single, more accurate model.

The evaluation metrics are mean squared error (for loss) and R-squared (for goodness of fit).

In [616]:
# Splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [617]:
# Fit an XGBoost regressor with default settings, timing fit + prediction,
# then report MSE and r2 on the training and held-out sets.
xgb_reg_start = time.time()

xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(X_train, y_train)
training_preds_xgb_reg, val_preds_xgb_reg = (xgb_reg.predict(part) for part in (X_train, X_test))

xgb_reg_end = time.time()

elapsed_minutes = round((xgb_reg_end - xgb_reg_start)/60,1)
print(f"Time taken to run: {elapsed_minutes} minutes")
for metric_name, metric_fn in (("MSE", mean_squared_error), ("r2", r2_score)):
    print(f"\nTraining {metric_name}:", round(metric_fn(y_train, training_preds_xgb_reg),4))
    print(f"Validation {metric_name}:", round(metric_fn(y_test, val_preds_xgb_reg),4))

In [None]:
# Feature importances of the fitted model, smallest first so the largest
# bar ends up at the top of the horizontal chart.
ft_weights_xgb_reg = pd.DataFrame(
    {'weight': xgb_reg.feature_importances_}, index=X_train.columns
).sort_values('weight')

fig, ax = plt.subplots(figsize=(8,20))
ax.barh(ft_weights_xgb_reg.index, ft_weights_xgb_reg.weight, align='center')
ax.set_title("Feature importances in the XGBoost model", fontsize=14)
ax.set_xlabel("Feature importance")
ax.margins(y=0.01)
plt.show()