In [1]:
import pandas as pd

In [2]:
pd.options.display.max_columns = None

In [3]:
listings_df = pd.read_parquet('../data/raw/airbnb_listings_v1.0.parquet')
print(listings_df.shape)
listings_df.head()

(494954, 47)


Unnamed: 0,ID,Name,Host ID,Host Since,Host Response Time,Host Response Rate,Host Acceptance Rate,Host Listings Count,Host Total Listings Count,Neighbourhood,City,State,Zipcode,Country,Property Type,Room Type,Accommodates,Bathrooms,Bedrooms,Beds,Bed Type,Amenities,Square Feet,Price,Weekly Price,Monthly Price,Security Deposit,Cleaning Fee,Guests Included,Extra People,Minimum Nights,Maximum Nights,Availability 30,Availability 60,Availability 90,Availability 365,Number of Reviews,Review Scores Rating,Review Scores Accuracy,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Review Scores Value,Cancellation Policy,Reviews per Month,Features
0,4008728,"Luxurious 3 bedroom, centrum, 180m2",20786453,2014-09-01,within a day,50.0,,1.0,1.0,Oud-West,Amsterdam,North Holland,1054 AA,Netherlands,Apartment,Entire home/apt,6.0,1.0,3.0,3.0,Real Bed,"TV,Internet,Wireless Internet,Kitchen,Pets all...",,600.0,,6000.0,500.0,50.0,4.0,50.0,2.0,1125.0,14.0,44.0,74.0,74.0,31.0,89.0,9.0,8.0,9.0,10.0,10.0,9.0,strict,1.52,"Host Has Profile Pic,Host Identity Verified,Is..."
1,7778612,Luxury apartment in city centre,11964927,2014-02-05,within a few hours,100.0,,1.0,1.0,Oud-West,Amsterdam,Noord-Holland,1053,Netherlands,Apartment,Entire home/apt,4.0,1.0,2.0,2.0,Real Bed,"TV,Cable TV,Internet,Wireless Internet,Kitchen...",,175.0,,4000.0,400.0,40.0,2.0,30.0,2.0,1125.0,0.0,0.0,0.0,259.0,15.0,99.0,9.0,9.0,9.0,10.0,10.0,9.0,strict,0.81,"Host Is Superhost,Host Has Profile Pic,Host Id..."
2,8264596,Cosy apartment across Vondelpark,23669273,2014-11-12,,,,1.0,1.0,Oud-West,Amsterdam,Noord-Holland,1054,Netherlands,Apartment,Entire home/apt,4.0,1.0,2.0,2.0,Real Bed,"TV,Cable TV,Internet,Wireless Internet,Kitchen...",,125.0,600.0,,,,1.0,0.0,4.0,1125.0,0.0,0.0,0.0,0.0,1.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,flexible,0.05,"Host Has Profile Pic,Host Identity Verified,Is..."
3,2180729,Spacious City Apartment Oud-West,9238680,2013-10-05,within a day,100.0,,1.0,1.0,Oud-West,Amsterdam,Noord-Holland,1052 RT,Netherlands,Apartment,Entire home/apt,4.0,1.0,2.0,4.0,Real Bed,"TV,Cable TV,Internet,Wireless Internet,Kitchen...",,130.0,,,100.0,45.0,2.0,25.0,3.0,30.0,0.0,0.0,0.0,0.0,22.0,97.0,10.0,9.0,10.0,9.0,9.0,9.0,flexible,0.62,"Host Has Profile Pic,Host Identity Verified,Is..."
4,14463171,Cosy Studio Apartment Center Amsterdam,89112644,2016-08-10,within a day,100.0,,1.0,1.0,,Amsterdam,Noord-Holland,1053,Netherlands,Apartment,Entire home/apt,2.0,1.0,0.0,1.0,Real Bed,"TV,Wireless Internet,Kitchen,Heating,Smoke det...",,80.0,,,100.0,25.0,1.0,0.0,2.0,1125.0,2.0,21.0,51.0,326.0,16.0,78.0,8.0,8.0,8.0,8.0,9.0,9.0,moderate,2.04,Host Has Profile Pic


### Filter Data to Only Include Top 4 English Speaking Countries

In [4]:
tc = ['United States', 'United Kingdom', 'Australia', 'Canada']

listings_df = listings_df[listings_df['Country'].isin(tc)]
listings_df.shape

(267066, 47)

## Data Cleaning

In [5]:
# Lower case
cols = ['City', 'State', 'Zipcode', 'Country', 'Property Type']
for col in cols:
    listings_df[col] = listings_df[col].apply(lambda x: x.upper() if type(x) == str else x)

### Missing Values

In [6]:
def get_missing_values_pct(df):
    tmp = df.isnull().sum() * 100 / len(df)
    tmp_df = pd.DataFrame({'col': df.columns, 'percent': tmp})
    tmp_df.sort_values('percent', inplace=True, ascending=False)
    return tmp_df

get_missing_values_pct(listings_df)

Unnamed: 0,col,percent
Square Feet,Square Feet,98.422113
Host Acceptance Rate,Host Acceptance Rate,87.466394
Weekly Price,Weekly Price,81.190792
Monthly Price,Monthly Price,80.351673
Security Deposit,Security Deposit,57.197472
Cleaning Fee,Cleaning Fee,33.42245
Review Scores Value,Review Scores Value,27.022534
Review Scores Checkin,Review Scores Checkin,27.013922
Review Scores Location,Review Scores Location,27.006807
Review Scores Accuracy,Review Scores Accuracy,26.947271


There are some columns that has high percentage of missing values. So, neither dropping rows nor
computing those values will be good idea. So, we are going to drop those columns.

Neighbourhood can be an important feature, however, about 27% of the data has missing values. So, we
will use Zip Code to group neighborhoods and will drop Neighbourhood column.

Different 'Review Scores' are also missing values. We can impute these values by using the
mean review scores for that listing.

For the remaining columns with missing values, we are going to drop the rows since it's a very
small amount.

In [7]:
# Drop columns with high percentage of missing values.

columns_to_drop = [
    'Square Feet',
    'Host Acceptance Rate',
    'Weekly Price',
    'Monthly Price',
    'Security Deposit',
    'Cleaning Fee',
    'Neighbourhood',
]

listings_df.drop(columns_to_drop, axis=1, inplace=True)
listings_df.shape

(267066, 40)

In [8]:
# Impute missing values for Review Scores

review_scores = [
    'Review Scores Value',
    'Review Scores Checkin',
    'Review Scores Location',
    'Review Scores Accuracy',
    'Review Scores Communication',
    'Review Scores Cleanliness',
    'Review Scores Rating',
    'Reviews per Month',
    'Host Response Rate',
]

for i in review_scores:
    listings_df[i].fillna(listings_df[i].mean(), inplace=True)

listings_df.shape

(267066, 40)

In [9]:
# Fill in missing Host Response Time with None
listings_df['Host Response Time'].fillna('None', inplace=True)

# Fill in missing State with None
listings_df['State'].fillna('None', inplace=True)

# Now drop rows with missing values
listings_df.dropna(inplace=True)

listings_df.shape

(256286, 40)

In [10]:
listings_df.isnull().sum().sort_values(ascending=False)

ID                             0
Name                           0
Extra People                   0
Minimum Nights                 0
Maximum Nights                 0
Availability 30                0
Availability 60                0
Availability 90                0
Availability 365               0
Number of Reviews              0
Review Scores Rating           0
Review Scores Accuracy         0
Review Scores Cleanliness      0
Review Scores Checkin          0
Review Scores Communication    0
Review Scores Location         0
Review Scores Value            0
Cancellation Policy            0
Reviews per Month              0
Guests Included                0
Price                          0
Amenities                      0
State                          0
Host ID                        0
Host Since                     0
Host Response Time             0
Host Response Rate             0
Host Listings Count            0
Host Total Listings Count      0
City                           0
Zipcode   

## Feature Extraction

In [11]:
from sklearn.preprocessing import OrdinalEncoder
from category_encoders import TargetEncoder

In [12]:
# Host Response Time
cat = ['None', 'a few days or more', 'within a day', 'within a few hours', 'within an hour']
enc = OrdinalEncoder(categories=[cat])
listings_df['Host Response Time'] = enc.fit_transform(listings_df['Host Response Time'].to_numpy().reshape(-1, 1))

In [13]:
# Room Type
cat = ['Entire home/apt', 'Private room', 'Shared room']
enc = OrdinalEncoder(categories=[cat])
listings_df['Room Type'] = enc.fit_transform(listings_df['Room Type'].to_numpy().reshape(-1, 1))

In [14]:
# Bed Type
cat = ['Real Bed', 'Pull-out Sofa', 'Futon', 'Couch', 'Airbed']
enc = OrdinalEncoder(categories=[cat])
listings_df['Bed Type'] = enc.fit_transform(listings_df['Bed Type'].to_numpy().reshape(-1, 1))

In [15]:
# Cancellation Policy
cat = [
    'flexible',
    'moderate',
    'strict',
    'long_term',
    'super_strict_30',
    'super_strict_60',
    'no_refunds'
]
enc = OrdinalEncoder(categories=[cat])
listings_df['Cancellation Policy'] = enc.fit_transform(listings_df['Cancellation Policy'].to_numpy().reshape(-1, 1))

In [16]:
# Amenities & Features

def split_feature(df, column):
    cat = set()
    for i in df[column]:
        cat.update([x.lower() for x in i.split(',')])
    
    new_df = df.reindex(columns=df.columns.tolist() + list(cat), fill_value=0)
    
    for i, v in enumerate(new_df[column]):
        for c in [x.lower() for x in v.split(',')]:
            col_i = new_df.columns.get_loc(c)
            new_df.iat[i, col_i] = 1
    
    new_df.drop(column, axis=1, inplace=True)
    return new_df

listings_df = split_feature(listings_df, 'Features')
listings_df = split_feature(listings_df, 'Amenities')

In [17]:
cols = ['City', 'State', 'Zipcode', 'Country', 'Property Type']
enc = TargetEncoder(cols=cols).fit(listings_df, listings_df['Price'])
listings_df = enc.transform(listings_df)



In [18]:
listings_df.shape

(256286, 157)

In [19]:
listings_df.head()

Unnamed: 0,ID,Name,Host ID,Host Since,Host Response Time,Host Response Rate,Host Listings Count,Host Total Listings Count,City,State,Zipcode,Country,Property Type,Room Type,Accommodates,Bathrooms,Bedrooms,Beds,Bed Type,Price,Guests Included,Extra People,Minimum Nights,Maximum Nights,Availability 30,Availability 60,Availability 90,Availability 365,Number of Reviews,Review Scores Rating,Review Scores Accuracy,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Review Scores Value,Cancellation Policy,Reviews per Month,host has profile pic,host is superhost,require guest phone verification,require guest profile picture,is location exact,host identity verified,requires license,instant bookable,flat smooth pathway to front door,disabled parking spot,washer / dryer,refrigerator,suitable for events,gym,pocket wifi,firm mattress,baby monitor,wireless internet,pool,smoking allowed,pack ’n play/travel crib,hot tub,firm matress,beach essentials,children’s books and toys,kitchen,table corner guards,path to entrance lit at night,bed linens,cleaning before checkout,oven,garden or backyard,tub with shower bench,essentials,translation missing: en.hosting_amenity_49,translation missing: en.hosting_amenity_50,buzzer/wireless intercom,elevator in building,fireplace guards,private bathroom,family/kid friendly,other pet(s),luggage dropoff allowed,iron,free parking on street,lake access,long term stays allowed,paid parking off premises,stair gates,wide hallway clearance,dishwasher,patio or balcony,doorman,dog(s),wide doorway,waterfront,baby bath,room-darkening shades,accessible-height toilet,indoor fireplace,cooking basics,cable tv,accessible-height bed,heating,ethernet connection,changing table,pets live on this property,roll-in shower with shower bench or chair,24-hour check-in,dishes and silverware,step-free access,fire extinguisher,internet,pets allowed,carbon monoxide detector,crib,smoke detector,extra pillows and blankets,self check-in,high chair,grab-rails for shower and toilet,window guards,keypad,coffee maker,private entrance,beachfront,breakfast,hot water,children’s dinnerware,ev charger,wide clearance to bed,lockbox,game console,safety card,dryer,private living room,first aid kit,tv,wheelchair accessible,wide clearance to shower and toilet,smart lock,bbq grill,stove,hangers,free parking on premises,cat(s),outlet covers,microwave,air conditioning,laptop friendly workspace,washer,single level home,doorman entry,smartlock,babysitter recommendations,shampoo,lock on bedroom door,hair dryer,bathtub
533,15802051,Large double available now.,46296095,2015-10-11,3.0,100.0,2.0,2.0,96.468624,91.827826,137.588343,93.133659,159.950205,1.0,2.0,1.0,2.0,2.0,0.0,44.0,1.0,15.0,3.0,1125.0,29.0,59.0,89.0,364.0,3.0,93.0,9.0,9.0,10.0,9.0,10.0,10.0,2.0,0.83,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0
534,14751869,Spacious 4BR home near Brixton and Clapham,11850512,2014-02-01,4.0,100.0,1.0,1.0,96.468624,91.827826,137.588343,93.133659,159.950205,0.0,8.0,1.5,4.0,4.0,0.0,139.0,1.0,0.0,3.0,1125.0,0.0,0.0,0.0,0.0,21.0,95.0,10.0,10.0,9.0,10.0,9.0,10.0,1.0,3.73,1,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0
535,8901485,quirky room with a view,12594665,2014-02-26,0.0,93.61078,1.0,1.0,96.468624,95.810108,133.840543,93.133659,125.697277,1.0,2.0,1.0,1.0,1.0,0.0,70.0,1.0,0.0,1.0,1125.0,0.0,0.0,0.0,0.0,0.0,93.504051,9.563006,9.353298,9.73085,9.747439,9.513625,9.389506,0.0,1.589557,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
536,5832248,Lovely 2r flat in the ❤ of Brixton,30273453,2015-03-30,0.0,93.61078,1.0,1.0,96.468624,93.965739,80.50119,93.133659,125.697277,0.0,6.0,1.0,2.0,4.0,0.0,70.0,1.0,12.0,3.0,1125.0,0.0,0.0,0.0,0.0,2.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,2.0,0.11,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0
537,8049784,Lovely bedroom in a cosy house,33670376,2015-05-18,2.0,100.0,1.0,1.0,96.468624,85.532099,58.449479,93.133659,159.950205,1.0,2.0,1.0,1.0,1.0,0.0,25.0,1.0,15.0,1.0,1125.0,29.0,59.0,89.0,359.0,3.0,93.0,9.0,9.0,10.0,10.0,9.0,9.0,1.0,0.18,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0


## Data Splitting

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [21]:
# Split data into features and labels
list_X = listings_df.drop(columns=['ID', 'Name', 'Host ID', 'Host Since', 'Price'], axis=1)
list_y = listings_df['Price']

# Split the dataset into training (60%), validation (20%), and test (20%) sets
X_dev, X_test, y_dev, y_test = train_test_split(list_X, list_y, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, test_size=0.2, random_state=0)

In [22]:
# Standardize the columns
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)