# Airbnb Price recommendations

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Data preparation

1. Remove all listings without a review in the last 12 months
2. Remove all listings where price is > $500 or 0
3. Split up amenities column
4. Dummify amenities
5. Create train/test csv files

In [2]:
# Read the pre-cleaned listings export into a DataFrame and preview the first rows
airbnb = pd.read_csv(
    "Data/listings-cleaned.csv",
)
airbnb.head(n=5)

Unnamed: 0,id,hos2_is_superhos2,host_listings_count,neighbourhood_cleansed,neighbourhood_group_cleansed,city,zipcode,property_type,room_type,accommodates,...,maximum_nights_avg_ntm,number_of_reviews,number_of_reviews_ltm,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,2595,1.0,6.0,Midtown,Manhattan,New York,10018,Apartment,Entire home/apt,1,...,1125.0,48,7,94.0,9.0,9.0,10.0,10.0,10.0,9.0
1,3831,1.0,1.0,Clinton Hill,Brooklyn,Brooklyn,11238,Guest suite,Entire home/apt,3,...,730.0,295,75,90.0,9.0,9.0,10.0,9.0,10.0,9.0
2,5099,1.0,1.0,Murray Hill,Manhattan,New York,10016,Apartment,Entire home/apt,2,...,21.0,78,8,90.0,10.0,9.0,10.0,10.0,10.0,9.0
3,5121,1.0,1.0,Bedford-Stuyvesant,Brooklyn,Brooklyn,11216,Apartment,Private room,2,...,730.0,49,0,90.0,8.0,8.0,10.0,10.0,9.0,9.0
4,5178,1.0,1.0,Hell's Kitchen,Manhattan,New York,10019,Apartment,Private room,2,...,14.0,454,47,84.0,9.0,7.0,9.0,9.0,10.0,8.0


In [6]:
# total number of listings (non-null ids)
airbnb.id.count()

29839

### Remove all listings without a review in the last 12 months (number_of_reviews_ltm)

In [7]:
# sanity check: the non-null count of number_of_reviews_ltm should equal the listing total
airbnb.number_of_reviews_ltm.count()

29839

In [8]:
# Treat "0 reviews in the last 12 months" as missing, then drop every row where
# the value is missing (rows that were already NaN are dropped as well).
# NOTE: use the directly imported numpy (`np`); the `pd.np` alias was deprecated
# in pandas 1.0 and removed in pandas 2.0, where it raises AttributeError.
airbnb['number_of_reviews_ltm'] = airbnb['number_of_reviews_ltm'].replace(0, np.nan)
airbnb = airbnb.dropna(axis=0, how='any', subset=['number_of_reviews_ltm'])

# remaining number of rows
airbnb['id'].count()

29839

### Remove listings with price = 0 or > $500

In [9]:
# Strip the currency symbol and thousands separators from price, then cast to float.
# regex=False makes both replacements literal: '$' is a regex end-of-string anchor,
# and the default value of `regex` differs between pandas versions, so being
# explicit keeps the result the same everywhere.
# NOTE: this cell expects price to still be a string column; re-running it after
# the cast fails because the .str accessor is not available on floats.
airbnb['price'] = (
    airbnb['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype('float')
)
airbnb.price

0        225.0
1         89.0
2        200.0
4         79.0
6        150.0
         ...  
50473    100.0
50500    120.0
50567    150.0
50577     50.0
50580     60.0
Name: price, Length: 29839, dtype: float64

In [24]:
# check max price
# Series.max() is vectorised and skips NaN; the builtin max() iterates element by
# element in Python and gives an order-dependent answer when NaN is present.
airbnb['price'].max()

500.0

In [11]:
# Remove listings whose price is 0: mark zeros as NaN, then drop every row with
# a missing price.
# NOTE: use the directly imported numpy (`np`); the `pd.np` alias was deprecated
# in pandas 1.0 and removed in pandas 2.0, where it raises AttributeError.
airbnb['price'] = airbnb['price'].replace(0, np.nan)
airbnb = airbnb.dropna(axis=0, how='any', subset=['price'])

# remaining number of rows
airbnb['price'].count()

29826

In [12]:
# keep only listings priced at or below $500
airbnb = airbnb[airbnb['price'] <= 500]

# max price after filtering; Series.max() is vectorised and skips NaN, unlike
# the builtin max(), which iterates in Python and is order-dependent with NaN
airbnb['price'].max()

500.0

In [13]:
# number of listings left after the price filters (non-null prices)
airbnb.price.count()

29351

### Dummify necessary columns

### Create train/test splits

In [18]:
# Target vector: the nightly price.
y = airbnb['price']
# Feature matrix: every remaining column. `columns=` replaces the positional
# axis argument (`drop('price', 1)`), which was deprecated in pandas 1.3 and
# removed in pandas 2.0.
X = airbnb.drop(columns='price')


In [19]:
# 70/30 train/test partition; random_state=123 fixes the shuffle so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=123,
)
# show the shape of each feature frame next to the shape of its target vector
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(20545, 37) (20545,)
(8806, 37) (8806,)


In [28]:
# Persist both splits as CSV, with the target placed ahead of the feature
# columns and the DataFrame index left out of the file.
train = pd.concat([y_train, X_train], axis='columns')
train.to_csv('Data/train.csv', index=False)

test = pd.concat([y_test, X_test], axis='columns')
test.to_csv('Data/test.csv', index=False)

## Data exploration

## Modeling

1. Latent class
2. Random forest


## Assumptions:

1. Use daily prices, as these are the most complete.