In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import metrics
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import feature_selection
import matplotlib.pyplot as plt
import xgboost
import ast
import operator
import math
%matplotlib inline

In [9]:
# Load the listings CSV and print its column summary (names, non-null counts, dtypes).
listings_path = 'Data/nyc_listings.csv'
listings_df = pd.read_csv(listings_path)
listings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36923 entries, 0 to 36922
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            36923 non-null  int64  
 1   listing_url                                   36923 non-null  object 
 2   scrape_id                                     36923 non-null  float64
 3   last_scraped                                  36923 non-null  object 
 4   name                                          36910 non-null  object 
 5   description                                   35710 non-null  object 
 6   neighborhood_overview                         22510 non-null  object 
 7   picture_url                                   36923 non-null  object 
 8   host_id                                       36923 non-null  int64  
 9   host_url                                      36923 non-null 

In [10]:
# Strip "$", thousands separators and spaces from the price strings, then parse as float.
# regex=True is passed explicitly: the pattern is a character class, and pandas >= 2.0
# treats the pattern as a literal string by default (older releases only warned).
listings_df['price'] = (
    listings_df['price']
    .str.replace(r"[$, ]", "", regex=True)
    .astype("float")
)

  listings_df['price'] = listings_df['price'].str.replace("[$, ]", "").astype("float")


In [None]:
# Load the reviews and parse the review date (ISO yyyy-mm-dd) into datetimes.
reviews_path = 'Data/nyc_reviews.csv'
reviews_df = pd.read_csv(reviews_path)
reviews_df['date'] = pd.to_datetime(reviews_df['date'], format='%Y-%m-%d')

# Keep reviews dated from 2019-01-01 up to, but not including, 2021-12-31
# (the upper bound is exclusive, so reviews on 2021-12-31 itself are dropped).
on_or_after_start = reviews_df['date'] >= '2019-01-01'
before_end = reviews_df['date'] < '2021-12-31'
reviews_df = reviews_df.loc[on_or_after_start & before_end]
reviews_df.head()

In [11]:
# The join key used below is `listing_id`; give the listings' `id` column that name.
listings_df = listings_df.rename(columns={'id': 'listing_id'})

In [None]:
# Revenue estimate derived from reviews (idea from Jingles Airbnb Analysis).
# Each review row is joined to its listing (inner join on listing_id) and is
# credited with price * minimum_nights.
booking_df = reviews_df.merge(listings_df, on='listing_id')
booking_df['calculated_revenue'] = booking_df['minimum_nights'] * booking_df['price']

In [None]:
# Total the per-review revenue for each listing; the result is indexed by listing_id.
revenue_listing_df = booking_df.groupby('listing_id')[['calculated_revenue']].sum()

In [None]:
# Attach the per-listing revenue total; the left join keeps listings that have
# no reviews in the filtered window.
listings_df = pd.merge(listings_df, revenue_listing_df, on='listing_id', how='left')

# Listings without matching reviews come back as NaN -> treat as zero revenue.
# (.at is a scalar accessor and does not accept a boolean mask; fillna is the
# supported way to do this.)
listings_df['calculated_revenue'] = listings_df['calculated_revenue'].fillna(0)

In [None]:
# Five listings with the highest calculated revenue.
top_revenue_columns = [
    'listing_id', 'number_of_reviews', 'minimum_nights', 'price',
    'bedrooms', 'beds', 'calculated_revenue',
]
listings_df[top_revenue_columns].sort_values(by='calculated_revenue', ascending=False).head(5)


In [None]:
# Lift the row display limit (a global pandas option, so it also affects later cells),
# then count listings at each distinct price, highest price first.
pd.set_option('display.max_rows', None)
price_counts = listings_df.groupby('price')[['listing_id']].count()
price_counts.sort_values('price', ascending=False)

In [None]:
# Inspect the listings priced at 7000 or more.
high_price_df = listings_df.loc[listings_df['price'] >= 7000]

high_price_columns = [
    'listing_id', 'listing_url', 'accommodates',
    'number_of_reviews', 'price', 'calculated_revenue',
]
high_price_df[high_price_columns]

In [None]:
# Keep only listings priced below 7000 (rows with a missing price fail the
# comparison and are removed as well).
below_price_cap = listings_df['price'] < 7000
listings_df = listings_df.loc[below_price_cap]

In [None]:
# Five highest-revenue listings after the price filter, with their URLs for manual checking.
top_revenue_columns_with_url = [
    'listing_id', 'listing_url', 'number_of_reviews', 'minimum_nights',
    'price', 'bedrooms', 'beds', 'calculated_revenue',
]
listings_df[top_revenue_columns_with_url].sort_values(by='calculated_revenue', ascending=False).head(5)


In [None]:
# Count listings at each distinct minimum_nights value, largest value first.
min_nights_counts = listings_df.groupby('minimum_nights')[['listing_id']].count()
min_nights_counts.sort_values('minimum_nights', ascending=False)

In [None]:
# Keep listings whose minimum stay is at most 500 nights.
within_min_nights_cap = listings_df['minimum_nights'] <= 500
listings_df = listings_df.loc[within_min_nights_cap]

In [12]:
# Columns removed before modelling, dropped in a single call.
columns_to_drop = [
    'listing_id',
    'host_id',
    'listing_url',
    'scrape_id',
    'last_scraped',
    'name',
    'description',  # don't have the resources/time/knowledge to extract anything from this
    'neighborhood_overview',
    'picture_url',
    'host_url',
    'host_name',
    'host_since',
    'host_location',
    'host_about',
    'host_response_time',
    'host_response_rate',
    'host_acceptance_rate',
    'host_thumbnail_url',
    'host_picture_url',
    'host_neighbourhood',
    'neighbourhood',
    'latitude',
    'longitude',
    'bathrooms',  # entire column missing
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'calendar_updated',
    'calendar_last_scraped',
    'first_review',
    'last_review',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'reviews_per_month',
    'license',
    # this may be able to be parsed in the same way that the amenities column is
    'host_verifications',
]
listings_df = listings_df.drop(columns=columns_to_drop)


In [None]:
# Tally how many listings mention each amenity. Every `amenities` cell is a
# string holding a Python-literal list, hence ast.literal_eval.
a = listings_df['amenities']

counts = {}
for raw_amenities in a:
    for amenity in ast.literal_eval(raw_amenities):
        counts[amenity] = counts.get(amenity, 0) + 1

# (amenity, count) pairs, most frequent first.
sorted_counts = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)

In [14]:
# Empty frame with one column per tracked amenity; rows are filled in later.
amenity_columns = [
    'Wifi',
    'Long term stays allowed',
    'Heating',
    'Kitchen',
    'Essentials',
    'Smoke alarm',
    'Air conditioning',
    'Hangers',
    'Carbon monoxide alarm',
    'Hair dryer',
    'Dedicated workspace',
    'Iron',
    'Shampoo',
    'Hot water',
    'TV',
    'Dishes and silverware',
    'Refrigerator',
    'Cooking basics',
    'Microwave',
    'Fire extinguisher',
    'Washer',
    'Dryer',
    'Stove',
    'Oven',
    'Coffee maker',
    'Free street parking',
    'Bed linens',
    'First aid kit',
    'Extra pillows and blankets',
    'Private entrance',
    'Lock on bedroom door',
    'Elevator',
    'Cable TV',
    'TV with standard cable',
    'Dishwasher',
    'Luggage dropoff allowed',
    'Security cameras on property',
    'Bathtub',
    'Lockbox',
    'Host greets you',
    'Freezer',
    'Shower gel',
    'Gym',
    'Laundromat nearby',
    'Patio or balcony',
    'Cleaning products',
    'Free parking on premises',
    'Backyard',
    'Room-darkening shades',
    'Body soap',
]

b = pd.DataFrame()
for amenity_name in amenity_columns:
    b[amenity_name] = 0

In [15]:
# Build a 0/1 indicator for every tracked amenity (the columns already declared
# on `b`), one row per listing, in a single pass.
#
# The previous cell-by-cell `.at` writes only created a row when a listing had
# at least one tracked amenity, so listings with none were silently missing from
# `b`; growing the frame one cell at a time was also very slow on the full data.
amenity_names = list(b.columns)

# Parse each listing's amenity string once; a set makes the membership tests cheap.
amenity_sets = [set(ast.literal_eval(amlist)) for amlist in listings_df['amenities']]

# Row labels stay positional (0..n-1), as before. NOTE(review): if `b` is ever
# joined back onto listings_df, pass index=listings_df.index here so rows align
# after the earlier row filters.
b = pd.DataFrame(
    [[int(name in present) for name in amenity_names] for present in amenity_sets],
    columns=amenity_names,
)

In [None]:
# TODO: find out how korosh is parsing through amenities.
# NOTE(review): the raw column is dropped here; in the cells visible in this
# notebook the indicator frame `b` is not joined onto listings_df.
listings_df = listings_df.drop(columns=['amenities'])

In [None]:
# 't'/'f' flag columns -> 1/0 (any other value, including missing, becomes NaN).
flag_columns = [
    'has_availability',
    'instant_bookable',
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
]
for flag_column in flag_columns:
    listings_df[flag_column] = listings_df[flag_column].map({'t': 1, 'f': 0})

# Text categories -> integer category codes (missing values get code -1).
categorical_columns = [
    'bathrooms_text',
    'neighbourhood_cleansed',
    'neighbourhood_group_cleansed',
    'property_type',
    'room_type',
]
for categorical_column in categorical_columns:
    listings_df[categorical_column] = pd.Categorical(listings_df[categorical_column]).codes

In [None]:
# Fill missing values.
# The results are assigned back to the column instead of calling
# `listings_df[col].fillna(..., inplace=True)`: that chained in-place form acts on
# an intermediate Series and is not guaranteed to update the frame (it warns on
# pandas 2.x and has no effect under Copy-on-Write). `.ffill()` also replaces the
# deprecated `fillna(method="ffill")` spelling.

# Forward-fill the averaged night limits (open question: interpolate vs ffill).
for night_column in ('minimum_nights_avg_ntm', 'maximum_nights_avg_ntm'):
    listings_df[night_column] = listings_df[night_column].ffill()

# Assume one bed per bedroom: where bedrooms is missing, use the beds value.
listings_df['bedrooms'] = listings_df['bedrooms'].fillna(listings_df['beds'])

# Drop any rows that still contain a missing value.
listings_df = listings_df.dropna()

In [None]:
# Correlation heatmap of all remaining columns.
# The matrix is computed once and reused (it used to be recomputed for the
# plot data and again for vmin, after an extra call whose result was discarded).
corr_matrix = listings_df.corr()

plt.rcParams['figure.figsize'] = [25, 30]  # global setting: also applies to later figures

# Lower colour bound = smallest correlation present. DataFrame.min skips NaN, so a
# NaN entry in the matrix cannot turn vmin itself into NaN.
sns.heatmap(corr_matrix, vmin=corr_matrix.min().min(), vmax=1, cmap="YlGnBu", square=True,
            linewidths=0.1, annot=True, annot_kws={"size": 8})

In [None]:
# Features = every column except the target; target = calculated revenue.
# The target is removed by name rather than by position (`iloc[:, :-1]`), so it
# cannot end up among the features if columns are ever added after it.
target_column = 'calculated_revenue'
X = listings_df.drop(columns=[target_column])
y = listings_df[target_column]

In [None]:
# X = feature_selection.SelectKBest(k=10).fit_transform(X,y)
# X.shape

In [None]:
# Split data into 90/10 Train/Test ratio.
# A fixed seed keeps the held-out set, and therefore the reported metrics,
# the same from one run to the next.
SPLIT_SEED = 42
print("---------- Splitting Data 9:1 Train:Test-----------------")
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.1, random_state=SPLIT_SEED
)

# Specifying the model to use
xg = xgboost.XGBRegressor(eval_metric='rmse')


# Model fitting i.e., creating the model
print("------------------- Fitting Model ----------------------")
xg.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Predict on the held-out features; `expected` keeps the matching true targets.
print("------------------ Making Predictions ------------------")
xgpredicted = xg.predict(X_test)
expected = y_test

In [None]:
# Report the fitted model and its errors on the held-out split.
print("Model: XGBoost")
print(xg)
print(f"MAE: {metrics.mean_absolute_error(y_test, xgpredicted)!s}")
# RMSE is the square root of the mean squared error.
print(f"RMSE: {math.sqrt(metrics.mean_squared_error(y_test, xgpredicted))!s}")
print('\n')

In [None]:
# 10-fold cross-validation on the full data set.
# cross_validate scores both metrics from the same ten fits; two separate
# cross_val_score calls trained the model twenty times on identical folds.
# It is reached through the `model_selection` namespace imported at the top,
# like train_test_split.
cv_scores = model_selection.cross_validate(
    xg, X, y, cv=10,
    scoring=('neg_mean_absolute_error', 'neg_root_mean_squared_error'),
)
xg_reg_mae = cv_scores['test_neg_mean_absolute_error']
xg_reg_rmse = cv_scores['test_neg_root_mean_squared_error']

# The scorers return negated errors (higher is better), hence the * -1.
print("XGBoost Regressor")
print("Mean Absolute Error = {}".format(np.mean(xg_reg_mae)*-1))
print("Root Mean Squared Error = {}".format(np.mean(xg_reg_rmse)*-1))