In [30]:
from sklearn import metrics
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import feature_selection
import xgboost
import pandas as pd
import ast
import operator
import math

In [31]:
# Load up data
# Raw = ~37k rows
df1 = pd.read_csv('Data/nyc_listings.csv')
df2 = pd.read_csv('Data/nyc_revenue_listingid.csv')

df = pd.merge(df1, df2, on='id')

In [32]:
# Parse and count amenities
a = df['amenities']

counts = dict()
for amlist in a:
   res = ast.literal_eval(amlist)
   for i in res:
      counts[i] = counts.get(i, 0) + 1

sorted_counts = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

textfile = open("Sorted Amenities.csv", "w")
for element in sorted_counts:
    textfile.write(element[0] + ", " + str(element[1]) + "\n")
textfile.close()

In [55]:
# Remove outliers and properties with no bed/bath or income data
dffinal = df[df['availability_365'] < 365]+df[df['availability_365'] > 0]
dffinal = dffinal[dffinal['bathrooms_text'].notnull()]
dffinal = dffinal[dffinal['bedrooms'].notnull()]
dffinal = dffinal[dffinal['Sum Price'].notnull()]

# Manual feature selection
dffinal = dffinal[[#'name', 
#'description',
#'neighborhood_overview', 
#'picture_url', 
#'host_id', 
#'host_url',
#'host_name', 
'host_since', 
'host_location', 
#'host_about',
'host_response_time', 
'host_response_rate', 
'host_acceptance_rate',
#'host_is_superhost', 
#'host_thumbnail_url', 
#'host_picture_url',
'host_neighbourhood', 
'host_listings_count',
'host_total_listings_count', 
#'host_verifications',
'host_has_profile_pic', 
'host_identity_verified', 
'neighbourhood',
'neighbourhood_cleansed', 
'neighbourhood_group_cleansed', 
'latitude',
'longitude', 
'property_type', 
'room_type', 
'accommodates', 
#'bathrooms',
'bathrooms_text', 
'bedrooms', 
'beds', 
'amenities', 
'price',
'minimum_nights', 
'maximum_nights', 
'minimum_minimum_nights',
'maximum_minimum_nights', 
'minimum_maximum_nights',
'maximum_maximum_nights', 
'minimum_nights_avg_ntm',
'maximum_nights_avg_ntm', 
#'calendar_updated', 
'has_availability',
'availability_30', 
'availability_60', 
'availability_90',
'availability_365', 
'calendar_last_scraped', 
'number_of_reviews',
'number_of_reviews_ltm', 
'number_of_reviews_l30d', 
#'first_review',
#'last_review', 
'review_scores_rating', 
'review_scores_accuracy',
'review_scores_cleanliness', 
'review_scores_checkin',
'review_scores_communication', 
'review_scores_location',
'review_scores_value', 
#'license', 
'instant_bookable',
'calculated_host_listings_count',
'calculated_host_listings_count_entire_homes',
'calculated_host_listings_count_private_rooms',
'calculated_host_listings_count_shared_rooms', 
'reviews_per_month', 
'Sum Price']]

# Drop any rows with null values
dffinal = dffinal.dropna()

dffinal.shape

(7196, 54)

In [56]:
# Use LabelEncoder to encode categorical data
le = preprocessing.LabelEncoder()
for col in dffinal.columns:
    if dffinal[col].dtype == object:
        dffinal[col] = le.fit_transform(dffinal[col])
    else:
        pass

# Assign data to X and target to Y variables
y = dffinal['Sum Price']
x = dffinal.iloc[:,:-1]

In [58]:
dffinal = feature_selection.SelectKBest(k=10).fit_transform(x,y)
dffinal.shape

  f = msb / msw


(7196, 10)

In [51]:
# Split data into 90/10 Train/Test ratio
print("---------- Splitting Data 9:1 Train:Test-----------------")
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.1)

# Specifying the model to use
xg = xgboost.XGBRegressor(eval_metric='rmse')


# Model fitting i.e., creating the model
print("------------------- Fitting Model ----------------------")
xg.fit(x_train, y_train)

---------- Splitting Data 9:1 Train:Test-----------------
------------------- Fitting Model ----------------------


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eval_metric='rmse',
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [52]:
# Make predictions
print("------------------ Making Predictions ------------------")
expected = y_test

xgpredicted = xg.predict(x_test)

------------------ Making Predictions ------------------


In [53]:
# Print performance metrics
print("Model: XGBoost")
print(xg)
print('MAE: ' + str(metrics.mean_absolute_error(y_test, xgpredicted)))
print('RMSE: ' + str(math.sqrt(metrics.mean_squared_error(y_test, xgpredicted))))
print('\n')

Model: XGBoost
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eval_metric='rmse',
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)
MAE: 11528.359262212118
RMSE: 23059.4322329453




In [54]:
print("------------ Cross Validation Scores -----------------")
# Report cross validation scores (10 runs each)
xgscore = model_selection.cross_val_score(xg, x, y, cv=10)
score = sum(xgscore)/10
print('XG:' + str(score))

------------ Cross Validation Scores -----------------
XG:0.6744470723632416
