In [None]:
import pandas as pd
import numpy as np
import sklearn
import statistics
import seaborn as sns
import xgboost as xg

from numpy import mean, std, absolute
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.metrics import mean_squared_error as MSE
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from yellowbrick.regressor import prediction_error
from matplotlib import pyplot as plt
from xgboost import XGBRegressor

In [None]:
# Load the prepared listings table, remove the columns this notebook does not
# use (including the raw 'price'; 'log_price' is the target further down), and
# rename 'id' to 'listing_id' so it matches the key used by the merges below.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
unused_columns = [
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'minimum_nights_avg_ntm',
    'maximum_nights_avg_ntm',
    'availability_30',
    'availability_60',
    'availability_90',
    'number_of_reviews_ltm',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    'price',
]
dataset = (
    pd.read_pickle("dataset_final.pkl")
    .drop(columns=unused_columns)
    .rename(columns={'id': 'listing_id'})
)



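Loading datasets for subquestions. Each of the three cells below rebuilds `final_dataset` from `dataset`, so only the last one executed takes effect.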

In [None]:
# Subquestion: sentiment scores.
# Join the sentiment-score table onto the listings via listing_id.
# NOTE(review): the next two cells also assign final_dataset from `dataset`,
# so on a top-to-bottom run the result of this merge is replaced.
df_rt = pd.read_pickle("agg_review_scores_translated.pkl")
final_dataset = dataset.merge(df_rt, on='listing_id')

In [None]:
# Subquestion: review recency.
# Join the review-recency table onto the listings via listing_id.
# NOTE(review): this assignment replaces any earlier final_dataset, and the
# following cell replaces it again when the notebook is run top to bottom.
df_rr = pd.read_pickle("agg_weighted_rs.pkl")
final_dataset = dataset.merge(df_rr, on='listing_id')

In [None]:
# Subquestion: topic modelling ('9t' in the file name stands for num_topics).
# Join the topic-modelling values onto the listings via listing_id.
# NOTE(review): this is the last assignment of final_dataset, so it is the one
# used downstream when the notebook is run top to bottom.
df_9t = pd.read_pickle("topic_modelling_values_9t.pkl")
final_dataset = dataset.merge(df_9t, on='listing_id')

In [None]:
# Give the merged frame a fresh 0..n-1 index; the previous index is discarded.
final_dataset = final_dataset.reset_index(drop=True)

In [None]:
# Remove the listing identifier before the feature/target split below.
df1 = final_dataset.drop('listing_id', axis=1).copy()

Train test split

In [None]:
# Separate the target (log_price) from the predictors, then hold out 20% of the
# rows as a test set; the fixed random_state keeps the split reproducible.
target_column = 'log_price'
Y = df1[target_column].copy()
X = df1.drop(columns=[target_column]).copy()

x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Min-max scale the features. The scaler is fitted on the training split only
# and then applied as-is to the test split.
scaler = MinMaxScaler()

# The scaled arrays are wrapped in new DataFrames, which get default integer
# column labels and a default index (the original ones are not passed on).
x_train = pd.DataFrame(scaler.fit_transform(x_train))
x_test = pd.DataFrame(scaler.transform(x_test))

Models: SVR

In [None]:
# Randomised hyperparameter search for the support vector regressor.

# Candidate values per hyperparameter. The order of the values is kept as-is
# because it influences which combinations the seeded search samples.
param_grid = {
    'C': [15, 10, 5, 1],
    'gamma': [0.001, 0.01, 0.1, 0.0001, 'scale', 'auto'],
    'epsilon': [0.2, 0.1, 0.05, 0.01],
    'kernel': ['rbf', 'poly', 'linear'],
}

model = SVR()

# 50 sampled configurations, each scored by R^2 with 5-fold cross-validation;
# random_state fixes which configurations are drawn.
random_cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=50,
    cv=5,
    scoring='r2',
    n_jobs=4,
    verbose=5,
    return_train_score=True,
    random_state=1,
)

random_cv.fit(x_train, y_train)

# Only the final expression of a cell is echoed by the notebook, so of the two
# lines below just best_params_ is displayed.
random_cv.best_estimator_

random_cv.best_params_

In [None]:
# Fit an SVR with fixed hyperparameters and evaluate it on the held-out split.
# NOTE(review): the values are hard-coded rather than read from
# random_cv.best_params_, and no kernel is passed (SVR's default is used) --
# presumably copied from a previous search run; confirm they are still current.
svr_best = SVR(C=10, gamma='auto', epsilon=0.1)
svr_best.fit(x_train, y_train)
svr_pred = svr_best.predict(x_test)

# test-set metrics: mean absolute error, root mean squared error, R^2
mae = mean_absolute_error(y_test, svr_pred)
rmse = np.sqrt(MSE(y_test, svr_pred))
r2 = r2_score(y_test, svr_pred)

print("MAE : % f" % mae)
print("RMSE : % f" % rmse)
print("R2 : % f" % r2)

Models: extreme gradient boosting

In [None]:
# Randomised hyperparameter search for the XGBoost regressor.
model = XGBRegressor()

# Candidate values per hyperparameter. The order of the values is kept as-is
# because it influences which combinations the seeded search samples.
hyperparameter_grid = {
    'n_estimators': [10, 50, 100, 300, 400, 500],
    'min_split_loss': [0, 0.2, 0.5],
    'max_depth': [2, 3, 5, 10, 15],
    'learning_rate': [0.05, 0.1, 0.3, 0.5],
    'min_child_weight': [1, 2, 3],
    'subsample': [0.5, 0.7, 1],
    'booster': ['gbtree', 'gblinear', 'dart'],
    'base_score': [0.25, 0.5, 1],
}

# 50 sampled configurations, each scored by R^2 with 5-fold cross-validation;
# random_state fixes which configurations are drawn.
random_cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=hyperparameter_grid,
    n_iter=50,
    cv=5,
    scoring='r2',
    n_jobs=4,
    verbose=5,
    return_train_score=True,
    random_state=1,
)

random_cv.fit(x_train, y_train)

# displayed as the cell output
random_cv.best_estimator_

In [None]:
# Fit an XGBoost regressor with a fixed, fully spelled-out configuration and
# evaluate it on the held-out split.
# NOTE(review): the argument list looks like a pasted estimator repr --
# presumably the best_estimator_ of an earlier search; confirm it is current.
# NOTE(review): both gamma and min_split_loss are passed; XGBoost documents
# these as aliases of one setting -- verify that passing both is intended.
best_xgbr = XGBRegressor(
    # booster and tree construction
    booster='gbtree',
    tree_method='exact',
    n_estimators=500,
    max_depth=5,
    learning_rate=0.1,
    base_score=1,
    num_parallel_tree=1,
    # split / leaf regularisation
    gamma=0.200000003,
    min_split_loss=0.2,
    min_child_weight=1,
    max_delta_step=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    # row / column subsampling
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    # constraints
    interaction_constraints='',
    monotone_constraints='()',
    # runtime / bookkeeping
    enable_categorical=False,
    gpu_id=-1,
    predictor='auto',
    importance_type=None,
    n_jobs=8,
    random_state=0,
    validate_parameters=1,
    verbosity=None,
)
best_xgbr.fit(x_train, y_train)

xgbr_pred = best_xgbr.predict(x_test)

# test-set metrics: mean absolute error, root mean squared error, R^2
mae = mean_absolute_error(y_test, xgbr_pred)
rmse = np.sqrt(MSE(y_test, xgbr_pred))
r2 = r2_score(y_test, xgbr_pred)

print("MAE : % f" % mae)
print("RMSE : % f" % rmse)
print("R2 : % f" % r2)


Models: ridge regression

In [None]:
# Randomised hyperparameter search for the Ridge regularisation strength.
model = Ridge()

# candidate alphas: 0.00, 0.01, ..., 0.99
grid = {'alpha': np.arange(0, 1, 0.01)}

# random_state is fixed (same seed as the SVR and XGBoost searches in this
# notebook) so the sampled alphas -- and therefore the reported best alpha --
# are the same on every run. n_iter and cv are not passed, so the library
# defaults apply.
search = RandomizedSearchCV(
    model,
    grid,
    scoring='r2',
    n_jobs=-1,
    random_state=1,
)

results = search.fit(x_train, y_train)

print('Best: %s' % results.best_params_)

In [None]:
# Fit a Ridge model with a fixed alpha and evaluate it on the held-out split.
# NOTE(review): alpha is hard-coded rather than read from the search result --
# presumably taken from an earlier run; confirm it is still the best value.
best_ridge = Ridge(alpha=0.99)
best_ridge.fit(x_train, y_train)
ridge_pred = best_ridge.predict(x_test)

mae = mean_absolute_error(y_test, ridge_pred)
print("MAE : % f" % mae)
# mean_squared_error is imported under the alias MSE at the top of the
# notebook; the unaliased name is not defined and would raise NameError.
rmse = np.sqrt(MSE(y_test, ridge_pred))
print("RMSE : % f" % rmse)
r2 = r2_score(y_test, ridge_pred)
print("R2 : % f" % r2)

Visualizations

In [None]:
# Prediction-error plot for the SVR model, given the train and test splits.
# To plot another model, pass best_xgbr or best_ridge in place of svr_best:
#   prediction_error(best_xgbr, x_train, y_train, x_test, y_test, identity=False)
#   prediction_error(best_ridge, x_train, y_train, x_test, y_test, identity=False)
visualizer = prediction_error(
    svr_best,
    x_train, y_train,
    x_test, y_test,
    identity=False,
)


In [None]:
# Scatter of the absolute test-set error against the actual target values.
# The SVR predictions are used here; substitute xgbr_pred or ridge_pred for
# svr_pred to inspect one of the other models.
actual = y_test.array
abs_error = abs(actual - svr_pred)

fig, ax = plt.subplots()
sns.scatterplot(x=y_test, y=abs_error, ax=ax)
ax.set_title('Actual values and absolute error')
ax.set_xlabel('Actual values')
ax.set_ylabel('Absolute Error')
plt.show()