In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../cleaning/accommodation.csv')
df.head()

Unnamed: 0,Source,Location,Number of Beds,Type,Price (HKD)
0,Hotel,Kita,,Apartment,589.0
1,Hotel,Taito,2.0,Hotel Room,621.0
2,Hotel,Shinagawa,,Apartment,1807.0
3,Hotel,Sumida,,Apartment,811.0
4,Hotel,Taito,1.0,Hotel Room,378.0


In [4]:
df.dtypes

Source             object
Location           object
Number of Beds    float64
Type               object
Price (HKD)       float64
dtype: object

In [5]:
df.isna().sum()

Source             0
Location           0
Number of Beds    75
Type               0
Price (HKD)        0
dtype: int64

Removing outliers

In [31]:
df.describe()

Unnamed: 0,Number of Beds,Price (HKD)
count,1225.0,1300.0
mean,1.541224,998.986923
std,1.367267,987.976076
min,1.0,17.0
25%,1.0,515.5
50%,1.0,738.5
75%,1.0,1104.0
max,16.0,10953.0


In [6]:
X_train = df.drop(columns='Price (HKD)')
y_train = df['Price (HKD)']

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

cat_var = ['Source', 'Location', 'Type']

var_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

full_pipeline = ColumnTransformer([
    ('oh_encoder', OneHotEncoder(), cat_var),
    ('imputer', var_pipeline, ['Number of Beds'])
])

In [15]:
X_train_prepared = full_pipeline.fit_transform(X_train)

In [16]:
X_train_prepared.shape

(1300, 42)

In [36]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

lin_reg = LinearRegression()
elastic_reg = ElasticNet(alpha=1, l1_ratio=0.9)
svm_reg = SVR(C=10)


regressors = {'Linear Regressor': lin_reg, 'Elastic Net Regressor': elastic_reg, 'SVM Regressor': svm_reg}

for name, regressor in regressors.items():
    scores = cross_val_score(regressor, X_train_prepared, y_train, scoring='neg_mean_absolute_error', cv=5)
    print(name)
    print(f'mean score: {-np.mean(scores)}')
    print(f'standard deviation: {np.std(scores)}')

Linear Regressor
mean score: 549.7682529742972
standard deviation: 35.072846866536025
Elastic Net Regressor
mean score: 536.1307626410694
standard deviation: 45.414587574161445
SVM Regressor
mean score: 485.87212028758296
standard deviation: 66.30240344482918


In [37]:
gbr_reg = GradientBoostingRegressor()

gbr_grid = {
    'learning_rate': [0.1, 0.2, 0.5, 1],
    'n_estimators': [50, 80, 100, 150],
    'loss': ['squared_error', 'huber'],
    'alpha': [0.1, 0.5, 0.9]
}

In [39]:
gbr_cv = GridSearchCV(gbr_reg, gbr_grid, cv=5, scoring='neg_mean_absolute_error', verbose=4, n_jobs=-1)

gbr_cv.fit(X_train_prepared, y_train)

print(gbr_cv.best_score_)
print(gbr_cv.best_params_)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
-471.9853097667807
{'alpha': 0.1, 'learning_rate': 0.2, 'loss': 'huber', 'n_estimators': 50}
