In [70]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.utils import estimator_html_repr
from sklearn.metrics import mean_squared_error as MSE

import xgboost as xgb

In [5]:
# Read the processed rentals Feather file (path is relative to the working
# directory) into the main DataFrame used by every later cell.
RENTALS_PATH = "western_cape_rentals_processed.ftr"
df = pd.read_feather(RENTALS_PATH)

In [8]:
# Quick look at three randomly drawn rows of the loaded frame.
# No random_state is passed, so the rows shown differ from run to run.
df.sample(n=3)

Unnamed: 0,id,link,title,description,bedrooms,bathrooms,parking_spaces,floor_area,location,address,rental_term,price
387,109763999,https://www.property24.com/to-rent/proteaville...,5 Bedroom House,"Spacious, modern renovated 5 bedroom family ho...",5.0,4.0,2.0,,Proteaville,"48 Protea Road, Proteaville Durbanville",monthly,35000.0
5565,109686608,https://www.property24.com/to-rent/bosbell/bel...,6 Bedroom House,Offers a unique mature house on a large erf. O...,6.0,6.0,5.0,,Bosbell,2 Bloem Street,monthly,25000.0
2188,110978834,https://www.property24.com/to-rent/steynsrust/...,2 Bedroom House,SHORT TERM\nUnfurnished home available from A...,2.0,1.5,2.0,,Steynsrust,,monthly,8500.0


In [96]:
# Keep only listings with a known price of at most 15000, then replace every
# remaining missing value (in any column) with 0.
priced = df.price.notna() & (df.price <= 15000)
sub_df = df[priced].fillna(0).copy()

# Identifier / free-text columns and the target itself are not used as features.
non_feature_cols = ['id', 'link', 'description', 'title', 'address', 'price']

# One-hot encode the remaining non-numeric columns.
# dtype=float keeps every dummy column numeric: newer pandas releases emit
# bool dummies by default, and mixing those with float columns would turn
# `.values` into an object-dtype array instead of a float matrix.
X = pd.get_dummies(sub_df.drop(columns=non_feature_cols), dtype=float).values
y = sub_df.price.values

In [15]:
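# Disabled: hold-out split kept for reference only. While this line stays
# commented out, X_train / X_test / y_train / y_test are NOT defined by this cell.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=12)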

In [97]:
# Gradient-boosted regressor with the library's default hyperparameters.
model = xgb.XGBRegressor()

# Evaluation scheme: 10-fold cross-validation, repeated 3 times, fixed seed.
cv = RepeatedKFold(n_repeats=3, n_splits=10, random_state=1)

# Score the model on the full X / y. The scorer reports mean absolute error
# negated (higher is better); n_jobs=-1 requests all available workers.
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    cv=cv,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
)

In [98]:
# The cross-validation scorer yields negated MAE values; take their magnitude
# so the reported error is positive.
scores = np.abs(scores)

# Report the mean and the spread of the per-fold MAE.
mae_mean, mae_std = scores.mean(), scores.std()
print('Mean MAE: %.3f (%.3f)' % (mae_mean, mae_std))

Mean MAE: 1405.163 (48.307)


In [89]:
# Rebuild the modelling data so this cell can run on its own:
# keep listings with a known price of at most 15000 and fill every remaining
# missing value with 0.
priced = df.price.notna() & (df.price <= 15000)
sub_df = df[priced].fillna(0).copy()

# Identifier / free-text columns and the target itself are not used as features.
non_feature_cols = ['id', 'link', 'description', 'title', 'address', 'price']

# One-hot encode the remaining non-numeric columns.
# dtype=float keeps every dummy column numeric: newer pandas releases emit
# bool dummies by default, and mixing those with float columns would turn
# `.values` into an object-dtype array instead of a float matrix.
X = pd.get_dummies(sub_df.drop(columns=non_feature_cols), dtype=float).values
y = sub_df.price.values

# Hold out 30% of the rows for testing; the seed makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=12
)

# xgb.train (the native learning API) consumes DMatrix objects rather than
# raw arrays, so wrap both partitions together with their labels.
train_dmatrix = xgb.DMatrix(data=train_X, label=train_y)
test_dmatrix = xgb.DMatrix(data=test_X, label=test_y)

In [90]:
# Booster configuration: linear base learner, squared-error regression objective.
param = {
    "booster": "gblinear",
    "objective": "reg:squarederror",
}

# Fit for 10 boosting rounds on the training DMatrix, then predict the
# held-out DMatrix.
xgb_r = xgb.train(param, train_dmatrix, num_boost_round=10)
pred = xgb_r.predict(test_dmatrix)

In [86]:
# Show the listing URL of the first row whose price is exactly 1500000.
# The column is selected by its label ("link") instead of by position, so the
# lookup keeps working if the column order changes and the frame is not
# converted to an object array first. Raises IndexError if no row matches.
df.loc[df.price == 1500000, "link"].iloc[0]

'https://www.property24.com/to-rent/victoria-park/somerset-west/western-cape/9020/110979894'

In [91]:
# Root-mean-squared error of the predictions against the held-out targets.
mse_value = MSE(test_y, pred)
rmse = np.sqrt(mse_value)
print("RMSE : % f" % rmse)

RMSE :  2535.319194


In [None]:
# Wrap the training partition in a DMatrix for the native learning API.
# Uses train_X / train_y from the active train_test_split cell; the previously
# referenced X_train / y_train only appear in a commented-out line and would
# raise NameError on a fresh kernel.
dtrain = xgb.DMatrix(train_X, label=train_y)

In [None]:
# Train a booster on `dtrain` with the `param` dictionary defined in the
# parameter cell (the name `params` was never defined in this notebook).
# Only the arguments that matter are passed; the rest keep the library
# defaults. In particular `learning_rates` is omitted because current
# xgb.train no longer accepts that keyword.
xgb.train(param, dtrain, num_boost_round=10)