In [1]:
import numpy as np
import pandas as pd

from pymongo import MongoClient
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score

from rentright.utils.mongo import get_mongoclient

In [2]:
mongoclient = get_mongoclient()
units = mongoclient.scraper.unit.find()
df = pd.DataFrame(list(units))
df.fillna(False, inplace=True)
df = df[(df['price'] < 25000) & (df['sqft'] != 0) & (df['sqft'] < 10000)]

In [3]:
features = list(set(df.columns) - set(['_id', 'description', 'listing_id', 'price', 'title']))

In [4]:
X = df[features]
y = df['price']

In [8]:
gbr = GradientBoostingRegressor(loss='ls', learning_rate=0.01, n_estimators=1000, max_depth=5, subsample=0.5)

In [9]:
def printtable(headers, error, r2):
    header = '|' + '|'.join(headers) + '|'
    align = '|:-:|--:|--:|'
    rows = ['|{}|{:6.2f}|{:6.2f}|'.format(i + 1, error, r2) for i, (error, r2) in enumerate(zip(error,r2))]
    rows = '\n'.join(rows)
    mean = '|Mean|{:6.2f}|{:6.2f}|'.format(np.mean(error), np.mean(r2))
    std = '|Std|{:6.2f}|{:6.2f}|'.format(np.std(error), np.std(r2))
    return '\n'.join([header, align, rows, mean, std])

In [10]:
error = list(map(abs,cross_val_score(gbr, X, y, cv=5, scoring='neg_mean_absolute_error')))
print('Done with the first cross-val run')
r2 = cross_val_score(gbr, X, y, cv=5)

headers = ['Run', 'Error', 'R<sup>2</sup>']    

print(printtable(headers, error, r2))

Done with the first cross-val run
|Run|Error|R<sup>2</sup>|
|:-:|--:|--:|
|1|260.75|  0.43|
|2|224.28|  0.76|
|3|191.94|  0.78|
|4|197.16|  0.68|
|5|188.40|  0.68|
|Mean|212.51|  0.67|
|Std| 27.22|  0.13|
