In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns

from pymongo import MongoClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from textstat.textstat import textstat

from rentright.utils.mongo import get_mongoclient

In [2]:
mongoclient = get_mongoclient()

In [3]:
units = mongoclient.scraper.unit.find()

In [4]:
units.count()

23896

In [5]:
# Pull out t he  descriptions  and  titles
# descs = [unit['description'] for unit in units]
# titles = [unit['title'] for unit in units]
# units.rewind()

In [6]:
u = units[0]

In [7]:
dct = dict(u)

In [8]:
dct.popitem()

('num_images', 12)

In [9]:
dct.pop('_id')

ObjectId('59f0c620be110116cbdc7a42')

In [10]:
# for unit in units:
#     meta_data = stats()
    
    

In [11]:
unit = units[2]

In [12]:
lat = unit['latitude']
log = unit['longitude']

In [13]:
unit

{'_id': ObjectId('59f0c620be110116cbdc7a44'),
 'apartment': True,
 'attached garage': True,
 'bathrooms': 1.0,
 'bedrooms': 1,
 'cats are OK - purrr': True,
 'description': "\n \n QR Code Link to This Post \n \n \n Call or TEXT us for any info or to Schedule a Tour!  Live at a Seattle ICON  Tower 801 is shaping the future of Seattle's vibrant downtown lifestyle! It's hard not to notice this uniquely circular tower located in the heart of Seattle's entertainment and high end shopping district. Inside you will find spacious, renovated apartment homes that flow naturally to outside decks with every room. THE ELEGANCE OF THE CIRCLE Tower 801's shape is a stroke of design genius. Each apartment becomes wider as you move from hallway into the living spaces. No space is wasted. Enjoy a balcony off every room, along with panoramic views, day and night, overlooking the 24-hour lifecycle of the West Coast's most beautiful city.   Call Now:  \n show contact info \n x 59 OR Text  59  to  \n show c

In [14]:
print(lat)
print(log)

47.611400
-122.330500


In [15]:
units_list = list(units)

In [16]:
df = pd.DataFrame(units_list)

In [17]:
df.fillna(False, inplace=True)
df = df[(df['price'] < 25000) & (df['sqft'] != 0) & (df['sqft'] < 10000)]

In [18]:
features = list(set(df.columns) - set(['_id', 'description', 'listing_id', 'price', 'title']))

In [19]:
X = df[features]
y = df['price']

In [53]:
regr = RandomForestRegressor(max_depth=12000, criterion='mae')
regr.fit(X, y)
print(regr.feature_importances_)

[  9.76894627e-04   3.23883072e-01   0.00000000e+00   0.00000000e+00
   4.16134458e-06   5.12454170e-03   5.14511386e-03   0.00000000e+00
   3.59547905e-03   0.00000000e+00   8.04402269e-04   1.11173224e-03
   3.88238557e-02   0.00000000e+00   4.16129350e-04   2.16792690e-03
   3.32949081e-02   0.00000000e+00   9.59686954e-04   1.07835505e-04
   7.68200056e-03   5.83373824e-03   0.00000000e+00   2.84566288e-01
   0.00000000e+00   1.08504047e-04   7.72175648e-02   6.95452848e-04
   9.87593884e-03   5.51037915e-04   1.57107940e-03   2.62442611e-03
   2.89546912e-03   1.09260187e-01   8.07025727e-02]


In [54]:
def printtable(headers, error, r2):
    header = '|' + '|'.join(headers) + '|'
    align = '|:-:|--:|--:|'
    rows = ['|{}|{:6.2f}|{:6.2f}|'.format(i + 1, error, r2) for i, (error, r2) in enumerate(zip(error,r2))]
    rows = '\n'.join(rows)
    mean = '|Mean|{:6.2f}|{:6.2f}|'.format(np.mean(error), np.mean(r2))
    std = '|Std|{:6.2f}|{:6.2f}|'.format(np.std(error), np.std(r2))
    return '\n'.join([header, align, rows, mean, std])

In [55]:
error = list(map(abs,cross_val_score(regr, X, y, cv=5, scoring='neg_mean_absolute_error')))
r2 = cross_val_score(regr, X, y, cv=5)

headers = ['Run', 'Error', 'R<sup>2</sup>']    

print(printtable(headers, error, r2))

|Run|Error|R<sup>2</sup>|
|:-:|--:|--:|
|1|104.32|  0.92|
|2|136.48|  0.83|
|3|127.91|  0.82|
|4| 95.63|  0.88|
|5|100.07|  0.92|
|Mean|112.88|  0.87|
|Std| 16.23|  0.04|


In [43]:
df.price.mean()

1908.3911329592356

In [44]:
zip98122 = df[df['zipcode'] == '98122']
X = zip98122[features]
y = zip98122['price']

In [45]:
error = list(map(abs,cross_val_score(regr, X, y, cv=5, scoring='neg_mean_absolute_error')))
r2 = cross_val_score(regr, X, y, cv=5)

headers = ['Run', 'Error', 'R<sup>2</sup>']    

print(printtable(headers, error, r2))

|Run|Error|R<sup>2</sup>|
|:-:|--:|--:|
|1|147.55|  0.84|
|2|119.62|  0.88|
|3|147.24|  0.85|
|4|100.21|  0.91|
|5|124.96|  0.89|
|Mean|127.92|  0.88|
|Std| 17.91|  0.02|


In [46]:
zip98122.price.mean()

2060.0065693430656

In [47]:
zip98102 = df[df['zipcode'] == '98102']
X = zip98102[features]
y = zip98102['price']

In [48]:
error = list(map(abs,cross_val_score(regr, X, y, cv=5, scoring='neg_mean_absolute_error')))
r2 = cross_val_score(regr, X, y, cv=5)

headers = ['Run', 'Error', 'R<sup>2</sup>']    

print(printtable(headers, error, r2))

|Run|Error|R<sup>2</sup>|
|:-:|--:|--:|
|1|113.40|  0.92|
|2|149.16|  0.85|
|3|117.42|  0.82|
|4| 92.90|  0.89|
|5|102.68|  0.91|
|Mean|115.11|  0.88|
|Std| 19.06|  0.04|


In [49]:
zip98102.price.mean()

2102.9175392670159