# Predicting Housing Prices

## Fire up GraphLab Create

In [219]:
# python standard library
import pickle

# third party
import graphlab
import numpy
import locale
import matplotlib.pyplot as plt
import seaborn


In [220]:
# File name of the pickle that the collected results are written to at the
# end of the notebook.
FEATURES = 'house_sales_features.pkl'

In [221]:
%matplotlib inline

## Load some house sales data

Dataset is from house sales in King County, the region where the city of Seattle, WA is located.

In [222]:
# Load the house-sales data as an SFrame from a path relative to the
# notebook's working directory.
sales = graphlab.SFrame('large_data/home_data.gl/')

## Exploring the data for housing sales 

The house price is correlated with the number of square feet of living space.

In [223]:
# Scatter of living space against sale price, drawn with matplotlib.
# The GraphLab Canvas equivalent is kept for reference but left disabled:
#graphlab.canvas.set_target('ipynb')
#sales.show(view="Scatter Plot", x="sqft_living", y="price")

figure, axe = plt.subplots()
lines = axe.scatter(sales['sqft_living'], sales['price'])
axe.set(xlabel='Living Space (sq. ft.)', ylabel='Price ($)')
# Assigning the return value keeps the Text repr out of the cell output.
title = axe.set_title('Living Space vs Price')

<matplotlib.figure.Figure at 0x7f2544b8ad10>

## Create a simple regression model of sqft_living to price

### Split data into training and testing.  

We use `seed=0` so that everyone running this notebook gets the same results.  In practice, you may set a random seed (or let GraphLab Create pick a random seed for you).  

In [224]:
# Fixed seed passed to the train/test split so the split is repeatable.
RANDOM_SEED = 0

### GraphLab

In [225]:
# Randomly split the sales data in two, passing 0.8 as the split fraction and
# the fixed seed; the first frame is used for training, the second for testing.
train_data, test_data = sales.random_split(.8, seed=RANDOM_SEED)
# Quick check of the training frame's dimensions.
print(train_data.shape)

(17384, 21)


## Build the regression model using only sqft_living as a feature

In [226]:
# Fit a linear regression of price on living space alone, using the training
# split. validation_set=None is passed explicitly, so no validation data is
# handed to the fit.
sqft_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=['sqft_living'],
    validation_set=None
)

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 17384
PROGRESS: Number of features          : 1
PROGRESS: Number of unpacked features : 1
PROGRESS: Number of coefficients    : 2
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | 1         | 2        | 0.004476     | 4349521.926170     | 262943.613754 |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: SUCCESS: Optimal solution found.
PROGRESS:


## Evaluate the simple model

In [227]:
# Mean sale price in the test split, as a yardstick for the error metrics.
print(test_data['price'].mean())

543054.042563


In [228]:
# Evaluate the single-feature model on the held-out test split and print the
# resulting metrics.
evaluation = sqft_model.evaluate(test_data)
print(evaluation)

{'max_error': 4143550.8825285938, 'rmse': 255191.02870527358}


In [229]:
# Results collected throughout the notebook; pickled in the final cell.
# Start with the single-feature model's test metrics and its feature list.
graphlab_data = {
    'sqft_living_model': {
        'max_error': evaluation['max_error'],
        'rmse': evaluation['rmse'],
        'features': ['sqft_living'],
    },
}

RMSE of about \$255,191!

## Let's show what our predictions look like

In [230]:
# Overlay the single-feature model's predictions (red line) on the test
# data (dots), both plotted against living space.
figure, axe = plt.subplots()
lines = axe.plot(test_data['sqft_living'], test_data['price'],
                 '.', label='data')
lines = axe.plot(test_data['sqft_living'], sqft_model.predict(test_data),
                 'r-', label='model')
axe.set(xlabel="Living Space (square feet)", ylabel="Price ($)")
axe.legend()
# Assigning the return value keeps the Text repr out of the cell output.
title = axe.set_title("House Living Space vs Sale Price")


<matplotlib.figure.Figure at 0x7f2544ca2490>

Above:  blue dots are original data, red line is the prediction from the simple regression.

Below: we can view the learned regression coefficients. 

In [231]:
# Fetch the fitted model's coefficient table and print it.
coefficents = sqft_model.get('coefficients')
print(coefficents)

+-------------+-------+----------------+---------------+
|     name    | index |     value      |     stderr    |
+-------------+-------+----------------+---------------+
| (intercept) |  None | -47114.0206702 | 4923.34437753 |
| sqft_living |  None | 281.957850166  | 2.16405465323 |
+-------------+-------+----------------+---------------+
[2 rows x 4 columns]



In [232]:
# Show the first entry of the 'value' column of the coefficient table.
coefficents['value'][0]

-47114.02067021723

In [233]:
# Record the fitted coefficients with the model's other results. This relies
# on the row order of the coefficient table: entry 0 is stored as the
# intercept and entry 1 as the sqft_living coefficient.
graphlab_data['sqft_living_model']['intercept'] = coefficents['value'][0]
graphlab_data['sqft_living_model']['sqft_living'] = coefficents['value'][1]

## Explore other features in the data

To build a more elaborate model, we will explore using more features.

In [234]:
# Feature columns used by the larger, multi-feature regression model.
my_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
]

In [235]:
# Visualise only the selected feature columns with GraphLab's show().
sales[my_features].show()

Canvas is accessible via web browser at the URL: http://localhost:42954/index.html
Opening Canvas in default web browser.


In [236]:
# Box-and-whisker view of price, with zipcode on the x axis.
sales.show(view='BoxWhisker Plot', x='zipcode', y='price')


Canvas is accessible via web browser at the URL: http://localhost:42954/index.html
Opening Canvas in default web browser.


Pull the bar at the bottom to view more of the data.  

98039 is the most expensive zip code.

## Build a regression model with more features

In [237]:
# Fit a second linear regression on the same training split, this time
# using every column listed in my_features.
my_features_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=my_features,
    validation_set=None
)

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 17384
PROGRESS: Number of features          : 6
PROGRESS: Number of unpacked features : 6
PROGRESS: Number of coefficients    : 115
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | 1         | 2        | 0.030271     | 3763208.270523     | 181908.848367 |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: SUCCESS: Optimal solution found.
PROGRESS:


In [238]:
# Show the feature list used by the multi-feature model. Uses the call form
# of print, as in the notebook's other cells; with a single argument it
# prints the same text under Python 2 and is also valid Python 3.
print(my_features)

['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']


## Comparing the results of the simple model with adding more features

In [239]:
# Evaluate both models on the same held-out test split so their error
# metrics can be compared directly. The call form of print is used for
# consistency with the other cells and so the cell is valid Python 3.
print(sqft_model.evaluate(test_data))
my_evaluation = my_features_model.evaluate(test_data)
print(my_evaluation)

{'max_error': 4143550.8825285938, 'rmse': 255191.02870527358}
{'max_error': 3486584.509381705, 'rmse': 179542.4333126903}


In [240]:
# Record the multi-feature model's test metrics and feature list.
# NOTE(review): the key here is 'max error' (with a space), whereas the
# 'sqft_living_model' entry uses 'max_error'. Left as-is because whatever
# reads the pickle may depend on it; confirm and unify.
graphlab_data['my_features_model'] = {
    'max error': my_evaluation['max_error'],
    'rmse': my_evaluation['rmse'],
    'features': my_features,
}

The RMSE goes down from \$255,191 to \$179,542 with more features.

## Apply learned models to predict prices of 3 houses

The first house we will use is considered an "average" house in Seattle. 

In [241]:
# Select the row(s) for the first example house by its id (ids are strings).
id_1 = '5309101200'
house1 = sales[sales['id'] == id_1]

In [242]:
# Record which house id was used for the first example.
graphlab_data['house_1_id'] = id_1

In [243]:
# Display the selected house for inspection.
house1

Columns:
	id	str
	date	datetime
	price	int
	bedrooms	str
	bathrooms	str
	sqft_living	int
	sqft_lot	int
	floors	str
	waterfront	int
	view	int
	condition	int
	grade	int
	sqft_above	int
	sqft_basement	int
	yr_built	int
	yr_renovated	int
	zipcode	str
	lat	float
	long	float
	sqft_living15	float
	sqft_lot15	float

Rows: Unknown

Data:
+------------+---------------------------+--------+----------+-----------+-------------+
|     id     |            date           | price  | bedrooms | bathrooms | sqft_living |
+------------+---------------------------+--------+----------+-----------+-------------+
| 5309101200 | 2014-06-05 00:00:00+00:00 | 620000 |    4     |    2.25   |     2400    |
+------------+---------------------------+--------+----------+-----------+-------------+
+----------+--------+------------+------+-----------+-------+------------+---------------+
| sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement |
+----------+--------+------------+------+-

<img src="http://info.kingcounty.gov/Assessor/eRealProperty/MediaHandler.aspx?Media=2916871">

In [244]:
# Actual sale price of the first example house. Uses the call form of print
# for consistency with the other cells and so the cell is valid Python 3.
print(house1['price'])

[620000, ... ]


In [245]:
# Predict the first house's price with the single-feature model and print
# the first entry of the result.
prediction = sqft_model.predict(house1)
print(prediction[0])

629584.819728


In [246]:
# Record the single-feature model's prediction for the first house.
graphlab_data['sqft_living_model']['prediction 1'] = sqft_model.predict(house1)[0]

In [247]:
# Multi-feature model's prediction for the first house. Uses the call form
# of print for consistency with the other cells and so the cell is valid
# Python 3.
print(my_features_model.predict(house1))

[721918.9333272863]


In this case, the model with more features provides a worse prediction than the simpler model with only 1 feature.  However, on average, the model with more features is better.

## Prediction for a second, fancier house

We will now examine the predictions for a fancier house.

In [248]:
# Select the row(s) for the second example house by its id.
id_2 = '1925069082'
house2 = sales[sales['id'] == id_2]

In [249]:
# Record which house id was used for the second example.
# NOTE(review): this key ('house id 2') follows a different pattern from the
# first house's key ('house_1_id'); confirm which form readers of the pickle
# expect.
graphlab_data['house id 2'] = id_2

In [250]:
# Display the selected house for inspection.
house2

Columns:
	id	str
	date	datetime
	price	int
	bedrooms	str
	bathrooms	str
	sqft_living	int
	sqft_lot	int
	floors	str
	waterfront	int
	view	int
	condition	int
	grade	int
	sqft_above	int
	sqft_basement	int
	yr_built	int
	yr_renovated	int
	zipcode	str
	lat	float
	long	float
	sqft_living15	float
	sqft_lot15	float

Rows: Unknown

Data:
+------------+---------------------------+---------+----------+-----------+-------------+
|     id     |            date           |  price  | bedrooms | bathrooms | sqft_living |
+------------+---------------------------+---------+----------+-----------+-------------+
| 1925069082 | 2015-05-11 00:00:00+00:00 | 2200000 |    5     |    4.25   |     4640    |
+------------+---------------------------+---------+----------+-----------+-------------+
+----------+--------+------------+------+-----------+-------+------------+---------------+
| sqft_lot | floors | waterfront | view | condition | grade | sqft_above | sqft_basement |
+----------+--------+------------+---

<img src="https://ssl.cdn-redfin.com/photo/1/bigphoto/302/734302_0.jpg">

In [251]:
# Single-feature model's prediction for the second house.
print(sqft_model.predict(house2))

[1261170.404099968]


In [252]:
# Record the single-feature model's prediction for the second house.
graphlab_data['sqft_living_model']['prediction 2'] = sqft_model.predict(house2)[0]

In [253]:
# Multi-feature model's prediction for the second house. Uses the call form
# of print for consistency with the other cells and so the cell is valid
# Python 3.
print(my_features_model.predict(house2))

[1446472.4690774973]


In [254]:
# Record the multi-feature model's prediction for the second house.
graphlab_data['my_features_model']['prediction 2'] = my_features_model.predict(house2)[0]

In this case, the model with more features provides a better prediction.  This behavior is expected here, because this house is more differentiated by features that go beyond its square feet of living space, especially the fact that it's a waterfront house. (Note, however, that `waterfront` is not one of the features in `my_features`, so this model does not use it directly.)

## Last house, super fancy

Our last house is a very large one owned by a famous Seattleite.

In [255]:
# Hand-entered, single-row feature set for the last example house. Every
# value is wrapped in a one-element list so the dict can be turned into a
# one-row SFrame.
bill_gates = dict(
    bedrooms=[8],
    bathrooms=[25],
    sqft_living=[50000],
    sqft_lot=[225000],
    floors=[4],
    zipcode=['98039'],
    condition=[10],
    grade=[10],
    waterfront=[1],
    view=[4],
    sqft_above=[37500],
    sqft_basement=[12500],
    yr_built=[1994],
    yr_renovated=[2010],
    lat=[47.627606],
    long=[-122.242054],
    sqft_living15=[5000],
    sqft_lot15=[40000]
)

In [256]:
# Keep the hand-built feature dict in the results under 'bill_gates'.
graphlab_data['bill_gates'] = {'house': bill_gates}

<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/d/d9/Bill_gates%27_house.jpg/2560px-Bill_gates%27_house.jpg">

In [257]:
# Wrap the feature dict in an SFrame, score it with the multi-feature model,
# and print the prediction.
bill_gates_prediction = my_features_model.predict(graphlab.SFrame(bill_gates))
print(bill_gates_prediction)

[13749825.525719076]


In [258]:
# Record the multi-feature model's prediction for the hand-built house.
graphlab_data['my_features_model']['prediction bill gates'] = bill_gates_prediction[0]

The model predicts a price of over \$13M for this house! But we expect the house to cost much more.  (There are very few samples in the dataset of houses that are this fancy, so we don't expect the model to capture a perfect prediction here.)

# Quiz

## Selection and Summary Statistics

### Mean sales price of most expensive zip-code

In [259]:
# Build (zip code, mean sale price) pairs, one per distinct zip code.
zips = sales['zipcode'].unique()
zip_mean = []
for z in zips:
    # Mean price over just the rows whose zipcode equals z.
    zip_mean.append((z, sales[sales['zipcode'] == z]['price'].mean()))

In [260]:
# Highest mean price across all zip codes...
max_mean = max(m for z, m in zip_mean)
# ...and the zip code it belongs to (the last one, should several tie).
zip_code = [z for z, m in zip_mean if m == max_mean][-1]
print(zip_code)

98039


In [261]:
# Start the results entry for the zip code with the highest mean price.
graphlab_data['most expensive zip'] = {'zip code': zip_code}

In [262]:
# Passing '' selects the user's default locale from the environment, so the
# currency symbol and digit grouping below depend on where the notebook runs.
locale.setlocale(locale.LC_ALL, '')
# Format the highest mean price as a grouped currency string and print it.
most_expensive = locale.currency(max_mean, grouping=True)
print(most_expensive)

$2,160,606.60


In [263]:
# Store the raw numeric mean (not the formatted currency string).
graphlab_data['most expensive zip']['mean'] = max_mean

## Filtering Data

### House Square Footage

Select houses where square footage is greater than 2,000 but no larger than 4,000 square feet.

In [264]:
# Keep houses with living space strictly above 2,000 sq. ft. and at most
# 4,000 sq. ft., using one combined boolean mask.
filtered_sales = sales[(sales['sqft_living'] > 2000) &
                       (sales['sqft_living'] <= 4000)]

In [265]:
# Sanity-check the filter bounds: the lower bound is exclusive...
assert min(filtered_sales['sqft_living']) > 2000
# ...and the upper bound is inclusive. This equality only holds when the data
# contains a house of exactly 4,000 sq. ft.
assert max(filtered_sales['sqft_living']) == 4000

### What fraction of all houses has `sqft_living` in this range?

In [266]:
# Share of all houses that fall in the filtered range. float() forces true
# division, since int / int truncates under Python 2.
fraction = len(filtered_sales)/float(len(sales))
print(fraction)

0.421875722945


In [267]:
# Record the fraction of houses in the 2,000-4,000 sq. ft. range.
graphlab_data['sqft_living 2000 - 4000'] = fraction

## Regression Model

In [268]:
# Feature columns for the larger regression model: the six columns used by
# the earlier multi-feature model plus twelve more.
advanced_features = [
    # same columns as the earlier multi-feature model
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'zipcode',
    # additional columns
    'condition',      # condition of house
    'grade',          # measure of quality of construction
    'waterfront',     # waterfront property
    'view',           # type of view
    'sqft_above',     # square feet above ground
    'sqft_basement',  # square feet in basement
    'yr_built',       # the year built
    'yr_renovated',   # the year renovated
    'lat',            # the lat-long of the parcel
    'long',
    'sqft_living15',  # average sq.ft. of 15 nearest neighbors
    'sqft_lot15',     # average lot size of 15 nearest neighbors
]

In [269]:
# Fit a third linear regression on the same training split, using every
# column listed in advanced_features.
adv_model = graphlab.linear_regression.create(
    train_data,
    target='price',
    features=advanced_features,
    validation_set=None
)

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 17384
PROGRESS: Number of features          : 18
PROGRESS: Number of unpacked features : 18
PROGRESS: Number of coefficients    : 127
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | 1         | 2        | 0.057054     | 3469012.450686     | 154580.940736 |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: SUCCESS: Optimal solution found.
PROGRESS:


In [270]:
# Test-set RMSE of the six-feature model and of the advanced model.
my_rmse = my_features_model.evaluate(test_data)['rmse']
adv_rmse = adv_model.evaluate(test_data)['rmse']
# Positive when the advanced model has the lower error. The currency
# formatting uses whatever locale is currently set for the process.
print("RMSE Difference: {0}".format(locale.currency(my_rmse - adv_rmse, grouping=True)))

RMSE Difference: $22,711.32


In [271]:
# Record the advanced model's feature list and its test-set RMSE.
graphlab_data['advanced_model'] = {
    'features': advanced_features,
    'rmse': adv_rmse,
}

In [272]:
# Persist the collected results for use outside this notebook.
# Pickle output is a byte stream, so the file is opened in binary mode
# ('wb'): text mode fails under Python 3 and can corrupt the data on
# platforms that translate newlines. Read it back with open(FEATURES, 'rb').
with open(FEATURES, 'wb') as dumper:
    pickle.dump(graphlab_data, dumper)