# Load the Data

In [150]:
# python standard library
import pickle

# third party
import matplotlib.pyplot as plot
import numpy
import pandas
import pylab
import seaborn
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, explained_variance_score

from tabulate import tabulate

In [156]:
%matplotlib inline
pylab.rcParams['figure.figsize'] = (10, 8)

FEATURES = 'house_sales_features.pkl'
features_map = pickle.load(open(FEATURES))

In [3]:
# Load the house-sales table; the path is relative to the notebook's working directory.
data = pandas.read_csv('large_data/home_data.csv')

## Split Data

In [26]:
# Hold out 20% of the rows for testing. The fixed seed makes the split (and
# every number derived from it) reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop('price', axis=1),  # every column except the target
    data.price,
    test_size=.2,
    random_state=0)

In [34]:
# Sanity-check the split: the feature frames must not contain the target, the
# target series must be the price column, and each pair must line up row-wise.
for features, target in ((x_train, y_train), (x_test, y_test)):
    assert 'price' not in features.columns
    assert target.name == 'price'
    assert len(features) == len(target)
# The training portion should hold 80% of the rows (rounded down).
assert len(x_train) == int(data.price.count() * .8)

# Explore the Data

In [4]:
# Summary statistics (count, mean, std, quartiles, extremes) for each numeric column.
data.describe()

                 id           price      bedrooms     bathrooms   sqft_living  \
count  2.161300e+04    21613.000000  21613.000000  21613.000000  21613.000000   
mean   4.580302e+09   540088.141905      3.370842      2.114757   2079.899736   
std    2.876566e+09   367127.195968      0.930062      0.770163    918.440897   
min    1.000102e+06    75000.000000      0.000000      0.000000    290.000000   
25%    2.123049e+09   321950.000000      3.000000      1.750000   1427.000000   
50%    3.904930e+09   450000.000000      3.000000      2.250000   1910.000000   
75%    7.308900e+09   645000.000000      4.000000      2.500000   2550.000000   
max    9.900000e+09  7700000.000000     33.000000      8.000000  13540.000000   

             sqft_lot        floors    waterfront          view     condition  \
count    21613.000000  21613.000000  21613.000000  21613.000000  21613.000000   
mean     15106.967566      1.494309      0.007542      0.234303      3.409430   
std      41420.511515      

In [5]:
# (rows, columns) of the loaded table.
data.shape

(21613, 21)

In [38]:
# True if at least one column contains a missing value.
any(data[column].hasnans for column in data.columns)

False

The target variable is `price`.

In [10]:
# Histogram/density of the sale prices with a vertical marker at the median.
prices = data.price
axe = seaborn.distplot(prices)
axe.axvline(prices.median())
title = axe.set_title('Distribution of housing prices in King County')

<matplotlib.figure.Figure at 0x7f121df056d0>

In [13]:
# Box plot of the sale prices.
axe = seaborn.boxplot(data.price)

<matplotlib.figure.Figure at 0x7f121dc8fa10>

In [14]:
# Violin plot of the sale prices.
axe = seaborn.violinplot(data.price)

<matplotlib.figure.Figure at 0x7f121dbb1990>

The plots show that the price distribution is right-skewed, with a significant number of high-priced outliers.

In [7]:
# One distribution plot per square-footage column, each on its own figure,
# with a vertical line marking that column's median.
sqft_columns = [name for name in data.columns if name.startswith('sqft')]
for column in sqft_columns:
    figure = plot.figure()
    axe = seaborn.distplot(data[column], ax=figure.gca())
    axe.axvline(data[column].median())

<matplotlib.figure.Figure at 0x7f121d578ad0>

<matplotlib.figure.Figure at 0x7f121d75aa90>

<matplotlib.figure.Figure at 0x7f121d8bc750>

<matplotlib.figure.Figure at 0x7f121da9f710>

<matplotlib.figure.Figure at 0x7f121dc03710>

<matplotlib.figure.Figure at 0x7f121e0bf090>

In [8]:
# One box plot per square-footage column, each on its own figure.
for column in (c for c in data.columns if c.startswith('sqft')):
    figure = plot.figure()
    axe = figure.gca()
    # Draw on the axes created above instead of relying on pyplot's implicit
    # "current axes", and pass the values by keyword so they cannot be taken
    # for a different positional parameter.
    seaborn.boxplot(x=data[column], ax=axe)

<matplotlib.figure.Figure at 0x7f121d3d4710>

<matplotlib.figure.Figure at 0x7f121d867210>

<matplotlib.figure.Figure at 0x7f121d5dd850>

<matplotlib.figure.Figure at 0x7f121d5b5bd0>

<matplotlib.figure.Figure at 0x7f121d300b10>

<matplotlib.figure.Figure at 0x7f121e04db50>

# Create sqft_living Model

In [48]:
# Simple linear regression of price on living area alone.
sqft_model = LinearRegression(fit_intercept=True)
# The estimator is given a single-column 2-D array. ``Series.reshape`` was
# deprecated and later removed from pandas, so reshape the underlying numpy
# array instead.
sqft_living = x_train.sqft_living.values.reshape(-1, 1)
sqft_model.fit(sqft_living, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [77]:
# Predictions for the same living-area values the model was fitted on.
y_fit_train = sqft_model.predict(sqft_living)

In [78]:
# Training points with the fitted regression line drawn over them.
figure, axe = plot.subplots()
lines = axe.plot(x_train.sqft_living, y_train, '.', label='training data')
lines = axe.plot(sqft_living, y_fit_train, label='model')
axe.set_ylabel('Price of House ($)')
axe.set_xlabel('Living Area (square feet)')
axe.legend(loc='lower right')
title = axe.set_title("SQ. Foot Living Model (Training Data)")

<matplotlib.figure.Figure at 0x7f121d320f10>

In [66]:
# Report the fitted line's parameters.
slope, intercept = sqft_model.coef_[0], sqft_model.intercept_
print('slope: {0}'.format(slope))
print('intercept: {0}'.format(intercept))

slope: 279.705255561
intercept: -42540.7511258


The slope indicates that the predicted price of a house rises by roughly $279.71 for every additional square foot of living area.

## Test Data

In [79]:
# Single-column 2-D array of the held-out living areas. ``Series.reshape`` was
# deprecated and later removed from pandas, so reshape the numpy values.
test_sqft_living = x_test.sqft_living.values.reshape(-1, 1)
# Predictions of the training-fitted model on the held-out rows.
y_fit = sqft_model.predict(test_sqft_living)

In [80]:
# Held-out points with the model's predictions drawn over them.
figure, axe = plot.subplots()
line = axe.scatter(x_test.sqft_living, y_test, label='Test Data')
line = axe.plot(x_test.sqft_living, y_fit, label='model')
axe.set_xlabel('Living Area (square feet)')
axe.set_ylabel('Price ($)')
axe.legend()
title = axe.set_title("SQ Foot Living Model (Test Data)")

<matplotlib.figure.Figure at 0x7f121d573b10>

In [173]:
# 10-fold cross-validation of the living-area model over the whole data set.
model = LinearRegression(fit_intercept=True)
# ``Series.reshape`` was deprecated and later removed from pandas; reshape the
# underlying numpy array to get the single-column 2-D input.
sqft_living = data.sqft_living.values.reshape(-1, 1)
scores = cross_val_score(model, sqft_living, data.price, cv=10)
predictions = cross_val_predict(model, sqft_living, data.price, cv=10)
# The reported interval is two standard deviations of the per-fold scores.
print("R2: {0:.2f} +/- {1:.2f}".format(scores.mean(), 2 * scores.std()))

R2: 0.49 +/- 0.03


In [117]:
figure = plot.figure()
axe = figure.gca()
# Read the living areas straight from the frame so this cell does not depend
# on which earlier cell last re-bound the ``sqft_living`` global.
living_area = data.sqft_living
lines = axe.plot(living_area, data.price, '.', label='data')
# The "model" line simply joins the smallest and largest cross-validated
# predictions across the observed range of living areas.
lines = axe.plot([living_area.min(), living_area.max()],
                 [predictions.min(), predictions.max()], label='model')
axe.set_xlabel('Living Area (square feet)')
axe.set_ylabel('Price ($)')
# Without this call the labels given above are never displayed.
axe.legend(loc='lower right')
title = axe.set_title("Cross-Validation Model")

<matplotlib.figure.Figure at 0x7f121db09310>

In [128]:
# Predictions of the training-fitted model for every row in the data set.
# ``Series.reshape`` was deprecated and later removed from pandas, so reshape
# the underlying numpy array instead.
sqft_model.predict(data.sqft_living.values.reshape(-1, 1))

array([ 287511.45043663,  676301.75566694,  172832.29565647, ...,
        242758.60954681,  404987.65777241,  242758.60954681])

In [154]:
# Summary metrics for the cross-validated predictions.
rmse = numpy.sqrt(mean_squared_error(data.price, predictions))
# Largest absolute residual: an over-prediction counts as much as an
# under-prediction, and the vectorised reduction avoids iterating the Series
# with the builtin max().
max_error = numpy.abs(data.price - predictions).max()
print(tabulate([['Max Error', max_error],
                ['RMSE', rmse],
                ["r^2", r2_score(data.price, predictions)],
                ["explained variance", explained_variance_score(data.price, predictions)]],
               tablefmt='orgtbl',
               headers="Metric Value".split()))

| Metric             |            Value |
|--------------------+------------------|
| Max Error          |      4.40341e+06 |
| RMSE               | 261627           |
| r^2                |      0.492132    |
| explained variance |      0.492132    |


# Create my_features Model

In [159]:
# Show the entry stored under 'my_features_model' in the unpickled mapping.
features_map['my_features_model']

{'features': ['bedrooms',
  'bathrooms',
  'sqft_living',
  'sqft_lot',
  'floors',
  'zipcode'],
 'max error': 3486584.509381705,
 'prediction 2': 1446472.4690774973,
 'prediction bill gates': 13749825.525719076,
 'rmse': 179542.4333126903}

In [160]:
# Unfitted estimator for the multi-feature model; nothing is trained in this cell.
my_features_model = LinearRegression(fit_intercept=True)

# Compare sqft_living to my_features

# Predict 'bill_gates' price

# Quiz

## Find the mean sales price of the most expensive zip code

## Find houses with sqft_living between 2,000 and 4,000 square feet

What fraction of the houses fall in this range?

# Create advanced_features model

## Compare the three models