In [None]:
# Set the default matplotlib figure size to 10 x 5 (width, height in inches)
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]

# The current version of seaborn generates a bunch of warnings that we'll ignore.
# NOTE: this filter has no category or module restriction, so it silences every
# Python warning for the rest of the session, not only the ones from seaborn.
# It is installed before the imports below so it is already active when they run.
import warnings
warnings.filterwarnings("ignore")

# statsmodels for model fitting, seaborn for plotting, pandas for data loading
import statsmodels.api as sm
import seaborn as sns
from pandas import DataFrame, read_excel

# Models for the housing market

This dataset contains information on 506 cases collected by the U.S. Census Service concerning housing in the area of Boston, Massachusetts.

Toolkit: 
  * **Seaborn, statistical data visualization [docs](https://seaborn.pydata.org/api.html)**
  * **Statsmodels, statistical models in python [docs](https://www.statsmodels.org/stable/index.html)**

```
CRIM - per capita crime rate by town
ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
INDUS - proportion of non-retail business acres per town.
CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
NOX - nitric oxides concentration (parts per 10 million)
RM - average number of rooms per dwelling
AGE - proportion of owner-occupied units built prior to 1940
DIS - weighted distances to five Boston employment centres
RAD - index of accessibility to radial highways
TAX - full-value property-tax rate per $10,000
PTRATIO - pupil-teacher ratio by town
B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
LSTAT - percentage of the population of lower status
MEDV - Median value of owner-occupied homes in $1000's
```

In [None]:
# Load the Boston housing spreadsheet (path is relative to the notebook's
# working directory) into the DataFrame used by every cell that follows.
df = read_excel(io='../data/boston-dataset.xls')

In [None]:
# Pairwise correlation matrix of all columns, drawn as a heatmap with the
# coefficient printed in each cell. Assigning the returned Axes to f1 keeps
# its repr out of the cell output.
f1 = sns.heatmap(data=df.corr(), annot=True)

In [None]:
# Scatter plot with a fitted regression line. The predictor RM (average number
# of rooms per dwelling) goes on the x-axis and the response MEDV (median home
# value in $1000's) on the y-axis, so the plotted line corresponds to the
# notebook's OLS fits, which use MEDV as the dependent variable.
# Assigning the returned Axes to f2 keeps its repr out of the cell output.
f2 = sns.regplot(x='RM', y='MEDV', data=df)

In [None]:
# Simple linear regression: MEDV explained by RM plus an intercept.
y = df['MEDV']
# add_constant adds the intercept column; sm.OLS does not include one itself.
x = sm.add_constant(df['RM'])

# statsmodels expects the response (endog) first and the regressors (exog)
# second, hence the explicit keywords.
model = sm.OLS(endog=y, exog=x).fit()
# In-sample fitted values for the rows the model was trained on.
predictions = model.predict(x)

# Last expression of the cell: renders the regression summary table.
model.summary()

In [None]:
# Multiple linear regression: MEDV explained by RM, TAX and LSTAT plus an
# intercept.
y = df['MEDV']
# add_constant adds the intercept column; sm.OLS does not include one itself.
x = sm.add_constant(df[['RM', 'TAX', 'LSTAT']])

# statsmodels expects the response (endog) first and the regressors (exog)
# second, hence the explicit keywords.
model = sm.OLS(endog=y, exog=x).fit()
# In-sample fitted values for the rows the model was trained on.
predictions = model.predict(x)

# Last expression of the cell: renders the regression summary table.
model.summary()