In [64]:
import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format='retina'

# data prep
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

# models
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.ensemble import RandomForestRegressor

# metrics
from sklearn.metrics import mean_squared_error

First, keep only the records where ocean_proximity is either `'<1H OCEAN'` or `'INLAND'`

In [42]:
# Load the housing data and keep only the two requested categories.
# The categories are named explicitly (instead of taking the two most frequent
# values) so the filter does not depend on how the rows happen to be distributed.
df = pd.read_csv('../data/housing.csv')
values = ['<1H OCEAN', 'INLAND']
# .copy() yields an independent frame, so overwriting/adding columns in later
# cells does not raise SettingWithCopyWarning.
df = df.query('ocean_proximity in @values').copy()
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
701,-121.97,37.64,32.0,1283.0,194.0,485.0,171.0,6.0574,431000.0,<1H OCEAN
830,-121.99,37.61,9.0,3666.0,711.0,2341.0,703.0,4.6458,217000.0,<1H OCEAN
859,-121.97,37.57,21.0,4342.0,783.0,2172.0,789.0,4.6146,247600.0,<1H OCEAN
860,-121.96,37.58,15.0,3575.0,597.0,1777.0,559.0,5.7192,283500.0,<1H OCEAN
861,-121.98,37.58,20.0,4126.0,1031.0,2079.0,975.0,3.6832,216900.0,<1H OCEAN


In [43]:
# sanity check: only the two requested categories should remain after filtering
df['ocean_proximity'].unique()

array(['<1H OCEAN', 'INLAND'], dtype=object)

Preparation:

* Fill missing values with zeros.
* Apply the log transform to `median_house_value`.
* Do train/validation/test split with 60%/20%/20% distribution. 
* Use the `train_test_split` function and set the `random_state` parameter to 1.
* Use `DictVectorizer(sparse=True)` to turn the dataframes into matrices.

In [44]:
# number of missing values per column, before any imputation
df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        157
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [45]:
# 1. fill nulls with zeros
# total_bedrooms is the only column reporting missing values. A new frame is
# built and re-bound (rather than assigning into the filtered frame through an
# attribute) so pandas does not raise SettingWithCopyWarning.
df = df.fillna({'total_bedrooms': 0})

In [46]:
# re-count missing values per column to confirm the fill worked
df.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [47]:
# log transformation for the median_house_value
# np.log1p(x) computes log(1 + x). assign() returns a new frame with the extra
# column appended, which avoids SettingWithCopyWarning when df is the result
# of an earlier filter.
df = df.assign(mhv_log=np.log1p(df.median_house_value))

In [48]:
# 60/20/20 split in two steps: hold out 20% for test, then take 25% of the
# remaining 80% (i.e. 20% of the whole) for validation
df_full_train, df_test = train_test_split(
    df, random_state=1, test_size=0.2
)
df_train, df_val = train_test_split(
    df_full_train, random_state=1, test_size=0.25
)

In [49]:
# row counts of the train / validation / test parts
len(df_train), len(df_val), len(df_test)

(9411, 3138, 3138)

In [50]:
# the three parts together must account for every row of the filtered frame
len(df) == sum(len(part) for part in (df_train, df_val, df_test))

True

In [51]:
# drop the shuffled original row labels so every split is indexed 0..n-1
df_full_train, df_train, df_val, df_test = (
    frame.reset_index(drop=True)
    for frame in (df_full_train, df_train, df_val, df_test)
)

In [52]:
# first two rows of the test split (index now starts at 0)
df_test.iloc[:2]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,mhv_log
0,-122.02,37.32,27.0,4336.0,754.0,2009.0,734.0,6.3923,348300.0,<1H OCEAN,12.760822
1,-121.32,37.97,43.0,2453.0,490.0,1093.0,438.0,2.9107,88800.0,INLAND,11.394153


In [54]:
# feature columns fed to the vectorizer; the target columns are left out
cols = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity',
]

In [55]:
# Turn each split into a feature matrix plus a log-target vector.
# NOTE(review): the preparation notes ask for DictVectorizer(sparse=True);
# this cell builds dense arrays instead — confirm that is intended.
dv = DictVectorizer(sparse=False)

# targets: the log-transformed house value of each split
y_train = df_train['mhv_log'].values
y_val = df_val['mhv_log'].values
y_test = df_test['mhv_log'].values

# one dictionary per record, restricted to the feature columns
dict_train, dict_val, dict_test = (
    part[cols].to_dict(orient="records")
    for part in (df_train, df_val, df_test)
)

# the vectorizer is fitted on the training records only and then reused
# unchanged for validation and test
X_train = dv.fit_transform(dict_train)
X_val, X_test = dv.transform(dict_val), dv.transform(dict_test)

In [56]:
# number of training targets
len(y_train)

9411

In [57]:
# first five log-transformed targets
y_train[0:5]

array([11.80634853, 12.68168862, 11.61187447, 12.03291497, 11.96145645])

In [58]:
# first five rows of the training feature matrix, all columns
X_train[:5, :]

array([[ 6.8000e+01,  1.3000e+01,  3.7930e+01, -1.2025e+02,  3.3750e+00,
         0.0000e+00,  1.0000e+00,  1.9600e+02,  7.6000e+01,  4.9300e+02],
       [ 2.6800e+02,  1.0000e+01,  3.4000e+01, -1.1840e+02,  5.8083e+00,
         1.0000e+00,  0.0000e+00,  7.0500e+02,  3.3900e+02,  1.5260e+03],
       [ 1.2200e+02,  4.6000e+01,  3.8850e+01, -1.2167e+02,  1.7417e+00,
         0.0000e+00,  1.0000e+00,  4.1000e+02,  1.3100e+02,  6.4500e+02],
       [ 8.8100e+02,  2.4000e+01,  3.3880e+01, -1.1814e+02,  2.6641e+00,
         1.0000e+00,  0.0000e+00,  2.0850e+03,  9.8200e+02,  3.3050e+03],
       [ 2.6400e+02,  3.4000e+01,  3.3920e+01, -1.1811e+02,  4.1767e+00,
         1.0000e+00,  0.0000e+00,  9.8300e+02,  2.6300e+02,  1.4140e+03]])

In [59]:
# (rows, features) of the training matrix
np.shape(X_train)

(9411, 10)

In [60]:
# Column order of the vectorized matrices.
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when available and fall back to
# the old method on older installs. list() keeps the displayed result a list.
(
    list(dv.get_feature_names_out())
    if hasattr(dv, "get_feature_names_out")
    else dv.get_feature_names()
)

['households',
 'housing_median_age',
 'latitude',
 'longitude',
 'median_income',
 'ocean_proximity=<1H OCEAN',
 'ocean_proximity=INLAND',
 'population',
 'total_bedrooms',
 'total_rooms']

## Question 1

Let's train a decision tree regressor to predict the `median_house_value` variable. 

* Train a model with `max_depth=1`.


Which feature is used for splitting the data?

* `ocean_proximity` <- correct
* `total_rooms`
* `latitude`
* `population`

In [61]:
# a depth-1 tree makes exactly one split, which reveals the splitting feature
dt = DecisionTreeRegressor(random_state=1, max_depth=1)
dt.fit(X=X_train, y=y_train)

DecisionTreeRegressor(max_depth=1, random_state=1)

In [62]:
# Print the fitted tree with readable feature names.
# DictVectorizer.get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back on older installs. The names are passed
# as a plain list, which export_text accepts across versions.
print(
    export_text(
        dt,
        feature_names=(
            list(dv.get_feature_names_out())
            if hasattr(dv, "get_feature_names_out")
            else dv.get_feature_names()
        ),
    )
)

|--- ocean_proximity=<1H OCEAN <= 0.50
|   |--- value: [11.61]
|--- ocean_proximity=<1H OCEAN >  0.50
|   |--- value: [12.30]



## Question 2

Train a random forest model with these parameters:

* `n_estimators=10`
* `random_state=1`
* `n_jobs=-1` (optional - to make training faster)


What's the RMSE of this model on validation?

* 0.045
* 0.245 <- correct
* 0.545
* 0.845

In [66]:
# fit a 10-tree forest and report validation RMSE (on the log target), 3 decimals
rf = RandomForestRegressor(
    n_jobs=-1,
    random_state=1,
    n_estimators=10,
)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
np.round(np.sqrt(mean_squared_error(y_val, y_pred)), 3)

0.245

## Question 3

Now let's experiment with the `n_estimators` parameter

* Try different values of this parameter from 10 to 200 with step 10.
* Set `random_state` to `1`.
* Evaluate the model on the validation dataset.


After which value of `n_estimators` does RMSE stop improving?

- 10
- 25
- 50
- 160