# Regression Exercise on House Price

California Housing Data

This data set contains information about all the block groups in California from the 1990 Census. In this sample a block group on average includes 1425.5 individuals living in a geographically compact area. 

The task is to aproximate the median house value of each block from the values of the rest of the variables. 

 It has been obtained from the LIACC repository. The original page where the data set can be found is: http://www.liaad.up.pt/~ltorgo/Regression/DataSets.html.
 

The Features:
 
* housingMedianAge: continuous. 
* totalRooms: continuous. 
* totalBedrooms: continuous. 
* population: continuous. 
* households: continuous. 
* medianIncome: continuous. 
* medianHouseValue: continuous. 

## Libraries

In [40]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
%matplotlib inline

## Data

In [4]:
df = pd.read_csv('./data/cal_housing_clean.csv')
df.head()

Unnamed: 0,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
0,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0


In [6]:
df.columns

Index(['housingMedianAge', 'totalRooms', 'totalBedrooms', 'population',
       'households', 'medianIncome', 'medianHouseValue'],
      dtype='object')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 7 columns):
housingMedianAge    20640 non-null float64
totalRooms          20640 non-null float64
totalBedrooms       20640 non-null float64
population          20640 non-null float64
households          20640 non-null float64
medianIncome        20640 non-null float64
medianHouseValue    20640 non-null float64
dtypes: float64(7)
memory usage: 1.1 MB


In [9]:
df.describe()

Unnamed: 0,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome,medianHouseValue
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,28.639486,2635.763081,537.898014,1425.476744,499.53968,3.870671,206855.816909
std,12.585558,2181.615252,421.247906,1132.462122,382.329753,1.899822,115395.615874
min,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,18.0,1447.75,295.0,787.0,280.0,2.5634,119600.0
50%,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


## Train-Test Split

In [10]:
x_data = df.drop(['medianHouseValue'], axis=1)
x_data.head()

Unnamed: 0,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome
0,41.0,880.0,129.0,322.0,126.0,8.3252
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014
2,52.0,1467.0,190.0,496.0,177.0,7.2574
3,52.0,1274.0,235.0,558.0,219.0,5.6431
4,52.0,1627.0,280.0,565.0,259.0,3.8462


In [11]:
y_label = df['medianHouseValue']
y_label.head()

0    452600.0
1    358500.0
2    352100.0
3    341300.0
4    342200.0
Name: medianHouseValue, dtype: float64

In [14]:
X_train, X_test, y_train, y_test = train_test_split(x_data, y_label, test_size=0.3, random_state=42)
print('X Train: ' + str(X_train.shape))
print('X Test: ' + str(X_test.shape))

X Train: (14448, 6)
X Test: (6192, 6)


### Scale the Feature Data

In [17]:
scaler = MinMaxScaler()
scaler.fit(X_train, y_train)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [25]:
X_train_norm = pd.DataFrame(data=scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_norm = pd.DataFrame(data=scaler.transform(X_test), columns=X_train.columns, index=X_test.index)

In [24]:
X_train_norm.head()

Unnamed: 0,housingMedianAge,totalRooms,totalBedrooms,population,households,medianIncome
7061,0.666667,0.073502,0.061608,0.034054,0.06693,0.250431
14689,0.372549,0.066851,0.082713,0.027916,0.081401,0.162977
17323,0.45098,0.049011,0.043606,0.020404,0.046703,0.255355
10056,0.254902,0.024766,0.023277,0.00838,0.022529,0.179963
15750,1.0,0.097655,0.109714,0.042686,0.109028,0.193577


### Create Feature Columns

In [26]:
df.columns

Index(['housingMedianAge', 'totalRooms', 'totalBedrooms', 'population',
       'households', 'medianIncome', 'medianHouseValue'],
      dtype='object')

In [27]:
age = tf.feature_column.numeric_column('housingMedianAge')
rooms = tf.feature_column.numeric_column('totalRooms')
bedrooms = tf.feature_column.numeric_column('totalBedrooms')
pop = tf.feature_column.numeric_column('population')
households = tf.feature_column.numeric_column('households')
income = tf.feature_column.numeric_column('medianIncome')

In [28]:
feature_cols = [age,rooms,bedrooms,pop,households,income]

Create the input function for the estimator object

In [29]:
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train,y=y_train ,batch_size=10,num_epochs=1000, shuffle=True)

** Create the estimator model. Use a DNNRegressor. Play around with the hidden units! **

In [43]:
dnn_model = tf.estimator.DNNRegressor([6,6,6], feature_columns=feature_cols)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/var/folders/n6/q2m308wn7d3dr7m6xq0pnzrr0000gn/T/tmpu9r5s91k', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x130665a20>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [51]:
dnn_model.train(input_fn=input_func, steps=25000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/n6/q2m308wn7d3dr7m6xq0pnzrr0000gn/T/tmpu9r5s91k/model.ckpt-2500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 2500 into /var/folders/n6/q2m308wn7d3dr7m6xq0pnzrr0000gn/T/tmpu9r5s91k/model.ckpt.
INFO:tensorflow:loss = 192855560000.0, step = 2501
INFO:tensorflow:global_step/sec: 516.724
INFO:tensorflow:loss = 559558400000.0, step = 2601 (0.197 sec)
INFO:tensorflow:global_step/sec: 619.797
INFO:tensorflow:loss = 268163530000.0, step = 2701 (0.160 sec)
INFO:tensorflow:global_step/sec: 714.188
INFO:tensorflow:loss = 280018220000.0, step = 2801 (0.137 sec)
INFO:tensorflow:global_step/sec: 677.741
INFO:tensorflow:loss = 358387840000.0, step = 2901 (0.148 sec)
INFO:tensorflow:global_step/sec: 683.252
INFO:tensorflo

INFO:tensorflow:global_step/sec: 641.219
INFO:tensorflow:loss = 146365870000.0, step = 10001 (0.159 sec)
INFO:tensorflow:global_step/sec: 590.988
INFO:tensorflow:loss = 388317000000.0, step = 10101 (0.168 sec)
INFO:tensorflow:global_step/sec: 579.509
INFO:tensorflow:loss = 292431270000.0, step = 10201 (0.170 sec)
INFO:tensorflow:global_step/sec: 588.025
INFO:tensorflow:loss = 284657200000.0, step = 10301 (0.172 sec)
INFO:tensorflow:global_step/sec: 524.162
INFO:tensorflow:loss = 251142860000.0, step = 10401 (0.189 sec)
INFO:tensorflow:global_step/sec: 633.191
INFO:tensorflow:loss = 358497650000.0, step = 10501 (0.159 sec)
INFO:tensorflow:global_step/sec: 782.791
INFO:tensorflow:loss = 399622050000.0, step = 10601 (0.127 sec)
INFO:tensorflow:global_step/sec: 701.105
INFO:tensorflow:loss = 204441900000.0, step = 10701 (0.144 sec)
INFO:tensorflow:global_step/sec: 647.358
INFO:tensorflow:loss = 52174090000.0, step = 10801 (0.153 sec)
INFO:tensorflow:global_step/sec: 642.532
INFO:tensorflow

INFO:tensorflow:loss = 243906870000.0, step = 17801 (0.168 sec)
INFO:tensorflow:global_step/sec: 712.149
INFO:tensorflow:loss = 235629400000.0, step = 17901 (0.139 sec)
INFO:tensorflow:global_step/sec: 799.341
INFO:tensorflow:loss = 210094230000.0, step = 18001 (0.125 sec)
INFO:tensorflow:global_step/sec: 666.56
INFO:tensorflow:loss = 122441600000.0, step = 18101 (0.152 sec)
INFO:tensorflow:global_step/sec: 701.621
INFO:tensorflow:loss = 130352760000.0, step = 18201 (0.143 sec)
INFO:tensorflow:global_step/sec: 641.45
INFO:tensorflow:loss = 92346000000.0, step = 18301 (0.154 sec)
INFO:tensorflow:global_step/sec: 664.761
INFO:tensorflow:loss = 297186360000.0, step = 18401 (0.151 sec)
INFO:tensorflow:global_step/sec: 661.603
INFO:tensorflow:loss = 46979380000.0, step = 18501 (0.152 sec)
INFO:tensorflow:global_step/sec: 610.669
INFO:tensorflow:loss = 219564540000.0, step = 18601 (0.162 sec)
INFO:tensorflow:global_step/sec: 648.908
INFO:tensorflow:loss = 47988220000.0, step = 18701 (0.154 s

INFO:tensorflow:global_step/sec: 682.799
INFO:tensorflow:loss = 34655363000.0, step = 25701 (0.145 sec)
INFO:tensorflow:global_step/sec: 581.712
INFO:tensorflow:loss = 125275540000.0, step = 25801 (0.172 sec)
INFO:tensorflow:global_step/sec: 634.865
INFO:tensorflow:loss = 154598330000.0, step = 25901 (0.158 sec)
INFO:tensorflow:global_step/sec: 653.424
INFO:tensorflow:loss = 128925290000.0, step = 26001 (0.153 sec)
INFO:tensorflow:global_step/sec: 622.917
INFO:tensorflow:loss = 95425315000.0, step = 26101 (0.161 sec)
INFO:tensorflow:global_step/sec: 619.856
INFO:tensorflow:loss = 248241830000.0, step = 26201 (0.161 sec)
INFO:tensorflow:global_step/sec: 753.442
INFO:tensorflow:loss = 88369140000.0, step = 26301 (0.133 sec)
INFO:tensorflow:global_step/sec: 664.263
INFO:tensorflow:loss = 222081320000.0, step = 26401 (0.151 sec)
INFO:tensorflow:global_step/sec: 730.401
INFO:tensorflow:loss = 149307490000.0, step = 26501 (0.141 sec)
INFO:tensorflow:global_step/sec: 660.935
INFO:tensorflow:l

<tensorflow_estimator.python.estimator.canned.dnn.DNNRegressor at 0x12f9ba438>

Create a prediction input function and then use the .predict method off your estimator model to create a list or predictions on your test data.

In [52]:
predict_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test, y=y_test, num_epochs=1, 
                                                         batch_size=10, shuffle=False)

In [53]:
predictions = dnn_model.predict(predict_input_func)
pred_list = list(predictions)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /var/folders/n6/q2m308wn7d3dr7m6xq0pnzrr0000gn/T/tmpu9r5s91k/model.ckpt-27500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [54]:
pred_list

[{'predictions': array([90726.97], dtype=float32)},
 {'predictions': array([185992.53], dtype=float32)},
 {'predictions': array([383647.22], dtype=float32)},
 {'predictions': array([139353.98], dtype=float32)},
 {'predictions': array([230096.7], dtype=float32)},
 {'predictions': array([137031.05], dtype=float32)},
 {'predictions': array([218612.98], dtype=float32)},
 {'predictions': array([182359.48], dtype=float32)},
 {'predictions': array([99757.96], dtype=float32)},
 {'predictions': array([313286.22], dtype=float32)},
 {'predictions': array([162421.2], dtype=float32)},
 {'predictions': array([153024.72], dtype=float32)},
 {'predictions': array([110828.28], dtype=float32)},
 {'predictions': array([96227.84], dtype=float32)},
 {'predictions': array([218371.11], dtype=float32)},
 {'predictions': array([148283.1], dtype=float32)},
 {'predictions': array([226317.8], dtype=float32)},
 {'predictions': array([151454.67], dtype=float32)},
 {'predictions': array([220237.39], dtype=float32)},


## RMSE

In [55]:
final_pred = []
for pred in pred_list:
    final_pred.append(pred['predictions'])

In [56]:
final_pred

[array([90726.97], dtype=float32),
 array([185992.53], dtype=float32),
 array([383647.22], dtype=float32),
 array([139353.98], dtype=float32),
 array([230096.7], dtype=float32),
 array([137031.05], dtype=float32),
 array([218612.98], dtype=float32),
 array([182359.48], dtype=float32),
 array([99757.96], dtype=float32),
 array([313286.22], dtype=float32),
 array([162421.2], dtype=float32),
 array([153024.72], dtype=float32),
 array([110828.28], dtype=float32),
 array([96227.84], dtype=float32),
 array([218371.11], dtype=float32),
 array([148283.1], dtype=float32),
 array([226317.8], dtype=float32),
 array([151454.67], dtype=float32),
 array([220237.39], dtype=float32),
 array([149424.25], dtype=float32),
 array([348121.], dtype=float32),
 array([313882.97], dtype=float32),
 array([109909.67], dtype=float32),
 array([296651.47], dtype=float32),
 array([112546.81], dtype=float32),
 array([125945.89], dtype=float32),
 array([130539.875], dtype=float32),
 array([209204.23], dtype=float32),


In [57]:
rmse = mean_squared_error(y_test, final_pred) ** 0.5
print('RMSE: ')
print(rmse)

RMSE: 
107408.44855437304
