# Predicting house price using linear regression

## Required modules

In [None]:
import tensorflow as tf
import pandas as pd 
import seaborn as sns
#import tensorflow.feature_column as fc
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras.datasets import boston_housing
from datetime import datetime

## download the Boston housing price dataset
(http://lib.stat.cmu.edu/datasets/boston)

The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic
prices and the demand for clean air', J. Environ. Economics & Management,
vol.5, 81-102, 1978.   Used in Belsley, Kuh & Welsch, 'Regression diagnostics
...', Wiley, 1980.   N.B. Various transformations are used in the table on
pages 244-261 of the latter.

Variables in order:

**CRIM:** per capita crime rate by town  
**ZN:** proportion of residential land zoned for lots over 25,000 sq.ft.  
**INDUS:** proportion of non-retail business acres per town  
**CHAS:** Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)  
**NOX:** nitric oxides concentration (parts per 10 million)  
**RM:** average number of rooms per dwelling  
**AGE:** proportion of owner-occupied units built prior to 1940  
**DIS:** weighted distances to five Boston employment centres  
**RAD:** index of accessibility to radial highways  
**TAX:** full-value property-tax rate per \$10,000  
**PTRATIO:** pupil-teacher ratio by town  
**B:** 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town  
**LSTAT:** % lower status of the population  
**MEDV:** Median value of owner-occupied homes in \$1000's

In [1]:
# load_data() hands back two (features, targets) pairs: train first, then test.
train_split, test_split = boston_housing.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

NameError: name 'boston_housing' is not defined

## create pandas dataframe

In [None]:
# Column labels for the 13 input variables, in the order listed above.
features = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]

# Labelled frames for the inputs of each split.
x_train_df = pd.DataFrame(data=x_train, columns=features)
x_test_df = pd.DataFrame(data=x_test, columns=features)

# Single-column frames holding the target (MEDV) of each split.
y_train_df = pd.Series(y_train, name='MEDV').to_frame()
y_test_df = pd.Series(y_test, name='MEDV').to_frame()

# Preview the first rows of the training inputs.
x_train_df.head()


## inspect the data

In [None]:
# Pairwise scatter plots for a handful of features, with KDE curves on the
# diagonal.
pairplot_columns = ['CRIM', 'ZN', 'INDUS', 'CHAS']
sns.pairplot(x_train_df[pairplot_columns], diag_kind='kde')

In [None]:
# Summary statistics of the training inputs, transposed so that each feature
# becomes a row.
train_stats = x_train_df.describe().T
train_stats

## normalize the train dataset

In [None]:
# Column-wise mean and standard deviation of the training inputs; both are
# kept as globals because the test set is scaled with them as well.
mu, sigma = x_train_df.mean(), x_train_df.std()

# Standardise every feature: subtract its mean, then divide by its spread.
x_train_norm = x_train_df.sub(mu).div(sigma)
x_train_norm.head()


## normalize the test dataset
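The test set is scaled with the same mu and sigma that were computed on the training set, so both sets go through an identical transformation and no statistics of the test data leak into the preprocessing.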

In [None]:
# Apply the training-set statistics (mu, sigma) to the test inputs.
x_test_norm = x_test_df.sub(mu).div(sigma)
x_test_norm.head()

## Create the model

In [None]:
def create_model(n_features=13, n_hidden=128, learning_rate=0.001):
    """Build and compile a small fully connected regression network.

    The network is two ReLU hidden layers of equal width followed by a
    single linear output unit. It is compiled with the RMSprop optimizer,
    mean squared error as the loss, and MAE and MSE as reported metrics.

    Calling ``create_model()`` with no arguments yields the same model as
    before the parameters were introduced.

    Args:
        n_features: Number of input columns expected by the first layer.
        n_hidden: Number of units in each of the two hidden layers.
        learning_rate: Step size passed to the RMSprop optimizer.

    Returns:
        The compiled ``tf.keras`` Sequential model.
    """
    model = tf.keras.models.Sequential()

    # First hidden layer; it also fixes the input width of the network.
    model.add(tf.keras.layers.Dense(
        n_hidden,
        name='layer_1',
        activation='relu',
        input_shape=(n_features,)))

    model.add(tf.keras.layers.Dense(
        n_hidden,
        name='layer_2',
        activation='relu'))

    # A single linear unit: the model predicts one unbounded real value.
    model.add(tf.keras.layers.Dense(
        1,
        name='output_layer',
        activation='linear'
    ))

    optimizer = tf.keras.optimizers.RMSprop(
        learning_rate=learning_rate,
        rho=0.9,
        momentum=0.0,
        epsilon=1e-7
    )

    # 'mse' is both the training loss and a metric, so evaluate() returns
    # three values: loss, mae, mse.
    model.compile(
        loss='mse',
        optimizer=optimizer,
        metrics=['mae', 'mse']
    )

    return model


In [None]:
# Build the compiled network with create_model()'s default settings.
model = create_model()
# Show the layers of the freshly built model.
model.summary()

## train the model

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 128
EPOCHS = 100

# Each run logs into its own sub-directory, named after the start time, so
# that successive runs stay separate under logs/scalars/.
run_stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = "logs/scalars/" + run_stamp
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)

# Options for fit(): 20% of the training data is set aside for validation and
# the TensorBoard callback records the run.
fit_options = dict(
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    validation_split=0.2,
    callbacks=[tensorboard_callback],
)

history = model.fit(x_train_norm, y_train_df, **fit_options)



In [None]:
# Per-epoch training history as a table, with the epoch number appended as
# an extra column.
history_df = pd.DataFrame(history.history).assign(epoch=history.epoch)
history_df.tail()

In [None]:
# Training and validation loss against the epoch number.
epoch_axis = history_df['epoch']
plt.plot(epoch_axis, history_df['loss'], label='Training')
plt.plot(epoch_axis, history_df['val_loss'], linestyle='dashed', label='Validation')

# Optional: clamp the y-axis with plt.ylim([0, 100]) if early epochs dominate.
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend(shadow=False, loc='upper center')

## make predictions

In [None]:
# Predict the target for every row of the normalised test inputs.
test_predictions = model.predict(x_test_norm)

# Create the axes with a 1:1 aspect ratio so the identity line drawn below
# sits at 45 degrees. (plt.axis() does not take an `aspect` keyword; the
# aspect has to be set on the axes themselves via plt.axes().)
a = plt.axes(aspect='equal')

# True values on x, predicted values on y.
plt.scatter(y_test, test_predictions)

# Same range on both axes, plus the y = x reference line: points on the line
# are perfect predictions.
lims = [0, 50]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)

In [None]:
# evaluate() yields the loss followed by the compiled metrics (mae, mse).
test_metrics = model.evaluate(x_test_norm, y_test, verbose=1)
loss, mae, mse = test_metrics

# Report the mean absolute error and the loss on the test set.
report_lines = (
    "Testing set Mean Absolute Error: {:5.2f}".format(mae),
    "Testing set Loss: {:5.2f}".format(loss),
)
print(*report_lines, sep="\n")

## examining metrics in TensorBoard

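Run the following in a notebook cell. The TensorBoard extension must be loaded once with `%load_ext tensorboard` before the `%tensorboard` magic is available; `--logdir` points at the directory the training callback wrote to.

%load_ext tensorboard
%tensorboard --logdir logs/scalars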