<a href="https://colab.research.google.com/github/mmistroni/TensorFlowPlayground/blob/master/California_Housing_Sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
import os
import urllib
import tarfile
from datetime import datetime
    

In [0]:
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = "datasets/housing"
HOUSING_URL = DOWNLOAD_ROOT + HOUSING_PATH + "/housing.tgz"


def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    This is step 1 of the intended workflow (download the original dataset
    from the book repository).  Step 2 (clean the data with a pipeline) and
    step 3 (feed the data to TensorFlow) are not done here.

    Args:
        housing_url: URL of the ``housing.tgz`` archive to download.
        housing_path: Directory that receives the archive and its extracted
            contents; it is created, parents included, when missing.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)

    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)

    # The context manager closes the archive even when extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive, so only point this at a trusted source.
    with tarfile.open(tgz_path) as housing_tgz:
        housing_tgz.extractall(path=housing_path)
    
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` into a DataFrame."""
    housing_csv = os.path.join(housing_path, "housing.csv")
    frame = pd.read_csv(housing_csv)
    return frame
    
def transform_using_pipeline(housing):
    """Impute, extend and standardize the numeric columns of ``housing``.

    Every column except ``ocean_proximity`` is run through median
    imputation, the derived-ratio transformer and standard scaling.  The
    categorical ``ocean_proximity`` column is dropped and is NOT part of the
    result (the original code built a categorical pipeline but never
    applied it, so the returned array is unchanged).

    Args:
        housing: DataFrame containing an ``ocean_proximity`` column.  After
            that column is dropped, the rooms, bedrooms, population and
            households columns are expected at positions 3, 4, 5 and 6.

    Returns:
        The array produced by ``fit_transform`` of the numeric pipeline.
    """
    from sklearn.base import BaseEstimator, TransformerMixin
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    try:
        # Current scikit-learn releases ship the imputer here.
        from sklearn.impute import SimpleImputer as MedianImputer
    except ImportError:
        # Old releases only provide the since-removed preprocessing.Imputer.
        from sklearn.preprocessing import Imputer as MedianImputer

    # Column positions inside the numeric array handed to the transformer.
    rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

    class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
        """Append per-household (and optionally per-room) ratio columns."""

        def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
            self.add_bedrooms_per_room = add_bedrooms_per_room

        def fit(self, X, y=None):
            return self  # nothing else to do

        def transform(self, X, y=None):
            rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
            population_per_household = X[:, population_ix] / X[:, household_ix]
            if self.add_bedrooms_per_room:
                bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
                return np.c_[X, rooms_per_household, population_per_household,
                             bedrooms_per_room]
            return np.c_[X, rooms_per_household, population_per_household]

    class DataFrameSelector(BaseEstimator, TransformerMixin):
        """Pick the named DataFrame columns and return them as an array."""

        def __init__(self, attribute_names):
            self.attribute_names = attribute_names

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            return X[self.attribute_names].values

    print (type(housing))
    housing_num = housing.drop('ocean_proximity', axis=1)
    num_attribs = list(housing_num.columns)

    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', MedianImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    return num_pipeline.fit_transform(housing)
    
    
    
#fetch_housing_data()

def get_housing_data_as_dataframe():
    """Load the housing CSV, print its summary and return the pipeline output."""
    raw_frame = load_housing_data()
    print ('out of here')
    # DataFrame.info() prints the column summary itself.
    raw_frame.info()
    return transform_using_pipeline(raw_frame)
#type(hous)




<h3>Sample Tensorflow on Housing data</h3>

In [0]:
import numpy as np
from sklearn.datasets import fetch_california_housing

# Solve linear regression in closed form with the normal equation.
housing = fetch_california_housing()
m, n  = housing.data.shape
# Prepend a column of ones so the first weight acts as the bias term.
housing_data_plus_bias = np.c_[np.ones((m,1)), housing.data]

X = tf.constant(housing_data_plus_bias, dtype=tf.float32, name="X")
# Targets reshaped into a single column to match the matrix product below.
y = tf.constant(housing.target.reshape(-1,1), dtype=tf.float32, name="y")
XT = tf.transpose(X)
# theta = (X^T X)^-1 X^T y
theta = tf.matmul(tf.matmul(tf.matrix_inverse(tf.matmul(XT, X)), XT), y)

# Nothing is computed until the graph is evaluated inside a session.
with tf.Session() as sess:
    theta_value = theta.eval()
    print (theta_value)
print ('OUt of Here. ')

[[ -3.71851807e+01]
 [  4.36337471e-01]
 [  9.39523336e-03]
 [ -1.07113101e-01]
 [  6.44792199e-01]
 [ -4.03380000e-06]
 [ -3.78137082e-03]
 [ -4.23484027e-01]
 [ -4.37219113e-01]]
OUt of Here. 


<h2> TensorFlow with manual Gradient Descent </h2>

In [0]:
# scaling data first
from sklearn import preprocessing
import numpy as np
from sklearn.datasets import fetch_california_housing


def get_optimizer(learning_rate, gradient=True):
    """Return a TensorFlow optimizer configured with ``learning_rate``.

    A truthy ``gradient`` selects plain gradient descent; otherwise a
    momentum optimizer with momentum 0.9 is returned.
    """
    if not gradient:
        return tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                          momentum=0.9)
    return tf.train.GradientDescentOptimizer(learning_rate=learning_rate)


# Full-dataset training: every step uses all rows, baked into the graph as constants.
housing = fetch_california_housing()
m, n  = housing.data.shape

# Scaling data, to improve performance
scaler = preprocessing.StandardScaler()
scaled_housing_data = scaler.fit_transform(housing.data)
# Prepend a column of ones so the first weight acts as the bias term.
scaled_housing_data_plus_bias = np.c_[np.ones((m,1)), scaled_housing_data]

n_epochs = 1000
learning_rate = 0.01

X = tf.constant(scaled_housing_data_plus_bias, dtype=tf.float32, name="X")
y = tf.constant(housing.target.reshape(-1,1), dtype=tf.float32, name="y")
# One weight per feature plus the bias, initialised uniformly in [-1, 1).
theta = tf.Variable(tf.random_uniform([n+1,1], -1.0, 1.0), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")
# Earlier variants kept for reference:
# hand-derived gradient of the MSE
#gradients = 2/m * tf.matmul(tf.transpose(X), error)
# using autodiff
#gradients = tf.gradients(mse, [theta])[0]
#training_op = tf.assign(theta, theta - learning_rate * gradients)

#Using optimizer
# Passing False makes get_optimizer return the momentum optimizer.
optimizer = get_optimizer(learning_rate, False)
training_op = optimizer.minimize(mse)


init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        # Report the loss every 100 epochs, before that epoch's update.
        if epoch % 100 == 0:
            print ("Epoch", epoch, "mse = ", mse.eval())
        sess.run(training_op)
    # Read the learned weights while the session is still open.
    best_theta = theta.eval()
    
    print ('Best Theta:%s' % best_theta)
        





Epoch 0 mse =  8.18334
Epoch 100 mse =  0.535337
Epoch 200 mse =  0.525473
Epoch 300 mse =  0.524469
Epoch 400 mse =  0.524341
Epoch 500 mse =  0.524323
Epoch 600 mse =  0.524321
Epoch 700 mse =  0.524321
Epoch 800 mse =  0.524321
Epoch 900 mse =  0.524321
Best Theta:[[ 2.06855774]
 [ 0.82963133]
 [ 0.11875387]
 [-0.26554942]
 [ 0.30571482]
 [-0.00450235]
 [-0.03932671]
 [-0.89985853]
 [-0.87051523]]


<h2> Feeding Data to Algorithm (Using Mini-batch Gradient Descent) </h2>

In [0]:
from sklearn import preprocessing
import numpy as np
from sklearn.datasets import fetch_california_housing
# Adding TensorBoard
from datetime import datetime
# Build a per-run log directory named after the current UTC timestamp,
# so each execution writes its TensorBoard events to a separate folder.
now = datetime.utcnow().strftime('%Y%m%d%H%M%S')
root_logdir = '/home/mmistroni/tf_logs'
logdir = '{}/run-{}'.format(root_logdir, now)

def get_optimizer(learning_rate, gradient=True):
    """Return a TensorFlow optimizer configured with ``learning_rate``.

    A truthy ``gradient`` selects plain gradient descent; otherwise a
    momentum optimizer with momentum 0.9 is returned.
    """
    if not gradient:
        return tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                          momentum=0.9)
    return tf.train.GradientDescentOptimizer(learning_rate=learning_rate)


housing = fetch_california_housing()
m, n  = housing.data.shape

# Scaling data, to improve performance
scaler = preprocessing.StandardScaler()
scaled_housing_data = scaler.fit_transform(housing.data)
# Prepend a column of ones so the first weight acts as the bias term.
scaled_housing_data_plus_bias = np.c_[np.ones((m,1)), scaled_housing_data]

# Mini-batch settings: number of batches per epoch, rounded up so that
# n_batches * batch_size is at least m.
batch_size = 1000
n_batches = int(np.ceil(m / batch_size))

def fetch_batch(epoch, batch_index, batch_size):
    """Return a reproducible random ``(X_batch, y_batch)`` pair.

    The NumPy global RNG is reseeded from the epoch/batch position, so the
    same position always selects the same rows.
    """
    step_seed = epoch * n_batches + batch_index
    np.random.seed(step_seed)
    # Row indices are drawn independently from [0, m), so repeats can occur.
    rows = np.random.randint(m, size=batch_size)
    targets = housing.target.reshape(-1, 1)
    return scaled_housing_data_plus_bias[rows], targets[rows]

n_epochs = 1000
learning_rate = 0.01

# Placeholders let every training step be fed a different mini-batch;
# the leading None dimension accepts any batch size.
X = tf.placeholder(tf.float32, shape=(None, n+1), name="X")
y = tf.placeholder(tf.float32, shape=(None, 1), name="y")

# One weight per feature plus the bias, initialised uniformly in [-1, 1).
theta = tf.Variable(tf.random_uniform([n+1,1], -1.0, 1.0), name="theta")
y_pred = tf.matmul(X, theta, name="predictions")
error = y_pred - y
mse = tf.reduce_mean(tf.square(error), name="mse")

# Passing False makes get_optimizer return the momentum optimizer.
optimizer = get_optimizer(learning_rate, False)
training_op = optimizer.minimize(mse)

init = tf.global_variables_initializer()
saver = tf.train.Saver() # Saving data
mse_summary = tf.summary.scalar('MSE', mse)

file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

# try/finally makes sure the event file is closed even when training or
# checkpointing raises.
try:
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(n_epochs):
            for batch_index in range(n_batches):
                X_batch, y_batch = fetch_batch(epoch, batch_index, batch_size)
                batch_feed = {X: X_batch, y: y_batch}
                # Log the loss every 10th batch, before that batch's update.
                if batch_index % 10 == 0:
                    summary_str = mse_summary.eval(feed_dict=batch_feed)
                    step = epoch * n_batches + batch_index
                    file_writer.add_summary(summary_str, step)
                sess.run(training_op, feed_dict=batch_feed)
        best_theta = theta.eval()
        print ('Best Theta:%s' % best_theta)

        save_path = saver.save(sess, '/home/mmistroni/tf_model/my_model_final.ckpt')
        print ('Data saved to:%s' % save_path)
finally:
    file_writer.close()

Best Theta:[[ 2.07177997]
 [ 0.85556298]
 [ 0.10774077]
 [-0.29117864]
 [ 0.3478246 ]
 [-0.00871286]
 [-0.03593429]
 [-0.89287072]
 [-0.88272148]]
Data saved to:/home/mmistroni/tf_model/my_model_final.ckpt


<h2> Run same algorithm using tf.estimator </h2>


In [0]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn import preprocessing
from datetime import datetime
print(tf.__version__)
# Upper bound on training steps (passed as max_steps to the TrainSpec below).
TRAIN_STEPS = 10000
# NOTE(review): not referenced anywhere in the visible code.
PRICE_NORM_FACTOR=1000
# Seconds between checkpoints, also used as the evaluation throttle.
EVAL_INTERVAL = 300
# Template for the per-run model/log directory; '{}' receives a timestamp.
ROOT_DIR = '/home/mmistroni/tf_logs/run-{}'
# Open TODOs:
# 2.1 Next step: create only one function to handle train and evaluate
# 2.2 Create features out of CSV data
# 3.  Learn how to interpret TensorBoard
# 3.1 Find out why there is no output


# Creating a TrainFn and a TestFn
def _train_fn(features, labels, batch_size):
    """Build a zero-argument training input function.

    The returned callable slices ``features``/``labels`` into a dataset,
    repeats it without end and yields batches of ``batch_size`` examples.
    No shuffling is applied.
    """
    def _train():
        """Return the tensors for the next (features, labels) batch."""
        examples = tf.data.Dataset.from_tensor_slices((dict(features), labels))
        # repeat(None) cycles through the data indefinitely.
        batched = examples.repeat(None).batch(batch_size)
        one_shot = batched.make_one_shot_iterator()
        return one_shot.get_next()
    return _train

def _test_fn(features, labels, batch_size):
    """Build a zero-argument evaluation input function.

    The returned callable slices ``features``/``labels`` into a dataset and
    yields batches of ``batch_size`` examples for a single pass over the
    data.  No shuffling is applied.
    """
    def _test():
        """Return the tensors for the next (features, labels) batch."""
        examples = tf.data.Dataset.from_tensor_slices((dict(features), labels))
        # repeat(1) keeps exactly one pass over the data.
        batched = examples.repeat(1).batch(batch_size)
        one_shot = batched.make_one_shot_iterator()
        return one_shot.get_next()
    return _test

def get_estimator(feature_cols, run_config, output_dir, linear=True):
    """Return a regressor estimator writing to ``output_dir``.

    A truthy ``linear`` yields a ``LinearRegressor``; otherwise a
    ``DNNRegressor`` with hidden layers of 64 and 42 units is built.
    Both receive ``feature_cols`` and ``run_config``.
    """
    if not linear:
        return tf.estimator.DNNRegressor(
            model_dir=output_dir,
            feature_columns=feature_cols,
            hidden_units=[64, 42],
            config=run_config)
    return tf.estimator.LinearRegressor(
        feature_columns=feature_cols,
        config=run_config,
        model_dir=output_dir)

    


# Single batch size shared by the training and evaluation input functions.
batch_size = 100
# Version 1. Using california housing
cal_housing = fetch_california_housing()    #get_housing_data_as_dataframe()

# split 80/20 train-test
X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
                                                    cal_housing.target,
                                                    test_size=0.2,
                                                    random_state=1)
feature_names = cal_housing.feature_names

print (type(X_train))
print (X_train.dtype.names)

# The input functions expect a mapping of feature name -> values, so each
# column of the 2-D array is sliced out as its own single-column array.
# Train data
features_train = {fn: X_train[:, [idx]] for idx, fn in enumerate(feature_names)}
labels_train = y_train

# Test data
features_test = {fn: X_test[:, [idx]] for idx, fn in enumerate(feature_names)}
labels_test = y_test

# Building features for TF
feature_columns = [tf.feature_column.numeric_column(colName) for colName in feature_names]

# Checkpoint every EVAL_INTERVAL seconds and keep only the 3 most recent.
run_config = tf.estimator.RunConfig(save_checkpoints_secs=EVAL_INTERVAL,
                                    keep_checkpoint_max=3)

# Each run gets its own timestamped output directory.
output_dir = ROOT_DIR.format(datetime.utcnow().strftime('%Y%m%d%H%M%S'))
estimator = get_estimator(feature_columns, run_config, output_dir, linear=False)

# Raise verbosity BEFORE the first tf.logging call; otherwise the INFO
# message below is filtered out.
tf.logging.set_verbosity(tf.logging.INFO)
tf.logging.info('Executing estimator of type %s', type(estimator))

train_spec = tf.estimator.TrainSpec(
    input_fn=_train_fn(features=features_train,
                       labels=labels_train,
                       batch_size=batch_size),
    max_steps=TRAIN_STEPS)

# Evaluate how the model performs on data it has not yet seen.
eval_spec = tf.estimator.EvalSpec(
    input_fn=_test_fn(features=features_test,
                      labels=labels_test,
                      batch_size=batch_size),
    steps=None,  # no step limit: run until the input function is exhausted
    start_delay_secs=60,  # start evaluating after N seconds
    throttle_secs=EVAL_INTERVAL)

print ('Train and Evaluate....')
# Training/Evaluating:
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)


1.8.0
<class 'numpy.ndarray'>
None
Train and Evaluate....
