# Challenge Exercise
Create a neural network that is capable of finding the volume of a cylinder given the radius of its base (r) and its height (h). Assume that the radius and height of the cylinder are both in the range 0.5 to 2.0. Unlike in the challenge exercise for b_estimator.ipynb, assume that your measurements of r, h and V are all rounded off to the nearest 0.1. Simulate the necessary training dataset. This time, you will need a lot more data to get a good predictor.

NOTE: Original estimator had a RSME=0.140241876245 after 10 epochs

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import random
from math import pi
import shutil

print(tf.__version__)

  from ._conv import register_converters as _register_converters


1.8.0


I generate the datasets to a CSV files, so they can be huge and not being saved to MEM. This is just for training purposes, not because its actually needed.

In [36]:
COLUMNS=['radius', 'height', 'volume']

def generate_dataset(dataset_lenght):

  df_train = pd.DataFrame(1.5 * np.random.random_sample((dataset_lenght, 2)) + 0.5, columns = COLUMNS[:2])
  df_valid = pd.DataFrame(1.5 * np.random.random_sample((1000, 2)) + 0.5, columns = COLUMNS[:2])
  df_test = pd.DataFrame(1.5 * np.random.random_sample((1000, 2)) + 0.5, columns = COLUMNS[:2])

  # calculate volumes
  for dset, filename in [(df_train, 'cyl_train.csv'),
                         (df_valid, 'cyl_valid.csv'),
                         (df_test, 'cyl_test.csv'),]:
    volumes = []
    for index, row in dset.iterrows():
      volume = pi * row['radius']**2 * row['height']
      volumes.append(volume)
    dset.insert(2, COLUMNS[2], volumes, False)
    dset.to_csv(filename, index=False, header=False)

generate_dataset(100000)

In [48]:
CSV_COLUMNS = COLUMNS
LABEL_COLUMN = CSV_COLUMNS[2]

# record_defaults: A list of Tensor objects with specific types. Acceptable types are float32, float64, int32, int64, string. 
# One tensor per column of the input record, with either a scalar default value for that column or empty if the column is required.
DEFAULTS = [[], [], []]

def read_dataset(filename, mode, batch_size = 512):
  def _input_fn():
    def decode_csv(value_column):
      columns = tf.decode_csv(value_column, record_defaults = DEFAULTS)
      features = dict(zip(CSV_COLUMNS, columns))
      label = features.pop(LABEL_COLUMN)
      return features, label

    # Create list of file names that match "glob" pattern (i.e. data_file_*.csv)
    filenames_dataset = tf.data.Dataset.list_files(filename)
    # Read lines from text files
    textlines_dataset = filenames_dataset.flat_map(tf.data.TextLineDataset)
    # Parse text lines as comma-separated values (CSV)
    dataset = textlines_dataset.map(decode_csv)
    
    # Round values to 0.1 precission
    dataset = textlines_dataset.map(lambda x: round(x, 1))
    
    # Note:
    # use tf.data.Dataset.flat_map to apply one to many transformations (here: filename -> text lines)
    # use tf.data.Dataset.map      to apply one to one  transformations (here: text line -> feature list)
    
    if mode == tf.estimator.ModeKeys.TRAIN:
        num_epochs = None # loop indefinitely
        dataset = dataset.shuffle(buffer_size = 10 * batch_size)
    else:
        num_epochs = 1 # end-of-input after this

    dataset = dataset.repeat(num_epochs).batch(batch_size)
    
    return dataset.make_one_shot_iterator().get_next()
  return _input_fn
    

def get_train():
  return read_dataset('./cyl_train.csv', mode = tf.estimator.ModeKeys.TRAIN)

def get_valid():
  return read_dataset('./cyl_valid.csv', mode = tf.estimator.ModeKeys.EVAL)

def get_test():
  return read_dataset('./cyl_test.csv', mode = tf.estimator.ModeKeys.EVAL)

In [33]:
INPUT_COLUMNS = [
    tf.feature_column.numeric_column('radius'),
    tf.feature_column.numeric_column('height'),
]

def add_more_features(feats):
  # Nothing to add (yet!)
  return feats

feature_cols = add_more_features(INPUT_COLUMNS)

In [34]:
def print_rmse(model, name, df):
  metrics = model.evaluate(input_fn = make_input_fn(df, 1))
  print('RMSE on {} dataset = {}'.format(name, np.sqrt(metrics['average_loss'])))

In [49]:
tf.logging.set_verbosity(tf.logging.INFO)

OUTDIR = 'cyl_trained'
shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time

model = tf.estimator.DNNRegressor(hidden_units = [32, 8, 2],
      feature_columns = feature_cols, model_dir = OUTDIR)

model.train(input_fn = get_train(), steps = 1000)

print_rmse(model, 'validation', df_valid)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f7bec92df50>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': 'cyl_trained', '_global_id_in_cluster': 0, '_save_summary_steps': 100}


TypeError: a float is required

In [43]:
pcount=5
predictions = model.predict(input_fn = make_prediction_input_fn(df_valid[:pcount], 1))
print df_valid[0:pcount]
for i in xrange(pcount):
  print(predictions.next())
  
  

   radius  height  volume
0     1.6     1.2    10.1
1     0.5     0.8     0.8
2     1.1     1.8     6.8
3     0.5     1.3     1.2
4     0.5     1.3     1.2
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from cyl_trained/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
{'predictions': array([9.538723], dtype=float32)}
{'predictions': array([0.5228757], dtype=float32)}
{'predictions': array([6.858327], dtype=float32)}
{'predictions': array([1.221811], dtype=float32)}
{'predictions': array([1.221811], dtype=float32)}
