In [1]:
def wrapper(learning_rate, dropout):
    """Train and evaluate a small convolutional MNIST classifier.

    All imports are kept inside the function body; the function object itself
    is handed to ``tflauncher.launch`` elsewhere in the notebook.

    Args:
        learning_rate: Initial learning rate fed to the exponential-decay
            schedule that drives the Adam optimizer.
        dropout: Value passed as the second positional argument of
            ``tf.nn.dropout`` while training (the keep probability in the
            TF 1.x API). The evaluation/prediction graph always uses 1.

    Returns:
        None. Results are written under ``tensorboard.logdir()``.
    """
    import tensorflow as tf
    from hops import tensorboard
    from hops import hdfs

    # Training parameters
    num_steps = 200
    batch_size = 128

    # Network parameters
    num_input = 784   # MNIST data input (img shape: 28*28)
    num_classes = 10  # MNIST total classes (0-9 digits)

    train_filenames = [hdfs.project_path() + "mnist/train.tfrecords"]
    validation_filenames = [hdfs.project_path() + "mnist/validation.tfrecords"]

    def conv_net(x_dict, n_classes, dropout, reuse, is_training):
        """Build the network and return its un-normalized logits.

        Args:
            x_dict: Input features. Despite the name, this notebook passes a
                single tensor (see ``data_input_fn``), which is reshaped to
                [-1, 28, 28, 1].
            n_classes: Width of the output layer.
            dropout: Value used for ``tf.nn.dropout`` when ``is_training``.
            reuse: Forwarded to ``tf.variable_scope`` so a second call can
                share the variables created by the first.
            is_training: When falsy, dropout is disabled (value 1).

        Returns:
            Logits tensor produced by the final dense layer.
        """
        # Define a scope for reusing the variables
        with tf.variable_scope('ConvNet', reuse=reuse):
            # Each MNIST example arrives as a flat vector of 784 values.
            # Reshape to picture format: [Batch Size, Height, Width, Channel]
            x = tf.reshape(x_dict, shape=[-1, 28, 28, 1])

            # get_variable returns the existing variable of that name when the
            # scope is in reuse mode, otherwise it creates a new one.
            W1 = tf.get_variable('W1', initializer=tf.truncated_normal([5, 5, 1, 4], stddev=0.1))
            B1 = tf.get_variable('B1', initializer=tf.zeros([4]))
            W2 = tf.get_variable('W2', initializer=tf.truncated_normal([5, 5, 4, 8], stddev=0.1))
            B2 = tf.get_variable('B2', initializer=tf.zeros([8]))
            W3 = tf.get_variable('W3', initializer=tf.truncated_normal([4, 4, 8, 12], stddev=0.1))
            B3 = tf.get_variable('B3', initializer=tf.zeros([12]))
            W4 = tf.get_variable('W4', initializer=tf.truncated_normal([588, 200], stddev=0.1))
            B4 = tf.get_variable('B4', initializer=tf.zeros([200]))
            W5 = tf.get_variable('W5', initializer=tf.truncated_normal([200, n_classes], stddev=0.1))
            B5 = tf.get_variable('B5', initializer=tf.zeros([n_classes]))

            # Dropout is only active on the training graph.
            pkeep = dropout if is_training else 1

            Y1 = tf.nn.relu(tf.nn.conv2d(x, W1, strides=[1, 1, 1, 1], padding="SAME") + B1)
            Y1d = tf.nn.dropout(Y1, pkeep)
            # Stride 2 with SAME padding halves each spatial dimension: 28x28 -> 14x14
            Y2 = tf.nn.relu(tf.nn.conv2d(Y1d, W2, strides=[1, 2, 2, 1], padding="SAME") + B2)
            Y2d = tf.nn.dropout(Y2, pkeep)
            # ... and again: 14x14 -> 7x7
            Y3 = tf.nn.relu(tf.nn.conv2d(Y2d, W3, strides=[1, 2, 2, 1], padding="SAME") + B3)
            Y3d = tf.nn.dropout(Y3, pkeep)
            # 588 = 7 x 7 x 12 (12 output channels)
            Y3d_reshape = tf.reshape(Y3d, shape=[-1, 588])
            Y4 = tf.nn.relu(tf.matmul(Y3d_reshape, W4) + B4)
            Y4d = tf.nn.dropout(Y4, pkeep)

            Ylogits = tf.matmul(Y4d, W5) + B5

        return Ylogits

    def model_fn(features, labels, mode, params):
        """Model function following the TF Estimator template.

        ``params`` is accepted because the Estimator is constructed with
        ``params=hparams``; the hyperparameters actually used here are the
        enclosing ``learning_rate`` and ``dropout`` arguments.
        """
        # Because dropout behaves differently at training and prediction time,
        # build two computation graphs that share the same weights.
        logits_train = conv_net(features, num_classes, dropout, reuse=False, is_training=True)
        logits_test = conv_net(features, num_classes, dropout, reuse=True, is_training=False)

        # Predictions
        pred_classes = tf.argmax(logits_test, axis=1)

        # If prediction mode, early return
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(mode, predictions=pred_classes)

        # Define loss and optimizer
        loss_op = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits_train, labels=tf.cast(labels, dtype=tf.int32)))
        # decay_steps (100000) is far larger than num_steps (200), so the rate
        # stays very close to `learning_rate` for the whole run.
        lr = tf.train.exponential_decay(learning_rate, tf.train.get_global_step(), 100000, 0.96)
        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        train_op = optimizer.minimize(loss_op, global_step=tf.train.get_global_step())

        # Evaluate the accuracy of the model
        acc_op = tf.metrics.accuracy(labels=labels, predictions=pred_classes)

        # Log the first ten images of the batch as an image summary
        image = tf.reshape(features[:10], [-1, 28, 28, 1])
        tf.summary.image("image", image)

        # TF Estimators require an EstimatorSpec that specifies the
        # different ops for training, evaluating, ...
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=pred_classes,
            loss=loss_op,
            train_op=train_op,
            eval_metric_ops={'accuracy': acc_op})

    def data_input_fn(filenames, batch_size=128, shuffle=False, repeat=None):
        """Return an input function that reads (image, label) batches from TFRecords.

        Args:
            filenames: Passed straight to ``TFRecordDataset``.
            batch_size: Batch size, also used as the map output buffer size.
            shuffle: When true, shuffle with a buffer of 128 elements.
            repeat: Forwarded to ``dataset.repeat``.
        """

        def parser(serialized_example):
            """Parses a single tf.Example into image and label tensors."""
            features = tf.parse_single_example(
                serialized_example,
                features={
                    'image_raw': tf.FixedLenFeature([], tf.string),
                    'label': tf.FixedLenFeature([], tf.int64),
                })
            image = tf.decode_raw(features['image_raw'], tf.uint8)
            image.set_shape([num_input])

            # Normalize the values of the image from the range [0, 255] to [-0.5, 0.5]
            image = tf.cast(image, tf.float32) / 255 - 0.5
            label = tf.cast(features['label'], tf.int32)
            return image, label

        def _input_fn():
            # Import MNIST data
            dataset = tf.contrib.data.TFRecordDataset(filenames)

            # Map the parser over dataset, and batch results by up to batch_size
            dataset = dataset.map(parser, num_threads=1, output_buffer_size=batch_size)
            if shuffle:
                dataset = dataset.shuffle(buffer_size=128)
            dataset = dataset.batch(batch_size)
            dataset = dataset.repeat(repeat)
            iterator = dataset.make_one_shot_iterator()

            features, labels = iterator.get_next()

            return features, labels

        return _input_fn

    run_config = tf.contrib.learn.RunConfig(
        model_dir=tensorboard.logdir(),
        save_checkpoints_steps=10,
        save_summary_steps=5,
        log_step_count_steps=10)

    hparams = tf.contrib.training.HParams(
        learning_rate=learning_rate, dropout_rate=dropout)

    # Pass only `scaffold`: SummarySaverHook requires exactly one of
    # scaffold / summary_op, and tf.summary.merge_all() evaluated here (before
    # any model graph has been built) would just be None.
    summary_hook = tf.train.SummarySaverHook(
        save_steps=run_config.save_summary_steps,
        scaffold=tf.train.Scaffold())

    mnist_estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params=hparams
    )

    train_input_fn = data_input_fn(train_filenames[0], batch_size=batch_size)
    eval_input_fn = data_input_fn(validation_filenames[0], batch_size=batch_size)

    experiment = tf.contrib.learn.Experiment(
        mnist_estimator,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        train_steps=num_steps,
        min_eval_frequency=5,
        eval_hooks=[summary_hook]
    )

    experiment.train_and_evaluate()


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1135,application_1511276242554_0472,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [2]:
from hops import util

# Candidate values for each hyperparameter accepted by wrapper()
args_dict = {
    'learning_rate': [0.002, 0.001, 0.0005],
    'dropout': [0.3, 0.6, 0.75],
}

# Expand the candidates into a grid covering every combination
args_dict_grid = util.grid_params(args_dict)

print(args_dict_grid)

{'learning_rate': [0.002, 0.002, 0.002, 0.001, 0.001, 0.001, 0.0005, 0.0005, 0.0005], 'dropout': [0.3, 0.6, 0.75, 0.3, 0.6, 0.75, 0.3, 0.6, 0.75]}

In [3]:
from hops import tflauncher
import timeit
from datetime import datetime


# Run `wrapper` over the hyperparameter grid through tflauncher and time the
# whole call. The returned value is kept for the TensorBoard cell further down.
print("{0} ===== Start".format(datetime.now().isoformat()))
start_time = timeit.default_timer()
tensorboard_hdfs_logdir = tflauncher.launch(spark, wrapper, args_dict_grid)
elapsed = timeit.default_timer() - start_time
# Function-call form of print: same output on Python 2, and valid on Python 3.
print("Elapsed time: " + str(elapsed))
print("{0} ===== Stop".format(datetime.now().isoformat()))

2017-11-29T00:06:41.531667 ===== Start
Finished TensorFlow job 

Make sure to check /Logs/TensorFlow/application_1511276242554_0472/runId.0 for logfile and TensorBoard logdir
Elapsed time: 82.5034301281
2017-11-29T00:08:04.035736 ===== Stop

Optimal parameters: 0.002 and 0.75

In [5]:
from hops import tensorboard

# Visualize all TensorBoard events for the jobs in the same TensorBoard.
# `tensorboard_hdfs_logdir` is the value returned by tflauncher.launch(...) in
# an earlier cell, so that cell must have been run in this kernel first.
tensorboard.visualize(spark, tensorboard_hdfs_logdir)

An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job 2 cancelled 
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1517)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:1457)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1704)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1687)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1676)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
	a