# Fashion MNIST
Requires the tfrecords files (available on GitHub, in the hops_notebooks folder) to be placed in a folder called mnist inside the project's datasets (create the folder if it does not exist yet).

In [1]:
# Marker written immediately before and after the accuracy value in the log,
# so the value can later be located by searching the log text for this string.
sep=",;,;,;"
def wrapper_mnist(learning_rate, dropout):
    """Train and evaluate a small convolutional network and log its accuracy.

    Builds a tf.estimator.Estimator around ``model_fn``, runs
    ``train_and_evaluate()`` through a tf.contrib.learn.Experiment for
    ``num_steps`` training steps, then calls ``Estimator.evaluate`` once more
    and writes the resulting accuracy to the hops log wrapped in ``sep``.

    Args:
        learning_rate: learning rate handed to tf.train.AdamOptimizer.
        dropout: value handed as ``rate`` to tf.layers.dropout.

    Returns:
        Nothing; the accuracy is only emitted through ``hdfs.log``.
    """

    # All imports are local to the function -- presumably because the function
    # is shipped as a unit to the launcher's workers (confirm with tflauncher).
    import tensorflow as tf
    import numpy as np
    from hops import tensorboard
    from hops import hdfs

    # Training Parameters
    num_steps = 20
    batch_size = 128

    # Network Parameters
    num_input = 784 # MNIST data input (img shape: 28*28)
    num_classes = 10 # MNIST total classes (0-9 digits)

    # Input files are resolved relative to the HDFS project path.
    train_filenames = [hdfs.project_path() + "mnist/train.tfrecords"]
    validation_filenames = [hdfs.project_path() + "mnist/validation.tfrecords"]

    # Create the neural network
    # TF Estimator input is a dict, in case of multiple inputs
    def conv_net(x, n_classes, dropout, reuse, is_training):
        """Build the network graph and return the un-normalised class scores.

        Args:
            x: input batch; it is reshaped to [-1, 28, 28, 1] below.
            n_classes: number of units of the output layer.
            dropout: ``rate`` argument for tf.layers.dropout.
            reuse: forwarded to tf.variable_scope so a second call can share
                the variables created by the first.
            is_training: forwarded as ``training`` to tf.layers.dropout.

        Returns:
            The output of the final dense layer (no activation is passed to it).
        """

        # Define a scope for reusing the variables
        with tf.variable_scope('ConvNet', reuse=reuse):

            # MNIST data input is a 1-D vector of 784 features (28*28 pixels)
            # Reshape to match picture format [Height x Width x Channel]
            # Tensor input become 4-D: [Batch Size, Height, Width, Channel]
            x = tf.reshape(x, shape=[-1, 28, 28, 1])

            # Convolution Layer with 32 filters and a kernel size of 5
            conv1 = tf.layers.conv2d(x, 32, 5, activation=tf.nn.relu)
            # Max Pooling (down-sampling) with strides of 2 and kernel size of 2
            conv1 = tf.layers.max_pooling2d(conv1, 2, 2)

            # Convolution Layer with 64 filters and a kernel size of 3
            conv2 = tf.layers.conv2d(conv1, 64, 3, activation=tf.nn.relu)
            # Max Pooling (down-sampling) with strides of 2 and kernel size of 2
            conv2 = tf.layers.max_pooling2d(conv2, 2, 2)

            # Flatten the data to a 1-D vector for the fully connected layer
            fc1 = tf.contrib.layers.flatten(conv2)

            # Fully connected layer with 1024 units (no activation is passed)
            fc1 = tf.layers.dense(fc1, 1024)
            # Apply Dropout (if is_training is False, dropout is not applied)
            fc1 = tf.layers.dropout(fc1, rate=dropout, training=is_training)

            # Output layer, class prediction
            out = tf.layers.dense(fc1, n_classes)

        return out


    # Define the model function (following TF Estimator Template)
    def model_fn(features, labels, mode, params):
        """Estimator model function: build the graph and return an EstimatorSpec.

        NOTE(review): ``params`` is accepted but never read; ``dropout`` and
        ``learning_rate`` come from the enclosing ``wrapper_mnist`` arguments.
        """

        # Build the neural network
        # Because Dropout have different behavior at training and prediction time, we
        # need to create 2 distinct computation graphs that still share the same weights.
        logits_train = conv_net(features, num_classes, dropout, reuse=False, is_training=True)
        logits_test = conv_net(features, num_classes, dropout, reuse=True, is_training=False)

        # Predictions
        pred_classes = tf.argmax(logits_test, axis=1)
        # NOTE(review): pred_probas is computed but not used anywhere below.
        pred_probas = tf.nn.softmax(logits_test)

        # If prediction mode, early return
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(mode, predictions=pred_classes)

        # Define loss and optimizer
        # The loss is computed on the training-mode logits (dropout active).
        loss_op = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits_train, 
                                                                                labels=tf.cast(labels, dtype=tf.int32)))
        
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        train_op = optimizer.minimize(loss_op, global_step=tf.train.get_global_step())

        # Evaluate the accuracy of the model
        # Accuracy uses the predictions from the inference-mode logits.
        acc_op = tf.metrics.accuracy(labels=labels, predictions=pred_classes)

        # Image summary built from the first 10 rows of the feature batch.
        image = tf.reshape(features[:10], [-1, 28, 28, 1])
        tf.summary.image("image", image)

        # TF Estimators requires to return a EstimatorSpec, that specify
        # the different ops for training, evaluating, ...
        estim_specs = tf.estimator.EstimatorSpec(
          mode=mode,
          predictions=pred_classes,
          loss=loss_op,
          train_op=train_op,
          eval_metric_ops={'accuracy': acc_op})

        return estim_specs


    def data_input_fn(filenames, batch_size=128, shuffle=False, repeat=None):
        """Return a zero-argument input function reading ``filenames``.

        Args:
            filenames: passed unchanged to tf.contrib.data.TFRecordDataset.
            batch_size: batch size; also used as the map output buffer size.
            shuffle: when true, shuffle with a fixed buffer of 128 elements.
            repeat: passed unchanged to ``dataset.repeat``.

        Returns:
            ``_input_fn``, which builds the pipeline and returns one
            ``(features, labels)`` pair taken from a one-shot iterator.
        """

        def parser(serialized_example):
            """Parses a single tf.Example into image and label tensors."""
            features = tf.parse_single_example(
                serialized_example,
                features={
                    'image_raw': tf.FixedLenFeature([], tf.string),
                    'label': tf.FixedLenFeature([], tf.int64),
                })
            # The raw bytes are decoded as uint8 and declared to hold 28*28 values.
            image = tf.decode_raw(features['image_raw'], tf.uint8)
            image.set_shape([28 * 28])

            # Normalize the values of the image from the range [0, 255] to [-0.5, 0.5]
            image = tf.cast(image, tf.float32) / 255 - 0.5
            label = tf.cast(features['label'], tf.int32)
            return image, label

        def _input_fn():
            # Read the serialized examples from the tfrecords file(s)
            dataset = tf.contrib.data.TFRecordDataset(filenames)

            # Map the parser over dataset, and batch results by up to batch_size
            dataset = dataset.map(parser, num_threads=1, output_buffer_size=batch_size)
            if shuffle:
                dataset = dataset.shuffle(buffer_size=128)
            dataset = dataset.batch(batch_size)
            # NOTE(review): with the default repeat=None this presumably repeats
            # indefinitely -- confirm against the tf.contrib.data version in use.
            dataset = dataset.repeat(repeat)
            iterator = dataset.make_one_shot_iterator()

            features, labels = iterator.get_next()

            return features, labels

        return _input_fn


    # Checkpoints and summaries are written under the TensorBoard log directory
    # provided by hops.
    run_config = tf.contrib.learn.RunConfig(
        model_dir=tensorboard.logdir(),
        save_checkpoints_steps=10,
        save_summary_steps=5,
        log_step_count_steps=10)

    # Passed to the Estimator as ``params``; model_fn does not read them.
    hparams = tf.contrib.training.HParams(
        learning_rate=learning_rate, dropout_rate=dropout)

    # NOTE(review): tf.summary.merge_all() is called here, before the Estimator
    # has invoked model_fn -- confirm the resulting summary_op is the intended one.
    summary_hook = tf.train.SummarySaverHook(
          save_steps = run_config.save_summary_steps,
          scaffold= tf.train.Scaffold(),
          summary_op=tf.summary.merge_all())

    mnist_estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params=hparams
    )


    # Only the first (and only) path of each list is passed on, as a plain string.
    train_input_fn = data_input_fn(train_filenames[0], batch_size=batch_size)
    eval_input_fn = data_input_fn(validation_filenames[0], batch_size=batch_size)

    experiment = tf.contrib.learn.Experiment(
        mnist_estimator,
        train_input_fn=train_input_fn,
        eval_input_fn=eval_input_fn,
        train_steps=num_steps,
        min_eval_frequency=5,
        eval_hooks=[summary_hook]
    )
    
    hdfs.log("Execution train and evaluate")
    experiment.train_and_evaluate()
    hdfs.log("Finished execution train and evaluate")
    #accuracy_score = mnist_estimator.evaluate(input_fn=eval_input_fn)["accuracy"]

    #accuracy_score = experiment.evaluate()
    #hdfs.log("Variable names: ")
    #var = mnist_estimator.get_variable_names()
    #for aux in var:
        #hdfs.log(aux)
    # Separate evaluation pass limited to num_steps batches; its accuracy is
    # the value that gets logged between the two markers.
    hdfs.log("Trying estimator evaluate: ")
    accuracy_score = mnist_estimator.evaluate(input_fn=eval_input_fn, steps=num_steps)["accuracy"]
    hdfs.log("Done estimator evaluate: ")
    hdfs.log(sep+str(accuracy_score)+sep)


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
3318,application_1513605045578_0591,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


In [None]:
#from hops import tflauncher
#import timeit
#args_dict = {'learning_rate': [0.0005,0.02], 'dropout': [0.7,0.03]} 
#start_time = timeit.default_timer()
#tensorboard_hdfs_logdir = tflauncher.launch(spark, wrapper_mnist, args_dict)
#elapsed = timeit.default_timer() - start_time
#print "Elapsed time: " + str(elapsed)

#start_time = timeit.default_timer()
#tensorboard_hdfs_logdir = tflauncher.launch(spark, wrapper_mnist, args_dict)
#elapsed = timeit.default_timer() - start_time
#print "Elapsed time: " + str(elapsed)

# Music ANN

In [2]:
# Marker placed on both sides of the logged accuracy value.
sep = ",;,;,;"


def wrapper(learning_rate, dropout):
    """Placeholder training function: logs a fixed accuracy between markers.

    Args:
        learning_rate: accepted for interface parity; not used yet.
        dropout: accepted for interface parity; not used yet.
    """
    from hops import hdfs
    # TODO: add network here with all the import within necessary for the code within the function

    accuracy = 0.87
    hdfs.log("".join([sep, str(accuracy), sep]))

# Executes the runs in parallel and returns the list of accuracies

In [3]:
def get_accuracy(v):
    """Extract the marker-delimited accuracy token from one log row.

    The row's ``"_c0"`` field is searched for the module-level ``sep`` marker.
    The text between the first marker and the next one is returned.

    Args:
        v: a row-like object supporting ``v["_c0"]``.

    Returns:
        A one-element list holding the token as a string, or an empty list
        when the row is null/empty or contains no marker. A list is returned
        so the function can be used directly with ``flatMap``.
    """
    line = v["_c0"]
    # A null or empty column cannot contain a marker.
    if not line or sep not in line:
        return []

    after_marker = line.partition(sep)[2]
    # partition() yields the whole remainder when the closing marker is
    # missing, instead of silently dropping the last character of the value.
    return [after_marker.partition(sep)[0]]

def get_all_accuracies(tensorboard_hdfs_logdir, args_dict, number_params):
    """Collect the logged accuracy of every run found under a log directory.

    For each run index a log path is assembled as
    ``<logdir>/<key>=<value>.<key>=<value>.log`` (one ``key=value.`` part per
    entry of ``args_dict``). The file is read line by line through Spark and
    every marker-delimited value found by ``get_accuracy`` is gathered.

    Args:
        tensorboard_hdfs_logdir: directory that holds the per-run log files.
        args_dict: mapping of parameter name to the list of values per run.
        number_params: number of runs, i.e. how many indices to visit.

    Returns:
        A list of floats with all accuracies found, in run order.
    """
    from hops import hdfs

    print(tensorboard_hdfs_logdir)
    hdfs.log(tensorboard_hdfs_logdir)

    collected = []
    for run_index in range(number_params):
        # One "<name>=<value>." fragment per hyperparameter, in dict order.
        fragments = [
            name + "=" + str(args_dict[name][run_index]) + "."
            for name in args_dict.keys()
        ]
        log_path = tensorboard_hdfs_logdir + "/" + "".join(fragments) + "log"

        print("Path to log: ")
        hdfs.log("Path to log: ")
        print(log_path)
        hdfs.log(log_path)

        # A newline separator makes every log line a single-column row.
        log_rows = spark.read.csv(log_path, sep="\n")
        found = log_rows.rdd.flatMap(lambda row: get_accuracy(row)).collect()
        collected.extend(found)

    return [float(value) for value in collected]

def execute_all(population_dict):
    """Launch one training run per individual and return their accuracies.

    Args:
        population_dict: mapping of parameter name to a list of values, one
            entry per individual.

    Returns:
        The list of floats produced by ``get_all_accuracies`` for the launch.
    """
    from hops import tflauncher

    # Take the number of runs from the first parameter's list of values.
    lengths = [len(values) for values in population_dict.values()]
    number_params = lengths[0]

    logdir = tflauncher.launch(spark, wrapper_mnist, population_dict)
    return get_all_accuracies(logdir, population_dict, number_params)

# Evolutionary algorithm for hyperparameter optimization
To run the code, adapt the last function (`_parse_to_dict`) to include the parameters you want to optimize.
Also adapt the bounds and types in the main section to reflect the parameters you want to optimize.

In [4]:
'''
Differential evolution algorithm extended to allow for categorical and integer values for optimization of hyperparameter
space in Neural Networks, including an option for parallelization.

This algorithm will create a full population to be evaluated, unlike typical differential evolution where each
individual get compared and selected sequentially. This allows the user to send a whole population of parameters
to a cluster and run computations in parallel, after which each individual gets evaluated with their respective
target or trial vector.

User will have to define:
- Objective function to be optimized
- Bounds of each parameter (all possible values)
- The Types of each parameter, in order to be able to evaluate categorical, integer or floating values.
- Direction of the optimization, i.e. maximization or minimization
- Number of iterations, i.e. the amount of generations the algorithm will run
- The population size, rule of thumb is to take between 5-10 times the number of parameters to optimize
- Mutation factor between [0, 2)
- Crossover between [0, 1], the higher the value the more mutated values will crossover
'''

import random

class DifferentialEvolution:
    """Differential evolution over float, integer and categorical parameters.

    A whole population is evaluated with a single call to the objective
    function, so the caller can run all individuals in parallel. Internally
    every individual is a list of numbers: floats stay floats, integers are
    rounded, and categorical parameters are stored as an index into their
    list of options.

    Args:
        objective_function: callable taking a dict ``{name: [value, ...]}``
            (one list entry per individual) and returning one score per
            individual, in the same order.
        parbounds: per parameter either ``(lower, upper)`` for 'float'/'int'
            or the sequence of allowed options for 'cat'.
        types: per parameter one of 'float', 'int' or 'cat'.
        direction: 'max' to maximise; any other value minimises.
        maxiter: number of generations to run.
        popsize: number of individuals; mutation samples three other
            individuals, so at least 4 are needed to run ``solve``.
        mutationfactor: weight F applied to the difference vector.
        crossover: probability CR of taking a gene from the donor vector.
        param_names: optional names used as keys of the dict handed to the
            objective, in parameter order. Defaults to
            ``('learning_rate', 'dropout')``.
    """
    _types = ['float', 'int', 'cat']
    _generation = 0
    _scores = []
    # Default dict keys for _parse_to_dict; overridden per instance in __init__.
    param_names = ('learning_rate', 'dropout')

    def __init__(self, objective_function, parbounds, types, direction = 'max', maxiter=10, popsize=10, mutationfactor=0.5, crossover=0.7, param_names=None):
        self.objective_function = objective_function
        self.parbounds = parbounds
        self.direction = direction
        self.types = types
        self.maxiter = maxiter
        self.n = popsize
        self.F = mutationfactor
        self.CR = crossover
        if param_names is not None:
            self.param_names = tuple(param_names)

    # run differential evolution algorithms
    def solve(self):
        """Run ``maxiter`` generations.

        Returns:
            A tuple ``(population, scores)`` where ``population`` holds the
            final individuals in their original representation and
            ``scores`` the matching objective values.
        """
        # Start from a clean state so a second call does not compare a fresh
        # population against the scores of a previous run.
        self._generation = 0
        self._scores = []

        # initialise generation based on individual representation
        population, bounds = self._population_initialisation()
        print(population)
        for _ in range(self.maxiter):
            donor_population = self._mutation(population, bounds)
            trial_population = self._recombination(population, donor_population)
            population = self._selection(population, trial_population)

            # float() keeps the average a true division on Python 2 as well.
            new_gen_avg = sum(self._scores) / float(self.n)

            pick_best = max if self.direction == 'max' else min
            new_gen_best = pick_best(self._scores)
            new_gen_best_param = self._parse_back(population[self._scores.index(new_gen_best)])

            print("Generation: ", self._generation, " || ", "Average score: ", new_gen_avg,
                  ", best score: ", new_gen_best, "best param: ", new_gen_best_param)

        parsed_back_population = [self._parse_back(indiv) for indiv in population]

        return parsed_back_population, self._scores

    # define bounds of each individual depending on type
    def _individual_representation(self):
        """Return the numeric ``(lower, upper)`` bounds of every parameter."""
        bounds = []
        for index, item in enumerate(self.types):
            if item == self._types[2]:
                # categorical: the value is an index into the option list
                bounds.append((0, len(self.parbounds[index]) - 1))
            else:
                # float/int: the given bounds are used as they are
                bounds.append(self.parbounds[index])
        return bounds

    # initialise population
    def _population_initialisation(self):
        """Draw ``popsize`` random individuals inside the bounds.

        Returns:
            A tuple ``(population, bounds)``.
        """
        # Bounds do not change between individuals, so compute them once;
        # this also keeps them defined when the population is empty.
        bounds = self._individual_representation()
        num_parameters = len(self.parbounds)

        population = []
        for _ in range(self.n):
            indiv = [random.uniform(bounds[p][0], bounds[p][1])
                     for p in range(num_parameters)]
            population.append(self._ensure_bounds(indiv, bounds))
        return population, bounds

    # ensure that any mutated individual is within bounds
    def _ensure_bounds(self, indiv, bounds):
        """Clip every gene to its bounds and round the non-float ones."""
        indiv_correct = []

        for position, par in enumerate(indiv):
            lowerbound = bounds[position][0]
            upperbound = bounds[position][1]
            if par < lowerbound:
                par = lowerbound
            elif par > upperbound:
                par = upperbound

            # 'int' and 'cat' genes must be whole numbers
            if self.types[position] != 'float':
                par = int(round(par))
            indiv_correct.append(par)
        return indiv_correct

    # create donor population based on mutation of three vectors
    def _mutation(self, population, bounds):
        """Build one donor vector ``x1 + F * (x2 - x3)`` per individual."""
        donor_population = []
        for i in range(self.n):
            # three distinct individuals, none of them the target itself
            others = [j for j in range(self.n) if j != i]
            first, second, third = random.sample(others, 3)
            x_1 = population[first]
            x_2 = population[second]
            x_3 = population[third]

            donor_vec = [base + self.F * (left - right)
                         for base, left, right in zip(x_1, x_2, x_3)]
            donor_population.append(self._ensure_bounds(donor_vec, bounds))

        return donor_population

    # recombine donor vectors according to crossover probability
    def _recombination(self, population, donor_population):
        """Mix every target with its donor gene by gene.

        A gene is taken from the donor when a fresh random number is at most
        ``CR``, otherwise from the target.
        """
        trial_population = []
        for k in range(self.n):
            target_vec = population[k]
            donor_vec = donor_population[k]
            trial_vec = []
            for p in range(len(self.parbounds)):
                if random.random() <= self.CR:
                    trial_vec.append(donor_vec[p])
                else:
                    trial_vec.append(target_vec[p])
            trial_population.append(trial_vec)
        return trial_population

    # select the best individuals from each generation
    def _selection(self, population, trial_population):
        """Keep, slot by slot, the better of target and trial vector.

        The target population is scored only in generation 0; afterwards its
        scores are already known from the previous selection. ``population``
        is updated in place and also returned.
        """
        if self._generation == 0:
            parsed_population = [self._parse_back(target_vec) for target_vec in population]
            # Copy into a list so item assignment and .index() always work.
            self._scores = list(self.objective_function(self._parse_to_dict(parsed_population)))

        parsed_trial_population = [self._parse_back(trial_vec) for trial_vec in trial_population]
        trial_population_scores = self.objective_function(
            self._parse_to_dict(parsed_trial_population))

        maximise = self.direction == 'max'
        for i in range(self.n):
            trial_vec_score_i = trial_population_scores[i]
            target_vec_score_i = self._scores[i]
            if maximise:
                improved = trial_vec_score_i > target_vec_score_i
            else:
                improved = trial_vec_score_i < target_vec_score_i
            if improved:
                # Replace slot i itself with its own trial vector and score.
                self._scores[i] = trial_vec_score_i
                population[i] = trial_population[i]

        self._generation += 1

        return population

    # parse the converted values back to original
    def _parse_back(self, individual):
        """Translate categorical indices back to the option they stand for."""
        original_representation = []
        for index, parameter in enumerate(individual):
            if self.types[index] == self._types[2]:
                original_representation.append(self.parbounds[index][parameter])
            else:
                original_representation.append(parameter)

        return original_representation

    # for parallelization purposes one can parse the population from a list to a  dictionary format
    # The keys come from self.param_names, in parameter order
    def _parse_to_dict(self, population):
        """Turn a list of individuals into ``{name: [value per individual]}``."""
        population_dict = dict((name, []) for name in self.param_names)
        for indiv in population:
            for position, name in enumerate(self.param_names):
                population_dict[name].append(indiv[position])

        return population_dict

In [None]:
# Search space: the first parameter is categorical, so only the two listed
# values (0.005 and 0.1) are candidates; the second is a float in [0.1, 0.9].
search_bounds = [(0.005, 0.1), (0.1, 0.9)]
search_types = ['cat', 'float']

diff_evo = DifferentialEvolution(
    execute_all,
    search_bounds,
    search_types,
    direction='max',
    maxiter=2,
    popsize=4
)

# solve() returns (final population, matching scores)
results = diff_evo.solve()

print("Population: ", results[0])
print("Scores: ", results[1])

[[1, 0.40427283854272833], [0, 0.5756949048154342], [1, 0.6219462345112103], [0, 0.34593948993453627]]
Finished TensorFlow job 

Make sure to check /Logs/TensorFlow/application_1513605045578_0591/runId.0 for logfile and TensorBoard logdir
hdfs:///Projects/MarcMusicClass/Logs/TensorFlow/application_1513605045578_0591/runId.0
Path to log: 
hdfs:///Projects/MarcMusicClass/Logs/TensorFlow/application_1513605045578_0591/runId.0/learning_rate=0.1.dropout=0.404272838543.log
Path to log: 
hdfs:///Projects/MarcMusicClass/Logs/TensorFlow/application_1513605045578_0591/runId.0/learning_rate=0.005.dropout=0.575694904815.log
Path to log: 
hdfs:///Projects/MarcMusicClass/Logs/TensorFlow/application_1513605045578_0591/runId.0/learning_rate=0.1.dropout=0.621946234511.log
Path to log: 
hdfs:///Projects/MarcMusicClass/Logs/TensorFlow/application_1513605045578_0591/runId.0/learning_rate=0.005.dropout=0.345939489935.log
Finished TensorFlow job 

Make sure to check /Logs/TensorFlow/application_151360504557