# In this notebook we train an adversarial anomaly detection model.
#### For more details about this model, refer to https://arxiv.org/pdf/1905.11034.pdf.

In [1]:
# Evaluate the `spark` handle; the cell output below reports the Spark application
# starting and the SparkSession being available as 'spark'.
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
39,application_1612198280619_0009,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7f287e8f5a90>

## Connect to hsfs and retrieve datasets for training and evaluation 

In [2]:
import hsfs

# Open a connection and take the handle of this project's feature store.
connection = hsfs.connection()
fs = connection.get_feature_store()

# Fetch version 1 of both training datasets, in the same order as before.
# `ben_td` is the dataset consumed by experiment_wrapper().
ben_td, eval_td = (
    fs.get_training_dataset(dataset_name, 1)
    for dataset_name in ("gan_non_sar_training_df", "gan_eval_df")
)

Connected. Call `.close()` to terminate connection gracefully.

In [3]:
#ben_td.read().count()

In [4]:
from hops import experiment
from hops import hdfs
import json
import tensorflow as tf
"""
emb_best_hyperparams_path = "Resources/embeddings_best_hp.json"
emb_best_hyperparams = json.loads(hdfs.load(emb_best_hyperparams_path))
emb_args_dict = {}
for key in emb_best_hyperparams.keys():
    emb_args_dict[key] = [emb_best_hyperparams[key]]

    
best_hyperparams_path = "Resources/gan_best_hp.json"
best_hyperparams = json.loads(hdfs.load(best_hyperparams_path))
args_dict = {}
for key in best_hyperparams.keys():
    args_dict[key] = [best_hyperparams[key]]
"""
# The string above is a disabled snippet that would load both dictionaries
# from JSON files; the hard-coded values below are used instead.

# Embedding hyperparameters; every value is wrapped in a one-element list.
# experiment_wrapper() only reads 'emb_size'.
emb_args_dict = {
    'walk_number': [40],
    'walk_length': [5],
    'emb_size': [128],
}

# GAN hyperparameters, keyed by letter and decoded inside experiment_wrapper().
# Most values are integer codes (table indices or 0/1 switches), not raw values.
args_dict = {
    "a": [2],  # latent_dim (size-table index)
    "b": [6],  # discriminator_n_layers
    "c": [0],  # discriminator_batch_norm (0 = off)
    "d": [5],  # discriminator_dropout_rate (dropout-table index)
    "e": [3],  # discriminator_learning_rate (learning-rate-table index)
    "f": [2],  # discriminator_extra_steps
    "g": [2],  # generator_start_n_units (size-table index)
    "h": [3],  # generator_n_layers
    "i": [1],  # generator_activation_fn (0 = "tanh", otherwise "selu")
    "j": [0],  # generator_batch_norm (0 = off)
    "k": [3],  # generator_dropout_rate (dropout-table index)
    "l": [2],  # generator_learning_rate (learning-rate-table index)
    "m": [2],  # encoder_start_n_units (size-table index)
    "n": [2],  # encoder_n_layers
    "o": [1],  # encoder_activation_fn (0 = "tanh", otherwise "selu")
    "p": [1],  # encoder_batch_norm (0 = off)
    "q": [1],  # encoder_dropout_rate (dropout-table index)
    "r": [2],  # encoder_learning_rate (learning-rate-table index)
    "s": [0],  # discriminator_activation_fn (0 = "linear", otherwise "selu")
}
#args_dict = {"a": [3], "b": [3], "c": [1], "d": [4], "e": [1], "f": [2], "g": [2], "h": [3], "i": [0], "j": [1], "k": [3], "l": [6], "m": [2], "n": [2], "o": [1], "p": [1], "q": [2], "r": [4], "s": [0]}

### Define the Hopsworks experiment wrapper function that contains all the training logic.

In [5]:
def experiment_wrapper():
    """Train the GAN anomaly detector, export it, and return its metrics.

    The function takes no arguments; it reads the notebook-level globals
    ``args_dict`` and ``emb_args_dict`` (hyperparameters, each value wrapped in
    a one-element list) and ``ben_td`` (training dataset handle).

    Steps:
      1. Decode the integer hyperparameter codes in ``args_dict`` into sizes,
         dropout rates, learning rates, activation names and boolean switches.
      2. Build the training and monitoring input pipelines from ``ben_td``.
      3. Construct and compile ``GanAnomalyDetector`` under a
         ``MultiWorkerMirroredStrategy`` scope and fit it.
      4. Save the model with ``tf.saved_model.save`` and register it through
         ``hops_model.export`` under the name ``'ganAml'``.

    Returns:
        dict: ``{'loss': d_loss}`` where ``d_loss`` is the last value recorded
        in ``history.history["d_loss"]``.

    Raises:
        KeyError: if a code in ``args_dict`` is missing or is not a key of the
            corresponding lookup table.
    """
    import os
    import uuid

    import tensorflow as tf
    from adversarialaml.gan_enc_ano import GanAnomalyDetector, GanAnomalyMonitor
    from hops import tensorboard
    from hops import model as hops_model

    ####################################################################
    # Lookup tables that turn the integer codes in args_dict into values.
    # The discriminator, generator and encoder all share the same tables.
    size_by_index = {1: 16, 2: 32, 3: 64}
    dropout_rate_by_index = {
        1: 0.1,
        2: 0.15,
        3: 0.2,
        4: 0.25,
        5: 0.3,
        6: 0.35,
        7: 0.4,
        8: 0.45,
        9: 0.5,
        10: 0.55,
        11: 0.6,
    }
    learning_rate_by_index = {
        1: 0.00001,
        2: 0.001,
        3: 0.0015,
        4: 0.002,
        5: 0.0025,
        6: 0.003,
        7: 0.0035,
        8: 0.004,
        9: 0.0045,
        10: 0.005,
        11: 0.0055,
        12: 0.006,
        13: 0.0065,
        14: 0.007,
        15: 0.0075,
        16: 0.008,
        17: 0.0085,
        18: 0.009,
        19: 0.0095,
        20: 0.01,
        21: 0.02,
        22: 0.03,
        23: 0.04,
    }

    def hp(key):
        """Return the single value stored under ``key`` in ``args_dict``."""
        return args_dict[key][0]

    emb_size = emb_args_dict['emb_size'][0]
    latent_dim = size_by_index[hp('a')]

    # Discriminator hyperparameters.
    discriminator_n_layers = hp('b')
    discriminator_batch_norm = hp('c') != 0
    discriminator_dropout_rate = dropout_rate_by_index[hp('d')]
    discriminator_learning_rate = learning_rate_by_index[hp('e')]
    discriminator_extra_steps = hp('f')
    discriminator_activation_fn = "linear" if hp('s') == 0 else "selu"

    # Generator hyperparameters.
    generator_start_n_units = size_by_index[hp('g')]
    generator_n_layers = hp('h')
    generator_activation_fn = "tanh" if hp('i') == 0 else "selu"
    generator_batch_norm = hp('j') != 0
    generator_dropout_rate = dropout_rate_by_index[hp('k')]
    generator_learning_rate = learning_rate_by_index[hp('l')]

    # Encoder hyperparameters.
    encoder_start_n_units = size_by_index[hp('m')]
    encoder_n_layers = hp('n')
    encoder_activation_fn = "tanh" if hp('o') == 0 else "selu"
    encoder_batch_norm = hp('p') != 0
    encoder_dropout_rate = dropout_rate_by_index[hp('q')]
    encoder_learning_rate = learning_rate_by_index[hp('r')]

    # Dropout is switched on whenever the component's own rate is positive.
    # The generator switch previously tested the discriminator's rate.
    discriminator_batch_dropout = discriminator_dropout_rate > 0.0
    generator_batch_dropout = generator_dropout_rate > 0.0
    encoder_batch_dropout = encoder_dropout_rate > 0.0

    # Fixed architecture switches.
    discriminator_double_neurons = False
    discriminator_bottleneck_neurons = True
    generator_double_neurons = True
    generator_bottleneck_neurons = False
    ####################################################################

    # Define distribution strategy
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

    # Turn tf.data auto-sharding off for the datasets built below.
    options = tf.data.Options()
    options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

    # Per-device batch size, scaled to the global batch size.
    batch_size_per_replica = 64
    batch_size = batch_size_per_replica * strategy.num_replicas_in_sync

    # Set the number of epochs for training.
    EPOCHS = 30

    train_input = ben_td.tf_data(target_name='target', is_training=True)
    train_input_processed = train_input.tf_record_dataset(process=True, batch_size=batch_size, num_epochs=EPOCHS)
    train_input_processed = train_input_processed.with_options(options)

    # NOTE(review): the monitoring data is also built from `ben_td`, while the
    # data-loading cell fetches an `eval_td` that is never used. Confirm which
    # dataset the monitor is meant to see.
    eval_input = ben_td.tf_data(target_name='target', is_training=True)
    eval_input_processed = eval_input.tf_record_dataset(process=True, batch_size=1, num_epochs=EPOCHS)
    eval_input_processed = eval_input_processed.with_options(options)

    # construct model under distribution strategy scope
    with strategy.scope():

        # Instantiate the GanAnomalyDetector model.
        gan_anomaly_detector = GanAnomalyDetector(
            input_dim=emb_size,
            latent_dim=latent_dim,

            discriminator_start_n_units=emb_size,
            discriminator_n_layers=discriminator_n_layers,
            discriminator_activation_fn=discriminator_activation_fn,
            discriminator_double_neurons=discriminator_double_neurons,
            discriminator_bottleneck_neurons=discriminator_bottleneck_neurons,
            discriminator_batch_norm=discriminator_batch_norm,
            discriminator_batch_dropout=discriminator_batch_dropout,
            discriminator_dropout_rate=discriminator_dropout_rate,
            discriminator_learning_rate=discriminator_learning_rate,
            discriminator_extra_steps=discriminator_extra_steps,

            generator_start_n_units=generator_start_n_units,
            generator_n_layers=generator_n_layers,
            generator_activation_fn=generator_activation_fn,
            generator_double_neurons=generator_double_neurons,
            generator_bottleneck_neurons=generator_bottleneck_neurons,
            generator_batch_norm=generator_batch_norm,
            generator_batch_dropout=generator_batch_dropout,
            generator_dropout_rate=generator_dropout_rate,
            generator_learning_rate=generator_learning_rate,

            encoder_start_n_units=encoder_start_n_units,
            encoder_n_layers=encoder_n_layers,
            encoder_activation_fn=encoder_activation_fn,
            encoder_batch_norm=encoder_batch_norm,
            encoder_batch_dropout=encoder_batch_dropout,
            encoder_dropout_rate=encoder_dropout_rate,
            encoder_learning_rate=encoder_learning_rate,
        )

        # Compile the WGAN model.
        gan_anomaly_detector.compile()

    callbacks = [
        GanAnomalyMonitor(batch_size=1, latent_dim=latent_dim, input_dim=emb_size, alpha=0.7, real_data=eval_input_processed),
        tf.keras.callbacks.TensorBoard(log_dir=tensorboard.logdir()),
        tf.keras.callbacks.ModelCheckpoint(filepath=tensorboard.logdir()),
    ]

    # Start training the model. `callbacks` is already a list, so it is passed
    # as-is rather than wrapped in a second list.
    # NOTE(review): steps_per_epoch is hard-coded to 4096; the original code
    # carried a commented-out alternative `total_samples//batch_size` with
    # total_samples = 6366. Confirm which value is intended.
    history = gan_anomaly_detector.fit(
        train_input_processed,
        callbacks=callbacks,
        epochs=EPOCHS,
        steps_per_epoch=4096,
    )

    # Report the discriminator loss of the last recorded epoch (previously the
    # first epoch's value was reported).
    metrics = {'loss': history.history["d_loss"][-1]}

    # save to the model registry
    export_path = os.getcwd() + '/model-' + str(uuid.uuid4())
    print('Exporting trained model to: {}'.format(export_path))

    call = gan_anomaly_detector.serve_function.get_concrete_function(tf.TensorSpec([None, None], tf.float32))
    tf.saved_model.save(gan_anomaly_detector, export_path, signatures=call)

    print('Done exporting!')

    hops_model.export(export_path, 'ganAml', metrics=metrics)

    return metrics

## Use the experiment wrapper function above to launch the Hopsworks training experiment.

In [6]:
from hops import experiment
    
# Launch experiment_wrapper as a Hopsworks experiment; `metric_key` names the
# 'loss' entry of the metrics dict that the wrapper returns.
experiment.launch(
    experiment_wrapper,
    name='train_gan_aml',
    metric_key='loss',
    local_logdir=False,
)

An error was encountered:
An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, davitamlgpu-worker-1.internal.cloudapp.net, executor 1): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/srv/hops/spark/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/srv/hops/spark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/srv/hops/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 2499, in pipeline_func
  File "/srv/hops/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 2499, in pipeline_func
  File "/srv/hops/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 2499, in pipeline_func
  [Previous line repeated 1 more time]
  File "/srv/hops/sp