# In this notebook we perform hyperparameter tuning and training of an adversarial anomaly detection model.
#### For more details about this model refer to https://arxiv.org/pdf/1905.11034.pdf.

In [1]:
# Evaluating `spark` displays the SparkSession handle that the kernel makes available.
spark

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log
8,application_1607892309705_0009,pyspark,idle,Link,Link


SparkSession available as 'spark'.
<pyspark.sql.session.SparkSession object at 0x7fa5b72c5f10>

## Connect to hsfs and retrieve datasets for training and evaluation 

In [2]:
import hsfs
# Open a connection and grab the handle of the project's feature store.
connection = hsfs.connection()
fs = connection.get_feature_store()

# Version 1 of both training datasets: `ben_td` feeds the training inside
# experiment_wrapper; `eval_td` is retrieved here for evaluation.
ben_td, eval_td = (
    fs.get_training_dataset(td_name, 1)
    for td_name in ("gan_non_sar_training_df", "gan_eval_df")
)

Connected. Call `.close()` to terminate connection gracefully.

In [3]:
from hops import experiment
from hops import hdfs
import json
# Read the JSON file with the best hyperparameters found for the embeddings model.
best_hyperparams_path = "Resources/embeddings_best_hp.json"
best_hyperparams = json.loads(hdfs.load(best_hyperparams_path))

# Wrap every value in a single-element list; experiment_wrapper reads entries
# back as args_dict[<name>][0] (e.g. args_dict['emb_size'][0]).
args_dict = {hp_name: [hp_value] for hp_name, hp_value in best_hyperparams.items()}

### Define the Hopsworks experiment wrapper function that contains all the training logic.

In [7]:
def experiment_wrapper(
    latent_dim,
    discriminator_n_layers,
    discriminator_batch_norm,
    discriminator_dropout_rate,
    discriminator_learning_rate,
    discriminator_extra_steps,

    generator_start_n_units,
    generator_n_layers,
    generator_activation_fn,
    generator_batch_norm,
    generator_dropout_rate,
    generator_learning_rate,

    encoder_start_n_units,
    encoder_n_layers,
    encoder_activation_fn,
    encoder_batch_norm,
    encoder_dropout_rate,
    encoder_learning_rate):
    """Train one GanAnomalyDetector configuration and report its generator loss.

    The function is handed to ``experiment.lagom`` as the trial function, so
    every parameter is one hyperparameter of the search space. It relies on two
    notebook-level globals: ``ben_td`` (training dataset handle) and
    ``args_dict`` (best embedding hyperparameters; only ``emb_size`` is read).

    Hyperparameter encodings decoded here:
      * ``*_batch_norm``: 0 disables batch normalisation, any other value enables it.
      * ``*_activation_fn`` (generator/encoder): 0 selects "tanh", any other
        value selects "relu".
      * ``*_dropout_rate``: a rate greater than 0.0 switches the matching
        ``*_batch_dropout`` flag on.

    Returns:
        dict: ``{'metric': <first recorded "g_loss" value of the fit history>}``.
    """
    import tensorflow as tf
    from adversarialaml.gan_enc_ano import GanAnomalyDetector,  GanAnomalyMonitor 
    from hops import tensorboard

    # Set the number of epochs for training (passed as `num_epochs` to the input pipeline).
    EPOCHS = 2

    train_input = ben_td.tf_data(target_name='target', is_training=True)
    train_input_processed = train_input.tf_record_dataset(process=True, batch_size=16, num_epochs=EPOCHS)
    # NOTE(review): this evaluation pipeline is built from `ben_td` (not `eval_td`)
    # and is never used below - confirm the intended dataset before wiring it in.
    eval_input = ben_td.tf_data(target_name='target', is_training=True)
    eval_input_processed = eval_input.tf_record_dataset(process=True, batch_size=1, num_epochs=EPOCHS)

    # Dropout is enabled per sub-network only when that sub-network's own rate is positive.
    discriminator_batch_dropout = bool(discriminator_dropout_rate > 0.0)
    # Fix: this flag used to be derived from discriminator_dropout_rate.
    generator_batch_dropout = bool(generator_dropout_rate > 0.0)
    encoder_batch_dropout = bool(encoder_dropout_rate > 0.0)

    # Decode the integer-encoded batch-norm switches.
    discriminator_batch_norm = bool(discriminator_batch_norm != 0)
    generator_batch_norm = bool(generator_batch_norm != 0)
    encoder_batch_norm = bool(encoder_batch_norm != 0)

    # Decode the integer-encoded activation choices.
    generator_activation_fn = "tanh" if generator_activation_fn == 0 else "relu"
    encoder_activation_fn = "tanh" if encoder_activation_fn == 0 else "relu"

    # Fixed (non-tuned) layer-width settings.
    discriminator_double_neurons = False
    discriminator_bottleneck_neurons = True
    generator_double_neurons = True
    generator_bottleneck_neurons = False

    # The embedding size is used both as the model input width and as the
    # discriminator's starting number of units.
    emb_size = args_dict['emb_size'][0]

    # Instantiate the GanAnomalyDetector model.
    gan_anomaly_detector = GanAnomalyDetector(
        input_dim=emb_size,
        latent_dim=latent_dim,

        discriminator_start_n_units=emb_size,
        discriminator_n_layers=discriminator_n_layers,
        discriminator_activation_fn="sigmoid",
        discriminator_double_neurons=discriminator_double_neurons,
        discriminator_bottleneck_neurons=discriminator_bottleneck_neurons,
        discriminator_batch_norm=discriminator_batch_norm,
        discriminator_batch_dropout=discriminator_batch_dropout,
        discriminator_dropout_rate=discriminator_dropout_rate,
        discriminator_learning_rate=discriminator_learning_rate,
        discriminator_extra_steps=discriminator_extra_steps,

        generator_start_n_units=generator_start_n_units,
        generator_n_layers=generator_n_layers,
        generator_activation_fn=generator_activation_fn,
        generator_double_neurons=generator_double_neurons,
        generator_bottleneck_neurons=generator_bottleneck_neurons,
        generator_batch_norm=generator_batch_norm,
        generator_batch_dropout=generator_batch_dropout,
        generator_dropout_rate=generator_dropout_rate,
        generator_learning_rate=generator_learning_rate,

        encoder_start_n_units=encoder_start_n_units,
        encoder_n_layers=encoder_n_layers,
        encoder_activation_fn=encoder_activation_fn,
        encoder_batch_norm=encoder_batch_norm,
        encoder_batch_dropout=encoder_batch_dropout,
        encoder_dropout_rate=encoder_dropout_rate,
        encoder_learning_rate=encoder_learning_rate,
    )

    # Compile the WGAN model.
    gan_anomaly_detector.compile()

    history = gan_anomaly_detector.fit(train_input_processed)

    # Report the first recorded generator loss as the value the search optimises.
    metrics = {'metric': history.history["g_loss"][0]}

    return metrics

## Instantiate the search space with the hyperparameters to tune

In [8]:
from maggy import Searchspace
# Search space for the GAN anomaly detector, one entry per argument of
# experiment_wrapper. The 0/1 INTEGER entries (batch norm, activation function)
# are switches that experiment_wrapper decodes into booleans / activation names.
hyperparameter_ranges = {
    # Shared latent size and discriminator settings.
    "latent_dim": ('INTEGER', [4, 6]),
    "discriminator_n_layers": ('INTEGER', [2, 3]),
    "discriminator_batch_norm": ('INTEGER', [0, 1]),
    "discriminator_dropout_rate": ('DISCRETE', [0.0, 0.1]),
    "discriminator_learning_rate": ('DISCRETE', [0.0001, 0.0002]),
    "discriminator_extra_steps": ('INTEGER', [2, 3]),

    # Generator settings.
    "generator_start_n_units": ('INTEGER', [8, 10]),
    "generator_n_layers": ('INTEGER', [2, 3]),
    "generator_activation_fn": ('INTEGER', [0, 1]),
    "generator_batch_norm": ('INTEGER', [0, 1]),
    "generator_dropout_rate": ('DISCRETE', [0.0, 0.1]),
    "generator_learning_rate": ('DISCRETE', [0.0001, 0.0002]),

    # Encoder settings.
    "encoder_start_n_units": ('INTEGER', [4, 6]),
    "encoder_n_layers": ('INTEGER', [2, 3]),
    "encoder_activation_fn": ('INTEGER', [0, 1]),
    "encoder_batch_norm": ('INTEGER', [0, 1]),
    "encoder_dropout_rate": ('DISCRETE', [0.0, 0.1]),
    "encoder_learning_rate": ('DISCRETE', [0.0001, 0.0002]),
}
sp = Searchspace(**hyperparameter_ranges)

Hyperparameter added: latent_dim
Hyperparameter added: discriminator_n_layers
Hyperparameter added: discriminator_batch_norm
Hyperparameter added: discriminator_dropout_rate
Hyperparameter added: discriminator_learning_rate
Hyperparameter added: discriminator_extra_steps
Hyperparameter added: generator_start_n_units
Hyperparameter added: generator_n_layers
Hyperparameter added: generator_activation_fn
Hyperparameter added: generator_batch_norm
Hyperparameter added: generator_dropout_rate
Hyperparameter added: generator_learning_rate
Hyperparameter added: encoder_start_n_units
Hyperparameter added: encoder_n_layers
Hyperparameter added: encoder_activation_fn
Hyperparameter added: encoder_batch_norm
Hyperparameter added: encoder_dropout_rate
Hyperparameter added: encoder_learning_rate

## Use the experiment wrapper function above to run the Maggy hyperparameter tuning experiment.

In [9]:
from maggy import experiment
# Random search over `sp`: run 2 trials of experiment_wrapper and minimise the
# 'metric' value each trial returns.
lagom_options = dict(
    searchspace=sp,
    optimizer='randomsearch',
    direction='min',
    num_trials=2,
    name='ganaml',
    hb_interval=5,
    es_interval=5,
    es_min=5,
)
result = experiment.lagom(experiment_wrapper, **lagom_options)

WARN: Can't reach Maggy server. No progress information and logs available. Job continues running anyway.
Started Maggy Experiment: ganaml, application_1607892309705_0009, run 2

------ RandomSearch Results ------ direction(min) 
BEST combination {"latent_dim": 5, "discriminator_n_layers": 2, "discriminator_batch_norm": 1, "discriminator_dropout_rate": 0.1, "discriminator_learning_rate": 0.0001, "discriminator_extra_steps": 2, "generator_start_n_units": 8, "generator_n_layers": 2, "generator_activation_fn": 0, "generator_batch_norm": 1, "generator_dropout_rate": 0.1, "generator_learning_rate": 0.0001, "encoder_start_n_units": 5, "encoder_n_layers": 2, "encoder_activation_fn": 0, "encoder_batch_norm": 1, "encoder_dropout_rate": 0.0, "encoder_learning_rate": 0.0001} -- metric -0.6043117046356201
WORST combination {"latent_dim": 5, "discriminator_n_layers": 2, "discriminator_batch_norm": 0, "discriminator_dropout_rate": 0.1, "discriminator_learning_rate": 0.0002, "discriminator_extra_step

In [10]:
import json
from hops import hdfs
# File (under the project's Resources directory) that receives the best
# hyperparameter combination found by the search above.
EMBEDDINGS_HYPERPARAMS_FILE = 'gan_best_hp.json'

# Serialise the winning combination to JSON, then write it out.
best_hp_json = json.dumps(result['best_hp'])
hdfs.dump(best_hp_json, "Resources/" + EMBEDDINGS_HYPERPARAMS_FILE)