# Iceberg Classification Step 2: Model Training with Single GPU
This notebook demonstrates how to:
- read training data from the ``feature store``
- train a model from ``TFRecord`` files on a single GPU
- run an ablation study

In [1]:
import tensorflow as tf
print("Version of TensorFlow is {}".format(tf.__version__))

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
61,application_1574692443370_0062,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.
Version of TensorFlow is 1.14.0

In [2]:
from hops import featurestore
from hops import tensorboard
import maggy
from maggy import experiment

You are running maggy on Hopsworks.

In [3]:
def create_tf_dataset(tfrecord_path, name_list):
    """Build a ``tf.data`` pipeline over the TFRecord part files of a training dataset.

    Args:
        tfrecord_path: identifier handed to ``featurestore.get_training_dataset_path``
            and ``featurestore.get_training_dataset_tf_record_schema``.
        name_list: sequence of four feature names; the first three are stacked
            into the input tensor and the fourth is used as the label.

    Returns:
        A dataset of ``(x, y)`` pairs where ``x`` is reshaped to ``[75, 75, 3]``
        and ``y`` is a one-element list holding the label cast to ``float32``.
        The dataset is shuffled, batched and repeated according to the
        module-level ``SHUFFLE_BUFFER_SIZE``, ``BATCH_SIZE`` and ``NUM_EPOCHS``.
    """
    # The schema is required to parse individual examples out of the TFRecords.
    record_schema = featurestore.get_training_dataset_tf_record_schema(tfrecord_path)
    records_dir = featurestore.get_training_dataset_path(tfrecord_path)
    part_files = tf.gfile.Glob(records_dir + "/part-r-*")

    def _parse_example(serialized):
        parsed = tf.parse_single_example(serialized, record_schema)
        # Stack the first three named features along axis 1, then reshape into an image tensor.
        channels = [parsed[name_list[i]] for i in range(3)]
        image = tf.reshape(tf.stack(channels, axis=1), [75, 75, 3])
        label = [tf.cast(parsed[name_list[3]], tf.float32)]
        return image, label

    records = tf.data.TFRecordDataset(part_files)
    parsed_records = records.map(_parse_example)
    shuffled = parsed_records.shuffle(SHUFFLE_BUFFER_SIZE)
    batched = shuffled.batch(BATCH_SIZE)
    return batched.repeat(NUM_EPOCHS)

In [4]:
def create_model():
    """Build the base CNN used for the ablation study.

    The network has four named convolution / max-pooling / dropout blocks,
    followed by a flatten layer, two dense ReLU blocks with dropout, and a
    single-unit sigmoid output. The first convolution takes the module-level
    ``INPUT_SHAPE`` as its input shape.

    Returns:
        An uncompiled ``tf.keras.models.Sequential`` model.
    """
    layers = tf.keras.layers
    net = tf.keras.models.Sequential()

    # (filters, conv name, max-pool name, dropout name) for each convolutional block.
    conv_blocks = (
        (64, 'my_conv_1', 'my_maxpool_1', 'my_dropout_1'),
        (128, 'my_conv_2', 'my_maxpool_2', 'my_dropout_2'),
        (128, 'my_conv_3', 'my_maxpool_3', 'my_dropout_3'),
        (64, 'my_conv_4', 'my_maxpool_4', 'my_dropout_4'),
    )
    for position, (filters, conv_name, pool_name, drop_name) in enumerate(conv_blocks):
        # Only the first layer of the model declares the input shape.
        extra = {'input_shape': INPUT_SHAPE} if position == 0 else {}
        net.add(layers.Conv2D(filters, kernel_size=(3, 3), activation='relu',
                              name=conv_name, **extra))
        net.add(layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2), name=pool_name))
        net.add(layers.Dropout(0.2, name=drop_name))

    # Flatten the feature maps for the dense layers that follow.
    net.add(layers.Flatten())

    # Two fully connected blocks: Dense -> ReLU -> Dropout.
    for units in (512, 256):
        net.add(layers.Dense(units))
        net.add(layers.Activation('relu'))
        net.add(layers.Dropout(0.2))

    # Single-unit sigmoid output.
    net.add(layers.Dense(1))
    net.add(layers.Activation('sigmoid'))
    return net

In [5]:
# Input-pipeline settings read by create_tf_dataset().
NUM_EPOCHS = 50 # number of times the dataset is repeated (.repeat)
BATCH_SIZE = 32 # examples per batch (.batch)
SHUFFLE_BUFFER_SIZE = 10000 # buffer size passed to .shuffle
# Learning-rate hyperparameter (disabled; the training function hard-codes 0.001):
# LEARNING_RATE = 0.001
# Input shape given to the first Conv2D layer in create_model().
INPUT_SHAPE= (75, 75, 3)

## Ablation Study

In [6]:
# create an AblationStudy instance
from maggy.ablation import AblationStudy

In [7]:
# Configure the ablation study for version 1 of the 'iceberg' training dataset,
# with "is_iceberg" as the label and create_tf_dataset as the dataset generator.
iceberg_ablation = AblationStudy('iceberg', training_dataset_version=1, label_name="is_iceberg",
                                 dataset_generator=create_tf_dataset) # custom dataset generator
# TODO(review): is label_name required when only model (layer) ablation is performed?

# Register create_model as the base model generator of the study.
iceberg_ablation.model.set_base_model_generator(create_model)

In [8]:
# Alternative (disabled): add layers to the ablation study individually, by name.
# iceberg_ablation.model.layers.include(['my_conv_1', 'my_conv_2', 'my_conv_3', 'my_conv_4'])
# iceberg_ablation.model.layers.include(['my_maxpool_1', 'my_maxpool_2', 'my_maxpool_3', 'my_maxpool_4'])
# iceberg_ablation.model.layers.include(['my_dropout_1', 'my_dropout_2', 'my_dropout_3', 'my_dropout_4'])

# iceberg_ablation.model.layers.print_all()

# Add layer groups by name prefix; the prefixes match the layer names assigned in create_model().

iceberg_ablation.model.layers.include_groups(prefix='my_conv')
iceberg_ablation.model.layers.include_groups(prefix='my_maxpool')
iceberg_ablation.model.layers.include_groups(prefix='my_dropout')

# Print the layer groups registered so far.
iceberg_ablation.model.layers.print_all_groups()

Included layer groups are: 

---- All layers prefixed "my_dropout"
---- All layers prefixed "my_conv"
---- All layers prefixed "my_maxpool"

In [9]:
# Display the study configuration as a dictionary.
# NOTE(review): in the recorded output 'included_layers' is empty although layer groups
# were registered earlier; presumably groups are not reported in this dictionary (confirm).
iceberg_ablation.to_dict()

{'training_dataset_name': 'iceberg', 'training_dataset_version': 1, 'label_name': 'is_iceberg', 'included_features': [], 'included_layers': [], 'custom_dataset_generator': True}

In [10]:
# model training function for ablation study
def ablation_training_fn(dataset_function, model_function):
    """Train one ablation trial and return its final training accuracy.

    Args:
        dataset_function: callable invoked as ``dataset_function(tfrecord_path, name_list)``
            that returns the dataset passed to ``model.fit``.
        model_function: zero-argument callable that returns the (possibly ablated)
            Keras model to train for this trial.

    Returns:
        The last value of the ``'acc'`` metric in the training history, as a ``float``.
    """
    # Kept local to the function as in the original; presumably so the function
    # is self-contained when it is executed remotely (confirm).
    import tensorflow as tf

    epochs = 50
    steps_per_epoch = 50

    tfrecord_path = "train_tfrecords_iceberg_classification_dataset"
    name_list = ["band_1", "band_2", "band_avg", "is_iceberg"]
    tf_dataset = dataset_function(tfrecord_path, name_list)

    # Build the model through the generator supplied for this trial. Calling the
    # global create_model() here would train the unablated base model every time.
    model = model_function()
    model.compile(optimizer=tf.train.AdamOptimizer(0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    history = model.fit(tf_dataset, epochs=epochs, steps_per_epoch=steps_per_epoch, verbose=1)
    return float(history.history['acc'][-1])

In [None]:
# Launch the Maggy experiment: ablation_training_fn is passed as the function to run
# and iceberg_ablation supplies the ablation study configuration.
# NOTE(review): 'loco' presumably selects the leave-one-component-out ablator (confirm against the Maggy docs).

result = experiment.lagom(map_fun=ablation_training_fn, experiment_type='ablation',
                           ablation_study=iceberg_ablation, 
                           ablator='loco', 
                           name='Iceberg_ablation_study'
                          )

HBox(children=(IntProgress(value=0, description='Maggy experiment', max=4, style=ProgressStyle(description_wid…

0: Epoch 1/50
0: Epoch 2/50
0: Epoch 3/50
0: Epoch 4/50
0: Epoch 5/50
0: Epoch 6/50
0: Epoch 7/50
0: Epoch 8/50
0: Epoch 9/50
0: Epoch 10/50
0: Epoch 11/50
0: Epoch 12/50
0: Epoch 13/50
0: Epoch 14/50
0: Epoch 15/50
0: Epoch 16/50
0: Epoch 17/50
0: Epoch 18/50
0: Epoch 19/50
0: Epoch 20/50
0: Epoch 21/50
0: Epoch 22/50
0: Epoch 23/50
0: Epoch 24/50
0: Epoch 25/50
0: Epoch 26/50
0: Epoch 27/50
0: Epoch 28/50
0: Epoch 29/50
0: Epoch 30/50
0: Epoch 31/50
0: Epoch 32/50
0: Epoch 33/50
0: Epoch 34/50
0: Epoch 35/50
0: Epoch 36/50
0: Epoch 37/50
0: Epoch 38/50
0: Epoch 39/50
0: Epoch 40/50
0: Epoch 41/50
0: Epoch 42/50
0: Epoch 1/50
0: Epoch 2/50
0: Epoch 3/50
0: Epoch 4/50
0: Epoch 5/50
0: Epoch 6/50
0: Epoch 7/50
0: Epoch 8/50
0: Epoch 9/50
0: Epoch 10/50
0: Epoch 11/50
0: Epoch 12/50
0: Epoch 13/50
0: Epoch 14/50
0: Epoch 15/50
0: Epoch 16/50
0: Epoch 17/50
0: Epoch 18/50
0: Epoch 19/50
0: Epoch 20/50
0: Epoch 21/50
0: Epoch 22/50
0: Epoch 23/50
0: Epoch 24/50
0: Epoch 25/50
0: Epoch 26/5

# End of Step 2b