# Lab 9: Exercise solutions

In [1]:
with open('../course/common.py') as fin:
    exec(fin.read())

In [2]:
with open('../course/matplotlibconf.py') as fin:
    exec(fin.read())

## Exercise 1

In [Exercise 2 of Lab 8](8_NLP_and_Text_Data.ipynb#Exercise-2) we introduced a model for sentiment analysis of the [IMDB](www.imdb.com) dataset provided in Keras. 

- Reload that dataset and prepare it for training a model:
    - choose vocabulary size
    - pad the sequences to a fixed length
- define a function `recurrent_model(vocab_size, maxlen)` similar to the `convolutional_model` function defined earlier. The function should return a recurrent model.
- Train the model on 1 CPU and measure the training time
> TIP: This is currently broken. There's an [issue](https://github.com/tensorflow/tensorflow/issues/26245) open about it. The model definition seems to ignore the context setter on the CPU. Just skip this point for now.
- Train the model on 1 GPU and measure the training time
- Train the model on a machine with more than 1 GPU using `multi_gpu_model` or even better using distribution strategy



In [3]:
from time import time
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import multi_gpu_model

In [4]:
vocab_size= 10000
maxlen=80

In [5]:
(X_train, y_train), (X_test, y_test) = \
    imdb.load_data(num_words=vocab_size)

X_train_pad = pad_sequences(X_train, maxlen=maxlen)
X_test_pad = pad_sequences(X_test, maxlen=maxlen)

In [6]:
def recurrent_model(vocab_size, maxlen):
    print("Defining recurrent model")
    t0 = time()
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=maxlen))
    model.add(LSTM(64, dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    
    print("{:0.3f} seconds.".format(time() - t0))

    print("Compiling the model...")
    t0 = time()
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    print("{:0.3f} seconds.".format(time() - t0))
    return model

In [7]:
# broken in TF 2.0 alpha release
# with tf.device('cpu:0'):
#    model = recurrent_model(vocab_size, maxlen)

In [8]:
# print("Training recurrent CPU model...")
# t0 = time()
# model.fit(X_train_pad, y_train,
#           batch_size=1024,
#           epochs=2,
#           shuffle=True)
# print("{:0} seconds.".format(time() - t0))

In [9]:
with tf.device('gpu:0'):
    model = recurrent_model(vocab_size, maxlen)

Defining recurrent model
0.710 seconds.
Compiling the model...
0.094 seconds.


In [10]:
print("Training recurrent GPU model...")
t0 = time()
model.fit(X_train_pad, y_train,
          batch_size=1024,
          epochs=2,
          shuffle=True)
print("{:0} seconds.".format(time() - t0))

Training recurrent GPU model...
Epoch 1/2
Epoch 2/2
4.6133668422698975 seconds.


In [11]:
NGPU = 2

In [12]:
model = recurrent_model(vocab_size, maxlen)

model = multi_gpu_model(model, NGPU, cpu_relocation=True)

Defining recurrent model
0.373 seconds.
Compiling the model...
0.095 seconds.


In [13]:
model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

In [14]:
print("Training recurrent GPU model on {} GPUs ...".format(NGPU))
t0 = time()
model.fit(X_train_pad, y_train,
          batch_size=1024*NGPU,
          epochs=2,
          shuffle=True)
print("{:0} seconds.".format(time() - t0))

Training recurrent GPU model on 2 GPUs ...
Epoch 1/2
Epoch 2/2
5.124483823776245 seconds.


In [15]:
strategy = tf.distribute.MirroredStrategy()

In [16]:
with strategy.scope():
    model = recurrent_model(vocab_size, maxlen)

Defining recurrent model
0.330 seconds.
Compiling the model...
1.180 seconds.


In [17]:
print("Training recurrent GPU model on {} GPUs ...".format(NGPU))
t0 = time()
model.fit(X_train_pad, y_train,
          batch_size=1024*NGPU,
          epochs=2,
          shuffle=True)
print("{:0.3f} seconds.".format(time() - t0))

Training recurrent GPU model on 2 GPUs ...
Epoch 1/2
Epoch 2/2
8.852 seconds.


## Exercise 2

_Model parallelism_ is a technique used for models too large to fit in the memory of a single GPU. While this is is not the case for the model we developed in Exercise 1, it is still possible to distribute the model across multiple GPUs using the with context setter. Define a new model with the following architecture:

1. Embedding
- LSTM
- LSTM
- LSTM
- Dense

Place layers 1 and 2 on the first GPU, layers 3 and 4 on the second GPU and the final Dense layer on the CPU.

Train the model and see if the performance improves.

In [18]:
import tensorflow.keras.backend as K

In [19]:
K.clear_session()

In [20]:
model = Sequential()
with tf.device('gpu:0'):
    model.add(Embedding(input_dim=vocab_size,
                        output_dim=100,
                        input_length=maxlen))
    model.add(LSTM(64, dropout=0.2,
                   return_sequences=True))
with tf.device('gpu:1'):
    model.add(LSTM(64, dropout=0.2,
                   return_sequences=True))
    model.add(LSTM(64, dropout=0.2))
with tf.device('cpu:0'):
    model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

print("{:0.3f} seconds.".format(time() - t0))


print("Compiling the model...")
t0 = time()
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

print("{:0.3f} seconds.".format(time() - t0))

9.663 seconds.
Compiling the model...
0.133 seconds.


In [21]:
print("Training distributed recurrent model...")
t0 = time()
model.fit(X_train_pad, y_train,
          batch_size=1024,
          epochs=2,
          shuffle=True)
print("{:0} seconds.".format(time() - t0))

Training distributed recurrent model...
Epoch 1/2
Epoch 2/2
7.637561559677124 seconds.
