In [1]:
def plot_history(history):
    """Plot the per-epoch training curves stored in a Keras ``History`` object.

    Parameters
    ----------
    history : object with a ``history`` dict attribute
        Maps metric names to per-epoch value lists. ``'loss'`` is required.
        ``'val_loss'`` and the accuracy series are plotted only when present.

    Notes
    -----
    The accuracy figure is drawn only when an accuracy series was logged
    (looked up under ``'acc'`` and then ``'accuracy'``). Models compiled
    without an accuracy metric - e.g. the ``loss='mae'`` regressors in this
    notebook - therefore get the loss figure alone instead of a ``KeyError``.
    """
    import matplotlib.pyplot as plt

    metrics = history.history
    loss = metrics['loss']
    epochs = range(1, len(loss) + 1)

    # Accuracy is optional: find whichever spelling of the key was logged.
    acc_key = next((key for key in ('acc', 'accuracy') if key in metrics), None)
    if acc_key is not None:
        plt.plot(epochs, metrics[acc_key], 'bo', label='Training acc')
        val_acc = metrics.get('val_' + acc_key)
        if val_acc is not None:
            plt.plot(epochs, val_acc, 'b', label='Validation acc')
        plt.title('Training and validation accuracy')
        plt.legend()

        # Start a second figure so the loss curves do not land on the accuracy axes.
        plt.figure()

    plt.plot(epochs, loss, 'bo', label='Training loss')
    val_loss = metrics.get('val_loss')
    if val_loss is not None:
        plt.plot(epochs, val_loss, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

    plt.show()

### Advanced RNN
* Recurrent dropout: dropout applied inside recurrent layers to fight overfitting in RNNs
* Stacking recurrent layers: increases the representational power of the RNN
* Bidirectional recurrent layers: present the same information to the RNN in different ways, which can increase accuracy

In [2]:
import os
# Directory holding the Jena climate CSV. Set the JENA_DATA_DIR environment
# variable to point somewhere else; the default is the original local path.
data_dir = os.environ.get('JENA_DATA_DIR', r'C:\Users\pgbpr\Documents\Verusen\jena_climate')
fname = os.path.join(data_dir, 'jena_climate_2009_2016.csv')

# The context manager closes the file even if reading raises.
with open(fname) as f:
    data = f.read()

rows = data.split('\n')
header = rows[0].split(',')
# Drop blank lines (e.g. one produced by a trailing newline) so that every
# remaining line can be parsed as a full record by the next cell.
lines = [row for row in rows[1:] if row.strip()]
print(header)
print(len(lines))

['"Date Time"', '"p (mbar)"', '"T (degC)"', '"Tpot (K)"', '"Tdew (degC)"', '"rh (%)"', '"VPmax (mbar)"', '"VPact (mbar)"', '"VPdef (mbar)"', '"sh (g/kg)"', '"H2OC (mmol/mol)"', '"rho (g/m**3)"', '"wv (m/s)"', '"max. wv (m/s)"', '"wd (deg)"']
420551


In [3]:
import numpy as np

# Parse every CSV line into a row of floats. The first field of each line is
# skipped, which is why the array has one column fewer than the header.
n_features = len(header) - 1
float_data = np.zeros((len(lines), n_features))
for row_idx, raw_line in enumerate(lines):
    fields = raw_line.split(',')[1:]
    float_data[row_idx, :] = [float(field) for field in fields]

In [None]:
from matplotlib import pyplot as plt
# Column 1 of the parsed array is the temperature series (in degrees celsius).
temp = float_data[:, 1]
timesteps = range(len(temp))
plt.plot(timesteps, temp)

In [None]:
# Zoom in on the first 1440 timesteps of the temperature series.
n_points = 1440
plt.plot(range(n_points), temp[:n_points])

In [4]:
# Standardise every column in place. The mean and standard deviation are taken
# from the first 200000 rows only and then applied to the whole array.
train_rows = slice(0, 200000)

mean = float_data[train_rows].mean(axis=0)
float_data -= mean  # centre each column (modifies float_data in place)
std = float_data[train_rows].std(axis=0)
float_data /= std  # scale each column (modifies float_data in place)


### Generator
* data: the original array of floating-point data, already normalized
* lookback: how many timesteps back the input data should go
* delay: how many timesteps in the future the target should be
* min_index and max_index: indices in the data array that delimit which timesteps to draw from
* shuffle: whether to shuffle the samples or draw them in chronological order
* batch_size: the number of samples per batch
* step: the period, in timesteps, at which you sample data

In [5]:
def generator(data, lookback, delay, min_index, max_index,
              shuffle=False, batch_size=128, step=6):
    """Yield ``(samples, targets)`` batches of sliding windows, forever.

    Parameters
    ----------
    data : 2D array, shape (timesteps, features)
        Source series. Column 1 is read as the prediction target.
    lookback : int
        How many timesteps before the anchor row each input window covers.
    delay : int
        How many timesteps after the anchor row the target is taken from.
    min_index, max_index : int
        Bounds of the region of ``data`` to draw from. Windows never start
        before ``min_index`` and anchor rows are always below ``max_index``.
        ``max_index=None`` means ``len(data) - delay - 1``.
    shuffle : bool
        If true, anchor rows are drawn at random; otherwise they are visited in
        order and the generator wraps around after the last full batch.
    batch_size : int
        Number of windows per batch.
    step : int
        Sampling period, in timesteps, inside each window.

    Yields
    ------
    samples : ndarray, shape (batch, ceil(lookback / step), features)
    targets : ndarray, shape (batch,)

    Raises
    ------
    ValueError
        If the index range leaves no row with a complete lookback window.
    """
    if max_index is None:
        max_index = len(data) - delay - 1

    # Rows in [min_index + lookback, max_index) have a full window behind them.
    usable = max_index - min_index - lookback
    if usable <= 0:
        raise ValueError(
            'index range [%d, %d) is too short for lookback=%d'
            % (min_index, max_index, lookback))

    ## Shift the starting index so the sequential pass is made of whole batches
    nbatch = usable // batch_size
    shift = usable - nbatch * batch_size
    # Skip `shift - 1` leading rows, but never step back before the first row
    # that has a full window. Without the clamp, shift == 0 made the first
    # window start at min_index - 1, which for min_index == 0 is index -1 and
    # silently wraps around to the end of `data`.
    first_row = min_index + lookback + max(shift - 1, 0)
    # One past the last anchor row of the final full batch.
    stop = first_row + nbatch * batch_size

    # range(row - lookback, row, step) holds ceil(lookback / step) timesteps;
    # lookback // step is too small when step does not divide lookback.
    window_len = len(range(0, lookback, step))

    i = first_row
    while 1:
        if shuffle:
            rows = np.random.randint(
                first_row, max_index, size=batch_size)
        else:
            if i + batch_size > stop:
                i = first_row
            rows = np.arange(i, min(i + batch_size, max_index))
            i += len(rows)
        samples = np.zeros((len(rows),
                            window_len,
                            data.shape[-1]))
        targets = np.zeros((len(rows),))
        for j, row in enumerate(rows):
            samples[j] = data[row - lookback:row:step]
            targets[j] = data[row + delay][1]
        yield samples, targets

In [None]:
lookback = 1440
step = 6
delay = 144
batch_size = 128

# Training batches are drawn at random from rows [0, 200000).
train_gen = generator(
    float_data,
    lookback=lookback,
    delay=delay,
    min_index=0,
    max_index=200000,
    shuffle=True,
    step=step,
    batch_size=batch_size)
# Validation and test batches are produced in chronological order.
val_gen = generator(
    float_data,
    lookback=lookback,
    delay=delay,
    min_index=200001,
    max_index=300000,
    step=step,
    batch_size=batch_size)
test_gen = generator(
    float_data,
    lookback=lookback,
    delay=delay,
    min_index=300001,
    max_index=None,
    step=step,
    batch_size=batch_size)

# Number of batches to draw from each generator to cover its split once.
# These values are no longer overwritten with a fixed 10000, which made every
# validation pass loop over the same data many times.
val_steps = (300000 - 200001 - lookback) // batch_size
# generator() replaces max_index=None with len(data) - delay - 1, so the same
# upper bound is used here.
test_steps = (len(float_data) - delay - 1 - 300001 - lookback) // batch_size

In [None]:
def evaluate_naive_method():
    """Print the mean absolute error of a persistence baseline on ``val_gen``.

    For each of ``val_steps`` validation batches the prediction is column 1 of
    the last timestep in every window; it is compared with the batch targets.
    The batch index is printed every 1000 batches as a progress indicator, and
    the mean of the per-batch MAEs is printed at the end.
    """
    batch_maes = []
    for batch_idx in range(val_steps):
        if batch_idx % 1000 == 0:
            print(batch_idx)
        samples, targets = next(val_gen)
        last_values = samples[:, -1, 1]
        batch_maes.append(np.mean(np.abs(last_values - targets)))
    print(np.mean(batch_maes))


evaluate_naive_method()

In [None]:
# Convert a normalised MAE back to degrees Celsius by multiplying with the
# standard deviation of column 1 (temperature).
# NOTE(review): 0.29 is hard-coded - presumably the value printed by the naive
# baseline above; confirm it matches the current run.
temperature_std = std[1]
celsius_mae = 0.29 * temperature_std
celsius_mae

In [None]:
from keras.models import Sequential
from keras import layers
from keras.optimizers import RMSprop


# Densely connected baseline: flatten each (timesteps, features) window and
# regress a single value from it.
model = Sequential([
    layers.Flatten(input_shape=(lookback // step, float_data.shape[-1])),
    layers.Dense(32, activation='relu'),
    layers.Dense(1),
])

# Compilation and training are commented out in this cell:
# model.compile(optimizer=RMSprop(), loss='mae')
# history = model.fit_generator(train_gen, steps_per_epoch=500, epochs=20, validation_data=val_gen, 
#                                    validation_steps=val_steps)

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Defined here so the cell runs on a fresh kernel: `callbacks` is otherwise
# only created in the IMDB cell further down (same configuration).
callbacks = [EarlyStopping(monitor='val_loss', patience=2),
             ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]

# Single GRU layer followed by a one-unit regression head.
model = Sequential()
model.add(layers.GRU(32, input_shape=(None, float_data.shape[-1])))
model.add(layers.Dense(1))
model.compile(optimizer=RMSprop(), loss='mae')
history = model.fit_generator(train_gen, steps_per_epoch=500,
                              callbacks=callbacks,
                              epochs=20, validation_data=val_gen,
                              validation_steps=val_steps)



### Adding dropout and recurrent dropout to prevent overfitting

In [None]:
### Regularized GRU
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Defined here so the cell runs on a fresh kernel: `callbacks` is otherwise
# only created in the IMDB cell further down (same configuration).
callbacks = [EarlyStopping(monitor='val_loss', patience=2),
             ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]

model = Sequential()
# dropout is applied to the layer inputs, recurrent_dropout to the recurrent state.
model.add(layers.GRU(32,
                     dropout=0.2,
                     recurrent_dropout=0.2,
                     input_shape=(None, float_data.shape[-1])))
model.add(layers.Dense(1))
model.compile(optimizer=RMSprop(), loss='mae')
history = model.fit_generator(train_gen, steps_per_epoch=500,
                              callbacks=callbacks,
                              epochs=5,
                              validation_data=val_gen,
                              validation_steps=val_steps)
plot_history(history)

### Once overfitting has been dealt with, try adding another recurrent layer
Recurrent layer stacking is a classic way to build more-powerful RNNs.
Google translate is a stack of 7 large LSTM layers (huge)

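To stack recurrent layers:
Every intermediate recurrent layer must return its full sequence of outputs (a 3D tensor) instead of only the output at the last timestep; do this with `return_sequences=True`.
The last recurrent layer keeps the default and returns only its final output. The example below also passes an explicit `activation='relu'` to that last GRU layer; this is a modelling choice, not a requirement for stacking.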

In [None]:
### Regularized GRU, with additional layer
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Defined here so the cell runs on a fresh kernel: `callbacks` is otherwise
# only created in the IMDB cell further down (same configuration).
callbacks = [EarlyStopping(monitor='val_loss', patience=2),
             ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]

model = Sequential()
# The first GRU returns its whole output sequence so the second GRU has a
# sequence to consume.
model.add(layers.GRU(32,
                     dropout=0.2,
                     recurrent_dropout=0.2,
                     return_sequences=True,
                     input_shape=(None, float_data.shape[-1])))
model.add(layers.GRU(64, activation='relu',
                     dropout=0.1,
                     recurrent_dropout=0.5))
model.add(layers.Dense(1))
model.compile(optimizer=RMSprop(), loss='mae')
history = model.fit_generator(train_gen, steps_per_epoch=500,
                              callbacks=callbacks,
                              epochs=20, validation_data=val_gen,
                              validation_steps=val_steps)
plot_history(history)

### Bidirectional RNNs

Used FREQUENTLY in NLP! {So it's super important to me}
RNNs are order dependent. Shuffling / reversing the timesteps can completely change the representations learned by the RNN.

A bidirectional RNN consists of two regular RNNs (GRU or LSTM): one processes the input sequence in its original order, and the other processes it in the opposite direction.

Bidirectional RNNs can catch patterns that may be overlooked by unidirectional RNN

In [None]:
# To train on reversed sequences, all you need to do is flip the time axis of
# every batch of samples before yielding it.
def reverse_order_generator(gen):
    """Wrap a batch generator so every window is presented in reverse order.

    Parameters
    ----------
    gen : iterable of ``(samples, targets)`` pairs
        ``samples`` must be indexable along three axes; axis 1 is the one
        that gets reversed.

    Yields
    ------
    ``(samples[:, ::-1, :], targets)`` for every pair produced by ``gen``;
    ``targets`` is passed through unchanged.
    """
    for samples, targets in gen:
        yield samples[:, ::-1, :], targets

### However - this will UNDERPERFORM the previous method. The underlying GRU layer will typically be better at
### remembering the RECENT PAST than the DISTANT past. More recent data points are more predictive than older
### data points


### Reversed order RNN (IMDB example)

This is missing dropout and recurrent_dropout in the first LSTM layer

In [13]:
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras import layers
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop once val_loss has not improved for 2 epochs, and keep the best model on disk.
early_stopping = EarlyStopping(monitor='val_loss', patience=2)
checkpoint = ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)
callbacks = [early_stopping, checkpoint]

max_features = 10000
maxlen = 500

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Reverse every review, then pad/truncate each one to `maxlen` entries.
x_train = [review[::-1] for review in x_train]
x_test = [review[::-1] for review in x_test]

x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

# Embedding -> LSTM -> sigmoid unit for binary classification.
model = Sequential([
    layers.Embedding(max_features, 128),
    layers.LSTM(32),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

In [None]:
# Train on the reversed reviews, holding out 20% of the training data for validation.
fit_options = dict(epochs=10, batch_size=128, validation_split=0.2)
history = model.fit(x_train, y_train, **fit_options)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
 3072/20000 [===>..........................] - ETA: 2:14 - loss: 0.1139 - acc: 0.9583

### Bidirectional LSTM example

In [None]:
# Bidirectional wraps the LSTM so the sequence is processed in both directions.
model = Sequential()
model.add(layers.Embedding(max_features, 32))
model.add(layers.Bidirectional(layers.LSTM(32)))
# Activation identifiers are case-sensitive: 'Sigmoid' is not a known
# activation name, 'sigmoid' is.
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(x_train, y_train, epochs=10, batch_size=128, validation_split=0.2)

### Going further
Adjust the # of units in each recurrent layer in the stacked setup.
Adjust the learning rate used by the RMSprop optimizer
Try using LSTM layers instead of GRU layers
Try using a bigger densely connected regressor on top of the recurrent layers: bigger Dense Layer or stack of Dense Layers
Run the best-performing models against the test set to ensure no overfitting to validation set


### Wrapping up
* First establish common sense baseline. Dummy Regressors
* Try simple models before expensive ones
* If temporal order matters, use RNNs
* Add dropout to RNNs by using a time-constant dropout mask and a recurrent dropout mask (dropout, recurrent_dropout)
* Stacked RNNs provide more representational power, much more expensive, not always worth it. 
* Bidirectional RNNs are useful for natural language processing

# Important - Recurrent Attention and Sequence Masking (NLP)