# RNN Predict Observation at Next Time Step

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

## Prepare Input Data

In [3]:
# Read the training archive from disk.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so only
# load archives from a trusted source. It is presumably needed because the
# sequences are stored as variable-length lists -- confirm.
npzfile = np.load("data/train.npz", allow_pickle=True)
xraw_train, y_train = npzfile['x'], npzfile['y']  # raw sequences, labels
s = 4  # alphabet size

In [4]:
# pad_sequences turns the variable-length lists into one fixed-width array:
# sequences longer than 99 are clipped and shorter ones are filled at the end
# with -1. Shifting the whole array by +1 then makes 0 stand for "missing"
# and 1,2,3,4 stand for a,c,g,t.
x_train = 1 + tf.keras.preprocessing.sequence.pad_sequences(
    xraw_train,
    maxlen=99,
    padding="post",
    value=-1,
)

In [5]:
# Display the first encoded sequence; code 0 marks padded positions.
x_train[0, :]

array([2, 4, 4, 4, 3, 3, 3, 1, 3, 3, 2, 2, 1, 1, 3, 1, 4, 3, 3, 3, 4, 3,
       3, 1, 4, 2, 1, 2, 4, 4, 3, 1, 3, 3, 4, 2, 1, 3, 3, 1, 3, 4, 4, 2,
       3, 1, 3, 1, 2, 2, 1, 3, 2, 2, 4, 3, 3, 2, 2, 1, 1, 2, 1, 4, 3, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [7]:
# Report the array dimensions, then display the first two encoded sequences.
print("input shape: ", x_train.shape)
x_train[:2]

input shape:  (12138, 99)


array([[2, 4, 4, 4, 3, 3, 3, 1, 3, 3, 2, 2, 1, 1, 3, 1, 4, 3, 3, 3, 4, 3,
        3, 1, 4, 2, 1, 2, 4, 4, 3, 1, 3, 3, 4, 2, 1, 3, 3, 1, 3, 4, 4, 2,
        3, 1, 3, 1, 2, 2, 1, 3, 2, 2, 4, 3, 3, 2, 2, 1, 1, 2, 1, 4, 3, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [3, 3, 2, 2, 2, 4, 3, 3, 3, 2, 1, 3, 2, 2, 3, 4, 4, 3, 1, 1, 2, 4,
        4, 4, 2, 2, 2, 4, 2, 4, 2, 1, 4, 2, 1, 3, 2, 2, 2, 4, 3, 2, 2, 1,
        2, 2, 1, 3, 3, 2, 1, 3, 3, 1, 2, 4, 2, 1, 3, 1, 3, 3, 4, 2, 2, 2,
        2, 4, 4, 2, 2, 4, 2, 4, 3, 4, 2, 2, 2, 2, 4, 3, 2, 4, 3, 2, 1, 3,
        3, 4, 3, 3, 1, 1, 3, 1, 4, 3, 1]], dtype=int32)

### Count character frequencies in the training set

In [26]:
# Fraction of every code (0 = padding, 1..s = characters) over all positions
# of all training sequences. The one-hot depth is derived from the alphabet
# size s (s characters plus the padding code) instead of a hard-coded 5.
code_freqs = tf.reduce_mean(tf.one_hot(x_train, depth=s + 1), axis=[0, 1]).numpy()
# Drop the padding code and renormalise so the character frequencies sum to 1.
char_freqs = code_freqs[1:]
char_freqs /= np.sum(char_freqs)
print ("frequencies of characters", char_freqs)

frequencies of characters [0.25243348 0.25143555 0.24203248 0.2540985 ]


In [32]:
# Trivial baseline: a constant estimator that always predicts the empirical
# character frequencies. Its cross-entropy error equals the entropy (in nats,
# since np.log is the natural logarithm) of that distribution.
# NOTE: despite the variable name, this value is a cross-entropy (a loss,
# lower is better), not an accuracy.
baseline_accuracy = -np.sum(char_freqs * np.log(char_freqs))
print(f"baseline_accuracy = {baseline_accuracy:.4f}")

baseline_accuracy = 1.3861


## Make TF Dataset
Here, we do not need the labels (exon or not). Instead, we try to predict the $i$-th character from the input sequence up to position $i-1$. E.g., for the input
"attcac" the training target could be "ttcacg".

In [40]:
def future_sequence(sequence, alphabet_size=4):
    """Split a sequence into next-character prediction input and target.

    Args:
        sequence: 1-D sequence of integer codes, where 0 stands for a
            missing (padded) position and 1..alphabet_size for the characters.
        alphabet_size: number of distinct characters. Defaults to 4, the
            alphabet size used in this notebook.

    Returns:
        A pair ``(input_seq, target_seq)``. ``input_seq`` is ``sequence``
        without its last element. ``target_seq`` is a float32 tensor with one
        row per remaining position and ``alphabet_size`` columns: the one-hot
        encoding of the character that follows each input position. A
        following position that is padding (code 0) yields an all-zero row.
    """
    input_seq = sequence[:-1]
    target_seq = sequence[1:]
    # One-hot encode over all codes, including the padding code 0 ...
    target_seq = tf.one_hot(target_seq, depth=alphabet_size + 1)
    # ... then drop the padding column so only the real characters remain.
    target_seq = target_seq[:, 1:]
    return input_seq, target_seq

future_sequence([1,4,4,1,1,2,3])

([1, 4, 4, 1, 1, 2],
 <tf.Tensor: shape=(6, 4), dtype=float32, numpy=
 array([[0., 0., 0., 1.],
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.]], dtype=float32)>)

In [41]:
# Input pipeline, built as one chain:
#   1. one dataset element per row of x_train,
#   2. split each row into (input, one-hot target) with future_sequence,
#   3. shuffle with a buffer of 20000 elements,
#   4. group into batches of 32, dropping a final incomplete batch,
#   5. prefetch batches with an automatically tuned buffer size.
train_ds = (
    tf.data.Dataset.from_tensor_slices(x_train)
    .map(future_sequence)
    .shuffle(20000)
    .batch(32, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Peek at a single batch: a holds the inputs, b the one-hot targets.
for a, b in train_ds.take(1):
    print("shape of training example", a.shape, "output", b.shape)
    print("training example", a[0, :10], "output", b[0, :10])

shape of training example (32, 98) output (32, 98, 4)
training example tf.Tensor([3 3 1 4 4 3 1 4 4 4], shape=(10,), dtype=int32) output tf.Tensor(
[[0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]], shape=(10, 4), dtype=float32)


## Prepare Model

In [46]:
units = 64  # number of hidden units of the recurrent layer
rnn_simple = tf.keras.Sequential()
# Map the integer codes 0..s to 4-dimensional vectors; mask_zero=True treats
# code 0 (padding) as a masked position for the following layers.
rnn_simple.add(layers.Embedding(input_dim=s+1, output_dim=4, mask_zero=True))
rnn_simple.add(layers.LSTM(units, return_sequences=True))
# With return_sequences=True the LSTM outputs its hidden state at every position.
# Here, we want to transform them with a Dense layer with a units x s parameter matrix.
# The following tf.keras.layers.Dense layer is applied to all positions and produces
# a sequence output: at each position a softmax distribution over the s characters.
# Its width is taken from s so that it always matches the one-hot targets.
rnn_simple.add(layers.Dense(s, activation='softmax'))
rnn_simple.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 4)           20        
_________________________________________________________________
lstm (LSTM)                  (None, None, 64)          17664     
_________________________________________________________________
dense_2 (Dense)              (None, None, 4)           260       
Total params: 17,944
Trainable params: 17,944
Non-trainable params: 0
_________________________________________________________________


In [47]:
# Smoke test: apply the untrained model to the first ten sequences and
# inspect the shape of the result.
out = rnn_simple(x_train[:10])
out.shape

TensorShape([10, 99, 4])

## Training

In [48]:
# Categorical cross-entropy fits the one-hot targets; optimise with Adam.
rnn_simple.compile(loss='categorical_crossentropy', optimizer='adam')

In [49]:
# Train for 20 epochs on the batched dataset and keep the object returned by fit.
history = rnn_simple.fit(x=train_ds, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
