In [None]:
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib.colors as clr

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from torch.autograd import Variable
import torch.optim as optim

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import TimeDistributed
from keras.layers.core import Dense, Activation, Dropout

Using TensorFlow backend.


# Import Data

In [None]:
# Colab-only step: upload the data file from the local machine and keep the
# value returned by files.upload() in `uploaded`.
# NOTE(review): the recorded output shows this upload being saved as
# 'rnn-challenge-data (2).npz', while the next cell loads
# 'rnn-challenge-data.npz' -- confirm the intended copy of the file is read.
from google.colab import files
uploaded = files.upload()

Saving rnn-challenge-data.npz to rnn-challenge-data (2).npz


In [None]:
# Pull the five arrays out of the archive while it is open. Only inputs are
# read for the test split; no test labels are loaded here.
with np.load('rnn-challenge-data.npz') as fh:
    x_train, y_train, x_val, y_val, x_test = (
        fh[key] for key in ('data_x', 'data_y', 'val_x', 'val_y', 'test_x')
    )

# One "shape dtype" line per array, in the order train -> validation -> test.
for split in (x_train, y_train, x_val, y_val, x_test):
    print(split.shape, split.dtype)

(400,) <U400
(400,) int64
(100,) <U1200
(100,) int64
(250,) <U2000


# Preprocess Data

In [None]:
# Alphabet of the input sequences, in order of first appearance. Every
# training sequence is scanned (not only the first one) so that a symbol
# missing from x_train[0] still ends up in the alphabet.
letter_dict = list(dict.fromkeys("".join(x_train)))
print(letter_dict)

# Distinct class labels, in order of first appearance in y_train.
label_dict = list(dict.fromkeys(y_train))
print(label_dict)


['C', 'T', 'A', 'G']
[2, 0, 4, 3, 1]


In [None]:
# Dictionary that maps integer codes to the characters of the alphabet.
# It is built from `letter_dict` rather than from the positions inside one
# sequence, so the codes are always 0 .. len(letter_dict) - 1 and do not rely
# on the length or the final characters of any particular string.
int2char = dict(enumerate(letter_dict))
# Inverse dictionary that maps characters to their integer codes.
char2int = {char: ind for ind, char in int2char.items()}
print(char2int)

{'C': 396, 'T': 397, 'A': 398, 'G': 399}
{'C': 0, 'T': 1, 'A': 2, 'G': 3}


In [None]:
# Length of the longest sequence in the training split.
maxlen = max(map(len, x_train))
print("The longest string has {} characters".format(maxlen))

The longest string has 400 characters


Now we can convert our input sequences to sequences of integers instead of characters by mapping them using the dictionaries we created above. This will allow us to one-hot-encode our input sequence subsequently.

In [None]:
# Translate every sequence, character by character, into its integer code.
# The sequences are kept at full length; nothing is trimmed at this stage.
x_train_integer = [[char2int[character] for character in sequence] for sequence in x_train]
x_val_integer = [[char2int[character] for character in sequence] for sequence in x_val]
x_test_integer = [[char2int[character] for character in sequence] for sequence in x_test]

# Show the beginning of the first sample of each split (train, validation,
# test) next to its integer encoding. Only a short prefix is printed so the
# longer validation/test sequences do not flood the output.
preview_len = 40
previews = (
    (x_train, x_train_integer),
    (x_val, x_val_integer),
    (x_test, x_test_integer),
)
for position, (raw, encoded) in enumerate(previews):
    if position:
        print('---------------------------------')
    print(raw[0][:preview_len])
    print(encoded[0][:preview_len])

CTAGCTGAGCTACTGAGCTACAGTTGACTGACCAGTCAGTGCTAGCTACTGACAGTCTGACAGTTGACCTGACTGATGACCAGTCTAGCAGTGCTACTAGCTAGGCTACAGTCAGTTGACCAGTCTGACAGTCAGTCTGACTGACAGTCAGTCTAGGCTATGACCTGACTGATGACCTGACTGACTGACAGTCTGACTGATGACGCTATGACCTGACTAGCTAGCAGTTGACTGACCTGACAGTGCTACTAGCAGTTGACCAGTGCTACAGTCTGATGACTGACCTGACAGTCTAGGCTACAGTTGACCTGACAGTCAGTGCTACTGACAGTCTAGTGACCAGTCAGTCAGTTGACCTGACTAGCAGTTGACGCTATGACCAGTCTGACAGTGCTACTAG
[0, 1, 2, 3, 0, 1, 3, 2, 3, 0, 1, 2, 0, 1, 3, 2, 3, 0, 1, 2, 0, 2, 3, 1, 1, 3, 2, 0, 1, 3, 2, 0, 0, 2, 3, 1, 0, 2, 3, 1, 3, 0, 1, 2, 3, 0, 1, 2, 0, 1, 3, 2, 0, 2, 3, 1, 0, 1, 3, 2, 0, 2, 3, 1, 1, 3, 2, 0, 0, 1, 3, 2, 0, 1, 3, 2, 1, 3, 2, 0, 0, 2, 3, 1, 0, 1, 2, 3, 0, 2, 3, 1, 3, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 3, 0, 1, 2, 0, 2, 3, 1, 0, 2, 3, 1, 1, 3, 2, 0, 0, 2, 3, 1, 0, 1, 3, 2, 0, 2, 3, 1, 0, 2, 3, 1, 0, 1, 3, 2, 0, 1, 3, 2, 0, 2, 3, 1, 0, 2, 3, 1, 0, 1, 2, 3, 3, 0, 1, 2, 1, 3, 2, 0, 0, 1, 3, 2, 0, 1, 3, 2, 1, 3, 2, 0, 0, 1, 3, 2, 0, 1, 3, 2, 0, 1, 3, 2, 0, 2, 3, 1, 0, 1, 3, 2, 0, 1, 3, 2

Before encoding our input sequence into one-hot vectors, we'll define 3 key variables:

-dict_size: The number of unique characters that we have in our text
This will determine the one-hot vector size as each character will have an assigned index in that vector

-seq_len: The length of the sequences that we're feeding into the model
In this notebook it is set to the length of the longest training sequence (`maxlen`); no character is removed from the input sequences.

-batch_size: The number of sequences in the array being encoded (each data split is encoded in a single call, so this is the size of that split)

In [None]:
# One-hot width is the alphabet size; encoded length is the longest training
# sequence.
dict_size, seq_len = len(letter_dict), maxlen

def one_hot_encode(sequence, dict_size, seq_len, batch_size):
    """One-hot encode a batch of integer-coded sequences.

    Args:
        sequence: Indexable collection holding at least ``batch_size``
            sequences of integer codes; each code must be a valid index
            into the last axis (size ``dict_size``).
        dict_size: Width of every one-hot vector.
        seq_len: Number of positions encoded per sequence. Sequences longer
            than this are cut off after ``seq_len`` positions; shorter ones
            are left zero-padded at the end instead of raising IndexError.
        batch_size: Number of sequences to encode.

    Returns:
        A float32 array of shape ``(batch_size, seq_len, dict_size)``.

    Side effects:
        Prints the shape and dtype of the array that is being filled.
    """
    # Start from all zeros with the desired output shape.
    features = np.zeros((batch_size, seq_len, dict_size), dtype=np.float32)

    print(features.shape)
    print(features.dtype)

    # Set the entry at each character's index to 1. Slicing to seq_len bounds
    # the walk for long sequences and lets short ones stop early.
    for row in range(batch_size):
        for step, code in enumerate(sequence[row][:seq_len]):
            features[row, step, code] = 1
    return features

In [None]:
# Replace each integer-coded split by its one-hot array. one_hot_encode only
# reads the first `seq_len` positions of every sequence, so anything beyond
# that length in the validation/test sequences is not encoded.
# NOTE(review): the names are overwritten in place, so this cell cannot be
# re-run without first re-running the integer-encoding cell.
x_train_integer = one_hot_encode(
    sequence=x_train_integer,
    dict_size=dict_size,
    seq_len=seq_len,
    batch_size=len(x_train),
)
x_val_integer = one_hot_encode(
    sequence=x_val_integer,
    dict_size=dict_size,
    seq_len=seq_len,
    batch_size=len(x_val),
)
x_test_integer = one_hot_encode(
    sequence=x_test_integer,
    dict_size=dict_size,
    seq_len=seq_len,
    batch_size=len(x_test),
)

(400, 400, 4)
float32
(100, 400, 4)
float32
(250, 400, 4)
float32


In [None]:
# Number of distinct target classes.
labels = 5


def y_hot_encode(y):
    """Turn an iterable of integer class labels into one-hot rows.

    Each label becomes a list of ``labels`` zeros with a single 1 at the
    label's index. Returns a list with one such row per element of ``y``.
    """
    def _one_hot(label):
        # Fresh row per label so that rows never share state.
        row = [0] * labels
        row[label] = 1
        return row

    return [_one_hot(label) for label in y]

In [None]:
# One-hot versions of the training and validation labels.
# NOTE(review): the visible cells below train and evaluate with the integer
# labels (y_train / y_val) directly -- confirm these one-hot copies are needed.
y_train_integer = y_hot_encode(y=y_train)
y_val_integer = y_hot_encode(y=y_val)

# RNN Model

In [None]:
# Hyperparameters. The dimensions are taken from the preprocessing results so
# the network always matches the encoded data instead of repeating literals.
word_vec_length = seq_len    # number of time steps per input sequence
char_vec_length = dict_size  # width of each one-hot character vector
output_labels = labels       # number of target classes

print(f"The input vector will have the shape {word_vec_length}x{char_vec_length}.")
# Hidden layer size: two thirds of the flattened input size, rounded down.
hidden_nodes = int(2/3 * (word_vec_length * char_vec_length))
print(f"The number of hidden nodes is {hidden_nodes}.")

The input vector will have the shape 400x4.
The number of hidden nodes is 1066.


In [None]:
# Build the model: a single LSTM layer that returns only its final output,
# followed by dropout and a softmax classification head.
print('Build model...')
model = Sequential([
    LSTM(hidden_nodes, return_sequences=False, input_shape=(word_vec_length, char_vec_length)),
    Dropout(0.2),
    Dense(units=output_labels),
    Activation('softmax'),
])
# The sparse loss takes the integer class labels directly, so y_train / y_val
# are passed to fit() without one-hot encoding.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])

batch_size = 100
model.fit(
    x_train_integer,
    y_train,
    batch_size=batch_size,
    epochs=40,
    validation_data=(x_val_integer, y_val),
)

Build model...
Train on 400 samples, validate on 100 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40

In [None]:
# Score the trained model on the validation split; evaluate() yields the loss
# followed by the compiled metric.
val_metrics = model.evaluate(x_val_integer, y_val)
loss, accuracy = val_metrics
print(loss, accuracy)

3.396847128868103 0.28999999165534973
