# RNN Classifier

In [1]:
%load_ext autoreload
%autoreload 2

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from scripts.ml_helper import CodonDataset, amino_acids, codons, codon_from_output

### Data Preparation

In [16]:
n_amino_acids = len(amino_acids)
n_codons = len(codons)

In [6]:
ecoli_dataset = CodonDataset("E.Coli")
generator = torch.Generator().manual_seed(42)
ecoli_train_set, ecoli_val_set = torch.utils.data.random_split(ecoli_dataset, [0.8, 0.2], generator=generator)

In [7]:
print(len(ecoli_train_set))
print(len(ecoli_val_set))

3085
771


In [11]:
dataloader = DataLoader(ecoli_dataset, batch_size=1, shuffle=True, num_workers=0)

for i_batch, sample_batched in enumerate(dataloader):
    print(i_batch, sample_batched)

    # observe 4th batch and stop.
    if i_batch == 3:
        break

0 [tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 1]]]), tensor([[35,  4, 24, 38, 33, 28, 56, 42, 27, 42, 19, 42, 55, 28, 53, 44, 43, 32,
         27, 61, 27, 49, 51, 55, 17, 43, 42, 35, 17, 57, 59, 23, 25, 58, 13, 52,
         54, 48,  2, 26, 27, 32, 52, 52, 33, 28, 61, 55, 50, 41, 60, 19, 35, 31,
         58, 51, 32, 42, 60, 24, 19, 39, 58, 25, 33, 48, 25, 27, 63, 56, 59, 18,
         42, 28, 58, 58, 56, 19, 56, 49, 48, 19, 43, 51, 19, 56,  6,  8, 33, 42,
         10]])]
1 [tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 1]]]), tensor([[35, 28, 58, 26, 33, 42, 27, 56, 33, 56, 19, 32, 59, 32,  2,  0,  8, 19,
         43, 42, 43, 32, 28, 48, 33, 1

### Creating the Network

In [17]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        hidden = F.tanh(self.i2h(input) + self.h2h(hidden))
        output = self.h2o(hidden)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)


n_hidden = 128
rnn = RNN(n_amino_acids, n_hidden, n_codons)

In [32]:
aa, c = ecoli_train_set[0]
hidden = rnn.initHidden()
output, next_hidden = rnn(aa, hidden)
output

tensor([[-4.1084, -4.0406, -4.0875,  ..., -4.1056, -4.0902, -4.2256],
        [-4.1635, -4.1685, -4.1552,  ..., -4.3154, -4.0351, -4.1656],
        [-4.0903, -4.0890, -4.0503,  ..., -4.2119, -4.0961, -4.1502],
        ...,
        [-4.2334, -4.1264, -4.0462,  ..., -4.2003, -4.2030, -4.2022],
        [-4.2334, -4.1264, -4.0462,  ..., -4.2003, -4.2030, -4.2022],
        [-4.0909, -4.2354, -4.0652,  ..., -4.2347, -4.1591, -4.1950]],
       grad_fn=<LogSoftmaxBackward0>)

### Training

In [33]:
import random


def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

def randomTrainingExample():
    aa, c = randomChoice(ecoli_train_set)
    return aa, c

In [34]:
criterion = nn.NLLLoss()
learning_rate = 0.005  # If you set this too high, it might explode. If too low, it might not learn


def train(codon_tensor, aa_tensor):
    hidden = rnn.initHidden()

    rnn.zero_grad()

    for i in range(aa_tensor.size()[0]):
        output, hidden = rnn(aa_tensor[i], hidden)

    loss = criterion(output, codon_tensor)
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item()

In [37]:
import time
import math

n_iters = 100_000
print_every = 5000
plot_every = 1000

# Keep track of losses for plotting
current_loss = 0
all_losses = []


def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


start = time.time()

for iter in range(1, n_iters + 1):
    aa, c = randomTrainingExample()
    output, loss = train(c, aa)
    current_loss += loss

    # Print ``iter`` number, loss, name and guess
    if iter % print_every == 0:
        guess, guess_i = codon_from_output(output)
        correct = '✓' if guess == c else '✗ (%s)' % c
        print(
            '%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss    , guess, correct))

    # Add current loss avg to list of losses
    if iter % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0

ValueError: Expected input batch_size (1) to match target batch_size (404).