# RNN Classifier

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader
from scripts.ml_helper import CodonDataset, amino_acids, codons, codon_from_output

In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

### Creating the Network

In [4]:
class RNN(nn.Module):
    """
    Simple (Elman-style) recurrent cell that is stepped manually, one time step per call.

    Each call to forward() consumes the input of a single time step together with the
    previous hidden state and returns log-probabilities over the output classes plus
    the new hidden state.
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int, batch_size=1) -> None:
        """
        input_size: Number of features of your input vector
        hidden_size: Number of hidden neurons
        output_size: Number of features of your output vector
        batch_size: Number of rows of the initial hidden state built by init_hidden()
        """
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size

        # i2h carries no bias of its own: its output is summed with h2h's output,
        # so the bias of h2h is the only bias in the pre-activation sum.
        self.i2h = nn.Linear(input_size, hidden_size, bias=False)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden_state) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Runs a single time step of the recurrence.
        Inputs
        ------
        input: Input vector of the current time step (last dimension: input_size)
        hidden_state: Previous hidden state (last dimension: hidden_size)
        Outputs
        -------
        out: Log-probabilities, i.e. LogSoftmax over dim 1 of the h2o projection.
             Because the output is already log-normalised, nn.NLLLoss is the matching
             criterion; nn.CrossEntropyLoss would apply log-softmax a second time.
        hidden_state: New hidden state, tanh(i2h(input) + h2h(hidden_state))
        """
        hidden_state = torch.tanh(self.i2h(input) + self.h2h(hidden_state))
        out = self.softmax(self.h2o(hidden_state))
        return out, hidden_state

    def init_hidden(self) -> torch.Tensor:
        """
        Returns an all-zero hidden state of shape (batch_size, hidden_size).

        The tensor is created with the dtype and device of the model's parameters so it
        can be fed to forward() directly after the model was moved or cast.
        """
        reference = self.h2h.weight
        return torch.zeros(
            self.batch_size,
            self.hidden_size,
            dtype=reference.dtype,
            device=reference.device,
            requires_grad=False,
        )


In [5]:
def train(model: RNN, data: DataLoader, epochs: int, optimizer: optim.Optimizer, loss_fn: nn.Module) -> None:
    """
    Trains the model for the specified number of epochs

    The model is moved to the module-level `device`. Every batch is unrolled one time
    step at a time, the per-step losses are summed, and a single optimizer step is
    taken per batch. Progress (roughly every quarter of an epoch) and the mean
    per-step loss of each epoch are printed.
    Inputs
    ------
    model: RNN model to train
    data: Iterable DataLoader yielding (aa_sequence, codon_sequence) pairs; the code
          indexes aa_sequence as (batch, step, feature) and codon_sequence as (batch, step)
    epochs: Number of epochs to train the model
    optimizer: Optimizer to use for each epoch
    loss_fn: Function to calculate loss
    """
    train_losses = {}
    model.to(device)

    model.train()
    print("=> Starting training")

    num_batches = len(data)
    # Report about four times per epoch. Clamp the interval to 1 so that loaders with
    # fewer than four batches do not cause a modulo-by-zero.
    report_every = max(1, num_batches // 4)

    for epoch in range(epochs):
        epoch_losses = []
        for num_data, (aa_sequence, codon_sequence) in enumerate(data, start=1):
            if num_data % report_every == 0:
                percentage = 5 * round(int(num_data / num_batches * 100) / 5)
                print(f'{percentage}%')

            seq_len = aa_sequence.shape[1]
            if seq_len == 0:
                # No time steps: there is no loss tensor to backpropagate and the
                # per-step average below would divide by zero.
                continue

            # send tensors to device
            aa_sequence, codon_sequence = aa_sequence.to(device), codon_sequence.to(device)
            hidden = model.init_hidden().to(device)
            if hidden.shape[0] != aa_sequence.shape[0]:
                # The last batch of an epoch can be smaller than model.batch_size;
                # size the initial state to the rows actually present.
                hidden = hidden.new_zeros(aa_sequence.shape[0], hidden.shape[1])

            # clear gradients
            model.zero_grad()
            loss = 0
            for i in range(seq_len):
                step_input = aa_sequence[:, i].reshape(aa_sequence.shape[0], aa_sequence.shape[2])
                out, hidden = model(step_input, hidden)
                loss += loss_fn(out, codon_sequence[:, i].long())
            # Backpropagate through all unrolled time steps of this batch
            loss.backward()
            # Clip the gradient norm to 3 to guard against exploding gradients
            # (clipping does nothing for vanishing gradients)
            nn.utils.clip_grad_norm_(model.parameters(), 3)
            # Adjust learnable parameters
            optimizer.step()

            # Track the loss averaged over time steps so sequences of different
            # length are comparable
            epoch_losses.append(loss.item() / seq_len)
        train_losses[epoch] = torch.tensor(epoch_losses).mean()
        print(f'=> epoch: {epoch + 1}, loss: {train_losses[epoch]}')

### Training

In [6]:
# Dataset configuration shared by both splits
organism = "E.Coli"
batch_size = 1
min_length, max_length = None, 600

# Build the train and test splits with identical organism / length filters
train_dataset, test_dataset = (
    CodonDataset(organism=organism, split=split_name, min_length=min_length, max_length=max_length)
    for split_name in ("train", "test")
)

# Number of sequences per split, one per line
print(len(train_dataset), len(test_dataset), sep="\n")

# Wrap both splits in shuffling loaders that use the configured batch size
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

2839
712


In [7]:
# Model
# Layer widths follow the sizes of the amino_acids and codons collections
input_dim = len(amino_acids)
output_dim = len(codons)
n_hidden = 128

rnnModel = RNN(input_size=input_dim, hidden_size=n_hidden, output_size=output_dim, batch_size=batch_size)
print(rnnModel)

# Train variables
epochs = 5
learning_rate = 0.001
# RNN.forward already ends in LogSoftmax, so NLLLoss is the matching criterion.
# CrossEntropyLoss expects raw logits and would apply log-softmax a second time.
loss = nn.NLLLoss()
optimizer = optim.RMSprop(rnnModel.parameters(), lr=learning_rate)

RNN(
  (i2h): Linear(in_features=22, out_features=128, bias=False)
  (h2h): Linear(in_features=128, out_features=128, bias=True)
  (h2o): Linear(in_features=128, out_features=65, bias=True)
  (softmax): LogSoftmax(dim=1)
)


In [8]:
# Run the training loop on the training split with the settings defined above
train(
    model=rnnModel,
    data=train_loader,
    epochs=epochs,
    optimizer=optimizer,
    loss_fn=loss,
)

=> Starting training
25%
50%
75%
100%
=> epoch: 1, loss: 0.49895545840263367
25%
50%
75%
100%
=> epoch: 2, loss: 0.4632096588611603
25%
50%
75%
100%
=> epoch: 3, loss: 0.46253812313079834
25%
50%
75%
100%
=> epoch: 4, loss: 0.4622006416320801
25%
50%
75%
100%
=> epoch: 5, loss: 0.4619610905647278
