In [1]:
import time
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Alphabets used throughout the notebook; a symbol's integer code is its
# position in the corresponding list.
states = ['H', 'L']
nucleotides = ['A', 'C', 'G', 'T']

def encode_seq(symbols, seqtype='dna'):
    """Encode a sequence of symbols as an array of integer alphabet positions.

    Parameters
    ----------
    symbols : iterable of str
        The sequence to encode, e.g. 'GGGAAA' or 'HHHLLL'.
    seqtype : str
        'dna' selects the `nucleotides` alphabet; any other value selects
        `states`.

    Returns
    -------
    numpy.ndarray
        1-D int64 array with one code per input symbol. Empty input gives an
        empty int64 array.

    Raises
    ------
    ValueError
        If a symbol is not in the selected alphabet (raised by `list.index`).
    """
    alphabet = nucleotides if seqtype == 'dna' else states
    # Pin the dtype: left to inference, an empty input becomes float64 and the
    # integer width is platform-dependent, both of which break the
    # torch.LongTensor conversion done by callers.
    return np.array([alphabet.index(sym) for sym in symbols], dtype=np.int64)

# Fixtures (reused by later cells) and spot checks: the first and last codes
# produced by encode_seq must equal the symbols' positions in their alphabet.
test_hl = 'HHHLLL'
test_nuc = 'GGGAAA'
assert encode_seq(test_hl, seqtype='states')[0] == states.index(test_hl[0])
assert encode_seq(test_hl, seqtype='states')[-1] == states.index(test_hl[-1])
assert encode_seq(test_nuc, seqtype='dna')[0] == nucleotides.index(test_nuc[0])
assert encode_seq(test_nuc, seqtype='dna')[-1] == nucleotides.index(test_nuc[-1])

In [3]:
def decode_seq(num_array, seqtype='dna'):
    """Convert integer codes back into a string of alphabet symbols.

    `seqtype='dna'` looks codes up in `nucleotides`; any other value looks
    them up in `states`. Returns the symbols joined into a single string.
    """
    alphabet = nucleotides if seqtype == 'dna' else states
    return ''.join(alphabet[code] for code in num_array)

# Round trip: decoding an encoded sequence must reproduce the original string.
assert test_nuc == decode_seq(encode_seq(test_nuc, seqtype='dna'), seqtype='dna')
assert test_hl == decode_seq(encode_seq(test_hl, seqtype='hid'), seqtype='hid')

In [4]:
# Tab-separated training table; later cells read its `dna` and `hidden_state`
# columns.
training_data_file = 'rnn_toy_training.tsv'
training_df = pd.read_csv(training_data_file, delimiter='\t')
training_df.head(n=2)

Unnamed: 0,dna,hidden_state
0,TGGTCGTATTTTGTCGGGGGCAGACCAAAAAACAACGAAACGAATG...,LLLLLLLHHLLLHHLHHHLLHLLHHLLHLLHLHLLLLLHHHHLLLH...
1,GCACGGTGGATGTATCGCTGTGCAAGCAAGCCGGGATACTGCTTGT...,HHHHHHHHHHLHLHHHLLLHLHHLHHHHLLHLHLLLHHLLLLHHLH...


In [5]:
SEQ_LEN = int(training_df.dna.str.len().max())
NUM_SEQS = training_df.shape[0]
BATCH_SIZE = 1

# One row per sequence, one column per position; values are alphabet indices.
X_train = torch.zeros(NUM_SEQS, SEQ_LEN, dtype=torch.long)
Y_train = torch.zeros(NUM_SEQS, SEQ_LEN, dtype=torch.long)

# Enumerate positionally instead of using the index labels that iterrows()
# yields: those labels only coincide with tensor row numbers when the frame
# has a default 0..N-1 index.
# Each encoded sequence is written into a full row, so sequences are expected
# to have length SEQ_LEN.
for i, (dna, hid) in enumerate(zip(training_df['dna'], training_df['hidden_state'])):
    X_train[i, :] = torch.LongTensor(encode_seq(dna, seqtype='dna'))
    Y_train[i, :] = torch.LongTensor(encode_seq(hid, seqtype='hid'))

In [6]:
# (num sequences, sequence length) for the inputs and the labels.
X_train.size(), Y_train.size()

(torch.Size([1000, 500]), torch.Size([1000, 500]))

In [7]:
# Only the first 50 sequences are used for training; batches hold BATCH_SIZE
# sequences and are served in file order (no shuffling).
train_data = TensorDataset(X_train[:50], Y_train[:50])
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=False)

In [8]:
# Pull one batch to sanity-check the shapes the loader yields.
train_features, train_labels = next(iter(train_loader))
train_features.size(), train_labels.size()

(torch.Size([1, 500]), torch.Size([1, 500]))

In [9]:
class GRU(nn.Module):
    """Per-position sequence labeller: embedding -> GRU -> linear -> log-softmax.

    Parameters
    ----------
    input_size : int
        Number of distinct input symbols. Also used as the embedding width.
    hidden_size : int
        Width of the GRU hidden state (per direction).
    predict_size : int
        Number of output classes predicted at every position.
    n_layers : int
        Number of stacked GRU layers.
    bdir : bool
        If True the GRU is bidirectional.
    """

    def __init__(self, input_size, hidden_size, predict_size, n_layers=1, bdir=False):
        super().__init__()
        self.input_size = input_size
        self.embed_size = input_size
        self.hidden_size = hidden_size
        self.predict_size = predict_size
        self.n_layers = n_layers
        self.n_directions = 2 if bdir else 1

        self.embedding = nn.Embedding(input_size, self.embed_size)
        self.gru = nn.GRU(self.embed_size,
                          hidden_size,
                          num_layers=n_layers,
                          bidirectional=bdir)

        self.lin_out = nn.Linear(hidden_size*self.n_directions, predict_size)
        # Despite the attribute name this is a log-softmax over the class axis
        # (dim 2 of the (seq_len, batch_size, predict_size) output). The name
        # is kept so existing references to `model.sigmoid` keep working.
        self.sigmoid = nn.LogSoftmax(dim=2)

    def forward(self, input, hidden):
        """Return per-position class log-probabilities and the final hidden state.

        Parameters
        ----------
        input : integer tensor of shape (batch_size, seq_len)
            Symbol indices fed to the embedding layer.
        hidden : tensor of shape (n_layers*n_directions, batch_size, hidden_size)
            Initial hidden state, e.g. from `init_hidden`.

        Returns
        -------
        output : tensor of shape (seq_len, batch_size, predict_size)
            Log-probabilities over the classes at every position.
        hidden : tensor of shape (n_layers*n_directions, batch_size, hidden_size)
        """
        embedded = self.embedding(input)
        # embedded shape: (batch_size, seq_len, embed_size)
        # The GRU is not batch_first, so move the batch axis to position 1.
        output = torch.transpose(embedded, 0, 1)

        output, hidden = self.gru(output, hidden)
        # output shape: (seq_len, batch_size, n_directions*hidden_size)
        # hidden shape: (n_directions*n_layers, batch_size, hidden_size)

        output = self.sigmoid(self.lin_out(output))
        return output, hidden

    def init_hidden(self, batch_size=1):
        """Return an all-zero initial hidden state for `batch_size` sequences.

        The tensor takes the dtype and device of the model's parameters so it
        stays usable after `.double()`, `.to(device)`, etc.
        """
        ref = next(self.parameters())
        return torch.zeros(self.n_layers*self.n_directions,
                           batch_size,
                           self.hidden_size,
                           dtype=ref.dtype,
                           device=ref.device)

    def input_dims(self):
        """Print a description of the input layout.

        Note: `forward` consumes integer indices of shape (batch_size, seq_len);
        the last number printed is the vocabulary size handed to the embedding.
        """
        print(f'Input dimensions are: (batch_size, seq_len, {self.input_size})')

    def output_dims(self):
        """Print the layout of the tensor of log-probabilities `forward` returns."""
        print(f'Output dimensions are: (seq_len, batch_size, {self.predict_size})')

    def hidden_dims(self):
        """Print the layout of the hidden-state tensor."""
        dnl = self.n_layers*self.n_directions
        print(f'Hidden dimensions are: ({dnl}, batch_size, {self.hidden_size})')

In [10]:
# Build a throwaway model and print the tensor layouts it reports.
test_model = GRU(len(nucleotides), 10, len(states))
test_model.input_dims()
test_model.hidden_dims()
test_model.output_dims()

Input dimensions are: (batch_size, seq_len, 4)
Hidden dimensions are: (1, batch_size, 10)
Output dimensions are: (seq_len, batch_size, 2)


In [11]:
def train(train_loader, 
          learn_rate=0.02, 
          input_dim=len(nucleotides), 
          hidden_dim=10,
          output_dim=len(states),
          batch_size=1,
          EPOCHS=5):
    """Train a fresh GRU labeller on `train_loader` and return it.

    Parameters
    ----------
    train_loader : iterable of (inputs, labels) batches
        Both tensors are (batch, seq_len) integer tensors; `len(train_loader)`
        must be defined.
    learn_rate : float
        Adam learning rate.
    input_dim, hidden_dim, output_dim : int
        Passed to `GRU(input_dim, hidden_dim, output_dim)`.
    batch_size : int
        Kept for backward compatibility only. The initial hidden state is
        sized from each batch actually received, so this value is not used.
    EPOCHS : int
        Number of passes over `train_loader`.

    Returns
    -------
    GRU
        The trained model (left in training mode).

    Prints the mean per-batch loss and wall time after every epoch.
    """
    # Instantiating the model
    model = GRU(input_dim, hidden_dim, output_dim)

    # Defining loss function and optimizer
    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate)

    model.train()
    print("Starting training")
    epoch_times = []
    # Avoid a ZeroDivisionError in the per-epoch average for an empty loader.
    n_batches = max(len(train_loader), 1)
    # Start training loop
    for epoch in range(1, EPOCHS+1):
        start_time = time.time()
        total_loss = 0.0
        for sample_x, sample_y in train_loader:
            # Size the hidden state from the batch itself so loaders with any
            # batch size (or a short final batch) work.
            h = model.init_hidden(sample_x.size(0))
            optimizer.zero_grad()

            # the heart of the training!
            out, h = model(sample_x, h)

            # out is (seq_len, batch, classes); NLLLoss expects
            # (batch, classes, seq_len) against (batch, seq_len) targets.
            loss = criterion(out.permute(1, 2, 0), sample_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        elapsed = time.time() - start_time
        print(f"Epoch {epoch}/{EPOCHS} Done, Total Loss: {total_loss/n_batches:.3f}")
        print(f"Total Time Elapsed: {elapsed:.1f} seconds")
        epoch_times.append(elapsed)
    print(f"Total Training Time: {str(sum(epoch_times))} seconds")
    return model

In [None]:
# Fit the model on the training loader using the default architecture sizes.
gru_model = train(train_loader, EPOCHS=5, learn_rate=0.02)

Starting training
Epoch 1/5 Done, Total Loss: 0.672
Total Time Elapsed: 5.1 seconds
Epoch 2/5 Done, Total Loss: 0.668
Total Time Elapsed: 5.1 seconds


In [None]:
def predict(model, dna):
    """Predict the hidden-state track for one DNA sequence.

    Parameters
    ----------
    model : GRU
        A trained model.
    dna : str
        Sequence made up of symbols from `nucleotides`.

    Returns
    -------
    state : str
        The most probable state symbol at every position (same length as `dna`).
    out_probs : numpy.ndarray
        Array of shape (len(dna), number of states) holding the class
        probabilities at every position.

    Raises
    ------
    AssertionError
        If `dna` contains a symbol outside `nucleotides` or `model` is not a GRU.
    """
    assert all([x in nucleotides for x in dna])
    assert isinstance(model, GRU)
    if len(dna) == 0:
        # Nothing to run through the network.
        return '', np.empty((0, model.predict_size), dtype=np.float32)

    dna_encode = torch.LongTensor(encode_seq(dna, seqtype='dna'))
    dna_encode = dna_encode[None, :]  # add the batch axis: (1, seq_len)
    h = model.init_hidden(1)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        out, _ = model(dna_encode, h)

    # out is (seq_len, 1, n_states); drop the singleton batch axis.
    log_probs = out[:, 0]
    out_state_indices = log_probs.argmax(dim=1).tolist()
    # The model emits log-probabilities; exponentiate to get probabilities.
    out_probs = log_probs.exp().numpy()
    state = decode_seq(out_state_indices, 'hid')
    return state, out_probs

# Smoke test: one predicted state per input base, each drawn from `states`.
test_seq = 'GGGTTT'
test_state, test_probs = predict(gru_model, test_seq)
assert len(test_seq) == len(test_state)
assert all(x in states for x in test_state)

In [None]:
# Predict the state track for the last sequence in the table. The sequence is
# selected by column name (as the tensor-building cell does) so the lookup does
# not depend on column order or on extra leading columns in the file.
pred_hl, pred_prob_hl = predict(gru_model, training_df['dna'].iloc[-1])

def align(seq1, seq2, width=60):
    """Print two sequences stacked in fixed-width chunks with a mismatch track.

    For every chunk of `width` positions three lines are printed: `seq1`, a
    marker line with '*' under each position where the sequences differ (and
    ' ' where they agree), and `seq2`, followed by a blank line.

    Sequences of unequal length are accepted: positions present in only one of
    them are marked as mismatches and the longer sequence is printed in full.

    Parameters
    ----------
    seq1, seq2 : str
        The sequences to compare position by position.
    width : int
        Number of positions printed per chunk (default 60).

    Raises
    ------
    ValueError
        If `width` is smaller than 1.
    """
    if width < 1:
        raise ValueError('width must be a positive integer')

    total = max(len(seq1), len(seq2))
    shared = min(len(seq1), len(seq2))
    # Build the marker track in one pass; anything past the shorter sequence
    # has no counterpart and is flagged as a mismatch.
    match = ''.join(
        ' ' if pos < shared and seq1[pos] == seq2[pos] else '*'
        for pos in range(total)
    )

    for start in range(0, total, width):
        stop = start + width
        print('Seq1', seq1[start:stop])
        print('    ', match[start:stop])
        print('Seq2', seq2[start:stop])
        print()
# Compare the recorded hidden-state track of the last sequence with the
# prediction. The column is selected by name so the lookup does not depend on
# column order.
align(training_df['hidden_state'].iloc[-1], pred_hl)

In [None]:
# Class probabilities for the first four positions of the predicted sequence.
pred_prob_hl[:4]