In [26]:
import numpy as np
import pandas as pd
from Bio import Entrez
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation

import time
import random
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import sklearn
import skorch
from skorch import NeuralNetClassifier

from skorch.utils import params_for
from torch.autograd import Variable

Takeaways: 

Helper functions

In [2]:
def get_list_of_codons(dna_seq):
    codons = []
    for i in range(0, len(dna_seq), 3):
        codons.append(dna_seq[i:i+3])
    return codons
assert get_list_of_codons('ATGCCCGGGAAATTTTAG') == ['ATG', 'CCC', 'GGG', 'AAA', 'TTT', 'TAG']

In [3]:
u_aas = set()
u_codons = set()
all_codons = []
initial_states = []
max_seq_len = 0
num_seqs = 0
emissions = {}
for record in SeqIO.parse('GCF_000009045.1_ASM904v1_genomic.gbff', "genbank"):
    for feature in record.features:
        if feature.type == 'CDS' and 'translation' in feature.qualifiers:
            num_seqs += 1
            protein = feature.qualifiers['translation'][0] + '*#'
            if len(protein) > max_seq_len:
                max_seq_len = len(protein)
            aas = set([aa for aa in protein])
            codon = get_list_of_codons(str(feature.extract(record.seq)) + 'PAD')
            all_codons.append(codon)
            initial_states.append(codon[0])
            u_aas = u_aas.union(aas)
            u_codons = u_codons.union(set(codon))
            for i, cdn in enumerate(codon):
                emissions[cdn] = protein[i]
lu_aas = list(u_aas)
lu_codons = list(u_codons)

In [4]:
def encode_seq(seq_obj, seqtype='dna'):
    encdr = lu_codons
    symbols = get_list_of_codons(seq_obj)
    if seqtype != 'dna':
        encdr = lu_aas
        symbols = [c for c in seq_obj]
    outseq = np.array([encdr.index(s) for s in symbols])
    return outseq

test_aa = 'MENILD'
test_nuc = 'AAAAAAATAAGATAG'
assert encode_seq(test_aa, seqtype='prot')[0] == lu_aas.index(test_aa[0]) and \
       encode_seq(test_aa, seqtype='prot')[-1] == lu_aas.index(test_aa[-1])
assert encode_seq(test_nuc, seqtype='dna')[0] == lu_codons.index(test_nuc[0:3]) and \
       encode_seq(test_nuc, seqtype='dna')[-1] == lu_codons.index(test_nuc[-3:])

def decode_seq(num_array, seqtype='dna'):
    encdr = lu_codons
    if seqtype != 'dna':
        encdr = lu_aas
    outseq = [encdr[s] for s in num_array]
    return ''.join(outseq)

assert decode_seq(encode_seq(test_nuc)) == test_nuc
assert decode_seq(encode_seq(test_aa, seqtype='prot'), seqtype='prot') == test_aa

In [5]:
X_train = torch.zeros(max_seq_len, num_seqs, dtype=torch.long)
Y_train = torch.zeros(max_seq_len, num_seqs, dtype=torch.long)
cnt = 0
for record in SeqIO.parse('GCF_000009045.1_ASM904v1_genomic.gbff', "genbank"):
    for feature in record.features:
        if feature.type == 'CDS' and 'translation' in feature.qualifiers:
            protein = feature.qualifiers['translation'][0] + '*'
            missing_length = max_seq_len - len(protein)
            protein = protein + ''.join(['#' for i in range(missing_length)])
            prot_encode = torch.LongTensor(encode_seq(protein, seqtype='prot'))
            
            dna = str(feature.extract(record.seq))
            dna = dna + ''.join(['PAD' for i in range(missing_length)])            
            dna_encode = torch.LongTensor(encode_seq(dna, seqtype='dna'))
            
            X_train[:, cnt] = prot_encode
            Y_train[:, cnt] = dna_encode
            cnt += 1

In [16]:
batch_size = 10
train_data = TensorDataset(X_train, Y_train)
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

In [48]:
X_train.shape, Y_train.shape

(torch.Size([5490, 4237]), torch.Size([5490, 4237]))

Build model

In [40]:
class codonGRU(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1, bdir=True):
        super(codonGRU, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.n_directions = 2 if bdir else 1
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=n_layers, bidirectional=bdir)
        self.lin_out = nn.Linear(hidden_size*self.n_directions, hidden_size)
        self.sigmoid = nn.Softmax(dim=2)
        
    def forward(self, input, hidden):
        embedded = self.embedding(input)

        # GRU input dimensions: seq_len, batch, num_features
        output = embedded.view(embedded.shape[0], 1, -1)
        output, hidden = self.gru(output, hidden)

        # GRU output dimensions: seq_len, batch, num_directions * hidden_size
        # and hidden dimensions: num_layers * num_directions, batch, hidden_size
        output = self.sigmoid(self.lin_out(output))
        return output, hidden

    def init_hidden(self, batch_size=1):
        return torch.zeros(self.n_layers*self.n_directions, batch_size, self.hidden_size)

In [41]:
# QC ... checking tensor dimensions and information flow
cgru = codonGRU(len(lu_aas), len(lu_codons), n_layers=2)
x = X_train[:, 0]

o, h = cgru.forward(x, cgru.init_hidden())

In [11]:
o.shape, X_train.shape

(torch.Size([5490, 1, 65]), torch.Size([5490, 4237]))

In [84]:
def train(train_loader, 
          learn_rate=0.02, 
          input_dim=len(lu_aas), 
          hidden_dim=len(lu_codons), 
          EPOCHS=5):
    
    # Instantiating the model
    model = codonGRU(input_dim, hidden_dim, n_layers=1)
    
    # Defining loss function and optimizer
    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate)
    batch_size = 1
    
    model.train()
    print("Starting Training")
    epoch_times = []
    # Start training loop
    for epoch in range(1,EPOCHS+1):
        start_time = time.time()
        h = model.init_hidden(batch_size)
        avg_loss = 0.
        counter = 0
        for i in range(epoch, epoch+10):
            x = X_train[:, i]
            y = Y_train[:, i]
            
            counter += 1
            h = h.data
            model.zero_grad()
            
            out, h = model(x, h)
            out = out.view(y.shape[0], -1)

            loss = criterion(out, y)
            loss.backward()
            optimizer.step()
            avg_loss += loss.item()
            
        current_time = time.time()
        print(f"Epoch {epoch}/{EPOCHS} Done, Total Loss: {avg_loss/len(train_loader)}")
        print(f"Total Time Elapsed: {str(current_time-start_time)} seconds")
        epoch_times.append(current_time-start_time)
    print(f"Total Training Time: {str(sum(epoch_times))} seconds")
    return model

# def evaluate(model, test_x, test_y, label_scalers):
#     model.eval()
#     outputs = []
#     targets = []
#     start_time = time.clock()
#     for i in test_x.keys():
#         inp = torch.from_numpy(np.array(test_x[i]))
#         labs = torch.from_numpy(np.array(test_y[i]))
#         h = model.init_hidden(inp.shape[0])
#         out, h = model(inp.to(device).float(), h)
#         outputs.append(label_scalers[i].inverse_transform(out.cpu().detach().numpy()).reshape(-1))
#         targets.append(label_scalers[i].inverse_transform(labs.numpy()).reshape(-1))
#     print("Evaluation Time: {}".format(str(time.clock()-start_time)))
#     sMAPE = 0
#     for i in range(len(outputs)):
#         sMAPE += np.mean(abs(outputs[i]-targets[i])/(targets[i]+outputs[i])/2)/len(outputs)
#     print("sMAPE: {}%".format(sMAPE*100))
#     return outputs, targets, sMAPE

In [85]:
gru_model = train(train_loader, learn_rate = 0.05, EPOCHS=10)

Starting Training
Epoch 1/10 Done, Total Loss: -0.01464368104391845
Total Time Elapsed: 53.258097887039185 seconds
Epoch 2/10 Done, Total Loss: -0.01722062882612746
Total Time Elapsed: 54.27755665779114 seconds
Epoch 3/10 Done, Total Loss: -0.017212438670229608
Total Time Elapsed: 54.48648500442505 seconds
Epoch 4/10 Done, Total Loss: -0.017304462704719307
Total Time Elapsed: 54.45605516433716 seconds
Epoch 5/10 Done, Total Loss: -0.017316712924908637
Total Time Elapsed: 54.482171297073364 seconds
Epoch 6/10 Done, Total Loss: -0.017383593244847922
Total Time Elapsed: 54.19365048408508 seconds
Epoch 7/10 Done, Total Loss: -0.017572209266148414
Total Time Elapsed: 54.20985746383667 seconds
Epoch 8/10 Done, Total Loss: -0.017616719278915765
Total Time Elapsed: 54.61381912231445 seconds
Epoch 9/10 Done, Total Loss: -0.01759881476018381
Total Time Elapsed: 54.07023644447327 seconds
Epoch 10/10 Done, Total Loss: -0.017668927410695506
Total Time Elapsed: 54.415030002593994 seconds
Total Train

Testing

In [62]:
protein = ''
dna = ''
for record in SeqIO.parse('GCF_000009045.1_ASM904v1_genomic.gbff', "genbank"):
    for feature in record.features:
        if feature.type == 'CDS' and 'translation' in feature.qualifiers:
            protein = Seq(feature.qualifiers['translation'][0] + '*')
            dna = feature.extract(record.seq)
            break

In [63]:
protein

Seq('MENILDLWNQALAQIEKKLSKPSFETWMKSTKAHSLQGDTLTITAPNEFARDWL...LK*')

In [86]:
prot_e = encode_seq(protein, seqtype='prot')
protein = protein + ''.join(['#' for i in range(missing_length)])
prot_encode = torch.LongTensor(encode_seq(protein, seqtype='prot')).reshape(len(protein), 1, -1)
prot_encode

tensor([[[19]],

        [[ 5]],

        [[ 9]],

        ...,

        [[12]],

        [[12]],

        [[12]]])

In [87]:
codon_probs, _ = cgru.forward(prot_encode, cgru.init_hidden())
codon_probs
# mle_dna = decode_seq(mle_dna_indices)

tensor([[[0.0148, 0.0137, 0.0158,  ..., 0.0156, 0.0161, 0.0181]],

        [[0.0144, 0.0127, 0.0147,  ..., 0.0159, 0.0153, 0.0169]],

        [[0.0132, 0.0115, 0.0152,  ..., 0.0152, 0.0139, 0.0162]],

        ...,

        [[0.0146, 0.0099, 0.0262,  ..., 0.0175, 0.0204, 0.0150]],

        [[0.0144, 0.0104, 0.0254,  ..., 0.0178, 0.0200, 0.0151]],

        [[0.0142, 0.0112, 0.0235,  ..., 0.0178, 0.0196, 0.0154]]],
       grad_fn=<SoftmaxBackward>)

In [88]:
codon_probs[:,0,:].argmax(axis=1).shape

torch.Size([16782])

In [89]:
def decode_probs(tensor_probs):
    indices = tensor_probs[:,0,:].argmax(axis=1)
    seq = decode_seq(indices)
    print(seq)
decode_probs(codon_probs)

TCGGATCATCATCATGATCTTCCCCCCCCGCCGGATCTTCTTGATGATGATGATAAGAAGAAGAGGAGGGTCCATCATCATGATGATGATGATGATGATCATCATCATAAGAGGAGGAGGCTTCTTGATGATGATGATGATGATCCTATTTAGGATCATGATGATAAGGATGATGATCATGATGATGATGATCTTCTTGATGATGATCTTCTTCTTGATCTTAAGGATGATGATGATGTAGTAGTAAGGAGGAGGAGGAGGAGGTGTTCGAGGGATAGGTGTTGTAGGAATAGGAGGAGGAGGAGGAGGAGGTGTTGTGTAAGGCATCATGATCCCGATGATGATTGTTGTTGTAGGTGTGATGTAAGGAAGAGGAAAAAACCTCCTCCTCCTCCTCCTGTCGTCGTCGTCGTCTCTTAATAAGATGATGATAGCGATGATGATGATGATAGGAGGAGGAGGAGGAGGATTCATCATCATCATAAGATTTCGTCGTCGGATGATGATTCGTCGCCCCCCCCCGATGATGATGATGATCTTACTCCCGATGATGTCGGGCCCCCCCATCATCCCCCCGATAAGCCTCCCCCTCCTCCTCCTCCTAAGAAGAAGAAAAAACCCCGGCCTGATCTTCTTCTTCTTCTTGTAGATGATGATGATAGGAGGGTAGTAAGGAGGAGGTGTTGTTGTCATGGGGCCCATCATCATCATGATGATAAGAAGGATGATGATGTAGTAAAGGATAGGAGGAGGGATGATGATGATGATGATGATGATCGCAAGAAGAAGAAGTAGTAGCATATTGATGATGATCTTCTTCTTCTTGATGATGATGATGATCGCGATGATGATGATAAGGATAAAGTCGTCGTCCCGAGGCTTCTTCTTCTTCATCATGATGATGATGATGATGATCTTCTTCTTCTTCCCCATCATGATGATGATGATATTATTCTTGATAAGGATGATGATGATGATCCCGATGATGATGATGATGATC

In [None]:
Seq(mle_dna).translate()

In [None]:
# from Bio.HMM.Utilities import pretty_print_prediction

# pretty_protein = ''.join([x + '  ' for x in protein])
# pretty_print_prediction(pretty_protein, 
#                         dna, 
#                         mle_dna)

In [None]:
state_probabilities = hmm.predict_proba(prot_e.reshape(-1, 1))
state_probabilities[4,]