In [1]:
import gzip
import os
import shutil
import urllib.request

import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation

import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

All of this beginning stuff looks familiar. We read in a bunch of proteins as test data.

The purpose of this notebook is to demonstrate how to group proteins into batches. Training models on batches of data tends to make model learning "smoother". That is, the model updates its weights to maximize performance on the entire batch, rather than a single example, thus better approximating the learning trend we are seeking on the whole data set. 

The operations of "padding" and "packing" are not well documented by PyTorch, so I created the following example based on this very helpful blog post by [Sia Xin Yun Suzanna](https://suzyahyah.github.io/pytorch/2019/07/01/DataLoader-Pad-Pack-Sequence.html).

In [2]:
# Gzipped GenBank flat file from the NCBI genomes FTP site, plus the local
# names used for the downloaded archive and its decompressed copy.
url = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/009/045/GCF_000009045.1_ASM904v1/GCF_000009045.1_ASM904v1_genomic.gbff.gz'
gz_fn = 'GCF_000009045.1_ASM904v1_genomic.gbff.gz'
gbk_fn = gz_fn.replace('.gbff.gz', '.gbk')

# Download only when the archive is not already on disk.
# The data is written to a temporary name and renamed on success, so an
# interrupted download never leaves a truncated file that the isfile() guard
# would accept on the next run.
if not os.path.isfile(gz_fn):
    print('Beginning file download with urllib...')
    gz_tmp = gz_fn + '.part'
    urllib.request.urlretrieve(url, gz_tmp)
    os.replace(gz_tmp, gz_fn)

# Decompress only when the GenBank file is not already on disk (same
# write-then-rename scheme as above).
if not os.path.isfile(gbk_fn):
    gbk_tmp = gbk_fn + '.part'
    with gzip.open(gz_fn, 'rb') as f_in, open(gbk_tmp, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.replace(gbk_tmp, gbk_fn)

def get_list_of_codons(dna_seq):
    """Split a sequence into consecutive, non-overlapping chunks of three items.

    The last chunk holds fewer than three items when ``len(dna_seq)`` is not
    a multiple of three; an empty input gives an empty list.
    """
    chunk_starts = range(0, len(dna_seq), 3)
    return [dna_seq[start:start + 3] for start in chunk_starts]


assert get_list_of_codons('ATGCCCGGGAAATTTTAG') == ['ATG', 'CCC', 'GGG', 'AAA', 'TTT', 'TAG']

In [3]:
# Per-CDS collections; entry k of each list comes from the same CDS feature.
host_proteins = []    # amino-acid string with '*' appended
host_codons = []      # list of codon strings

u_aas = set()         # every amino-acid symbol seen (including '*')
u_codons = set()      # every codon string seen
all_codons = []       # receives the same codon lists as host_codons
initial_states = []   # first codon of every CDS
emissions = {}        # codon -> amino acid; a later occurrence overwrites an earlier one
for record in SeqIO.parse(gbk_fn, "genbank"):
    for feature in record.features:
        # Only coding sequences that carry an annotated translation are used.
        if feature.type != 'CDS' or 'translation' not in feature.qualifiers:
            continue

        protein = feature.qualifiers['translation'][0] + '*'
        codon = get_list_of_codons(str(feature.extract(record.seq)))

        host_proteins.append(protein)
        host_codons.append(codon)
        all_codons.append(codon)
        initial_states.append(codon[0])

        # In-place updates avoid rebuilding the whole set for every CDS.
        u_aas.update(protein)
        u_codons.update(codon)

        # zip() stops at the shorter sequence, so a CDS with more codons than
        # translated residues cannot raise IndexError here.
        for cdn, aa in zip(codon, protein):
            emissions[cdn] = aa

# Sort the vocabularies: iteration order of a set of strings changes between
# interpreter runs, which would silently change every symbol's index.
# Index 0 holds a placeholder symbol ('0' / 'PAD'), presumably reserved for padding.
lu_aas = ['0'] + sorted(u_aas)
lu_codons = ['PAD'] + sorted(u_codons)

In [4]:
def encode_seq(seq_obj, seqtype='dna'):
    """Convert a sequence into a NumPy array of vocabulary indices.

    With ``seqtype='dna'`` the sequence is cut into codons that are looked up
    in ``lu_codons``; any other ``seqtype`` treats each character as one
    symbol and looks it up in ``lu_aas``.
    """
    if seqtype == 'dna':
        vocabulary, tokens = lu_codons, get_list_of_codons(seq_obj)
    else:
        vocabulary, tokens = lu_aas, list(seq_obj)
    return np.array([vocabulary.index(token) for token in tokens])

test_aa = 'MENILD0'
test_nuc = 'AAAAAAATAAGATAGPAD'

# Spot-check the first and last encoded symbol of each test sequence.
aa_codes = encode_seq(test_aa, seqtype='prot')
assert aa_codes[0] == lu_aas.index(test_aa[0])
assert aa_codes[-1] == lu_aas.index(test_aa[-1])

nuc_codes = encode_seq(test_nuc, seqtype='dna')
assert nuc_codes[0] == lu_codons.index(test_nuc[0:3])
assert nuc_codes[-1] == lu_codons.index(test_nuc[-3:])

def decode_seq(num_array, seqtype='dna'):
    """Turn an iterable of vocabulary indices back into a single string.

    Indices are looked up in ``lu_codons`` when ``seqtype='dna'`` and in
    ``lu_aas`` otherwise; the resulting symbols are concatenated.
    """
    vocabulary = lu_codons if seqtype == 'dna' else lu_aas
    return ''.join(vocabulary[index] for index in num_array)

# Round trip: decoding an encoded sequence must reproduce the original string.
for sequence, kind in ((test_nuc, 'dna'), (test_aa, 'prot')):
    assert decode_seq(encode_seq(sequence, seqtype=kind), seqtype=kind) == sequence

#### Pytorch padding

Think of each sequence as a stack of one-hot vectors,
so that its dimensions are (seq_len, num_characters).

Given a dictionary of 300 characters, we'll
pad 3 sequences of lengths 25, 22, and 15 together.
The final object will have the length of the longest sequence (25),
a batch size of 3 (the number of sequences), and a dictionary
size of 300 (the number of possible characters in our sequences).

In [5]:
# Three all-ones "sequences" of different lengths over a 300-symbol dictionary.
a, b, c = (torch.ones(seq_len, 300) for seq_len in (25, 22, 15))
# pad_sequence zero-pads the shorter sequences up to the longest one and
# stacks them along a new batch axis.
d = pad_sequence([a, b, c])
d.size()

torch.Size([25, 3, 300])

Now that you understand the gist of padding, here we create a custom `Dataset` object that one-hot encodes each protein and its codons, together with a collate function that pads the sequences of every batch.

In [6]:
# https://pytorch.org/tutorials/beginner/data_loading_tutorial.html
class ProteinsSet(Dataset):
    """Paired protein / codon sequences served as one-hot encoded tensors.

    Item ``i`` is the tuple ``(protein_one_hot, codon_one_hot)`` built from
    ``list_of_proteins[i]`` and ``list_of_codons[i]``.

    Args:
        list_of_proteins: indexable collection of amino-acid strings; every
            character is one symbol.
        list_of_codons: indexable collection of codon lists; every list
            element is one symbol.
        codon_list: codon vocabulary; a symbol's position in this list is its
            one-hot column.
        aa_list: amino-acid vocabulary; a symbol's position in this list is
            its one-hot column.
    """

    def __init__(self, list_of_proteins, list_of_codons, codon_list, aa_list):
        self.prot_collection = list_of_proteins
        self.codon_collection = list_of_codons
        self.lu_codons = codon_list
        self.lu_aas = aa_list

    def __len__(self):
        """Return the number of protein entries."""
        return len(self.prot_collection)

    def __getitem__(self, idx):
        """Return ``(protein_one_hot, codon_one_hot)`` for entry ``idx``.

        A tensor index is converted to plain Python first.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        encd_prot = self.__encode__(self.prot_collection[idx], 'prot')
        encd_codn = self.__encode__(self.codon_collection[idx], 'dna')
        return (encd_prot, encd_codn)

    def __encode__(self, seq_obj, seqtype):
        """One-hot encode ``seq_obj`` as a ``(len(symbols), len(vocabulary))`` tensor.

        Args:
            seq_obj: for ``seqtype == 'dna'`` a sequence whose elements are
                the symbols (codons); otherwise an iterable that is split
                into its individual characters.
            seqtype: ``'dna'`` selects the codon vocabulary, anything else
                the amino-acid vocabulary.

        Returns:
            A tensor of zeros with a single 1 per row, in the column given by
            the symbol's position in the vocabulary.

        Raises:
            ValueError: if a symbol does not occur in the vocabulary.
        """
        if seqtype == 'dna':
            vocabulary, symbols = self.lu_codons, seq_obj
        else:
            vocabulary, symbols = self.lu_aas, list(seq_obj)

        # Build the symbol -> column table once per call instead of scanning
        # the vocabulary for every symbol. setdefault keeps the first
        # occurrence, matching what list.index would return.
        column_of = {}
        for column, symbol in enumerate(vocabulary):
            column_of.setdefault(symbol, column)

        try:
            columns = [column_of[symbol] for symbol in symbols]
        except KeyError as missing:
            # Same exception type that list.index raises for an unknown symbol.
            raise ValueError(
                f"{missing.args[0]!r} is not in the {seqtype!r} vocabulary"
            ) from None

        one_hot = torch.zeros(len(symbols), len(vocabulary))
        if columns:
            # One vectorised write instead of one tensor assignment per symbol.
            one_hot[torch.arange(len(columns)), torch.tensor(columns)] = 1
        return one_hot

def pad_collate(batch):
    """Collate ``(x, y)`` pairs into two zero-padded, time-major batches.

    Returns ``(xx_pad, yy_pad, x_lens, y_lens)``: the padded x and y tensors
    followed by the original (unpadded) length of every x and every y.
    """
    # Transpose the list of pairs into one list of x's and one list of y's:
    # https://stackoverflow.com/a/8081590
    xx, yy = (list(column) for column in zip(*batch))
    x_lens = list(map(len, xx))
    y_lens = list(map(len, yy))
    padded = [
        pad_sequence(sequences, batch_first=False, padding_value=0)
        for sequences in (xx, yy)
    ]
    return padded[0], padded[1], x_lens, y_lens

batch_size = 16
ps = ProteinsSet(host_proteins, host_codons, lu_codons, lu_aas)
# Batches are drawn in shuffled order and assembled by pad_collate.
data_loader = DataLoader(ps, batch_size=batch_size, shuffle=True, collate_fn=pad_collate)

In [7]:
# Draw a single batch from the loader and report its dimensions.
x_padded, y_padded, x_lengths, y_lengths = next(iter(data_loader))
for label, value in (
    ('size of X training data', x_padded.shape),
    ('size of Y training data', y_padded.shape),
    ('Max sequence lengths', max(x_lengths)),
    ('Max sequence lengths', max(y_lengths)),
):
    print(label, value)
# The padded time dimension must equal the longest sequence in the batch.
assert x_padded.shape[0] == max(x_lengths)

size of X training data torch.Size([501, 16, 22])
size of Y training data torch.Size([501, 16, 65])
Max sequence lengths 501
Max sequence lengths 501


Here we load a single batch. 

The first dimension is the length of the longest sequence.
The batch size (second dimension) is 16 sequences. 
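The third dimension is the dictionary length (22 amino-acid symbols or 65 codon symbols; both counts include the placeholder symbol at index 0, and the amino-acid count also includes the `*` stop marker).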

The x and y lengths objects are lists of the sequence lengths so that the model knows where the padding begins for each training example. The maximum sequence lengths match the first dimension of the training data, which is what we expect.

#### Packing your padded sequences

Packing is done for computational efficiency. By packing the batch of sequences, the RNN doesn't have to operate on so many uninformative padding symbols. There is a very nice visualization and motivation in this [Stack Overflow answer](https://stackoverflow.com/a/56211056). 

In [8]:
embedding_dim = len(lu_aas)   # GRU input size: one feature per amino-acid symbol
h_dim = len(lu_codons)        # GRU hidden size: one unit per codon symbol
n_layers = 2
rnn = nn.GRU(embedding_dim, h_dim, n_layers, batch_first=False)

# Pack for efficiency
x_packed = pack_padded_sequence(x_padded,
                                x_lengths,
                                batch_first=False,
                                enforce_sorted=False)
# Push through RNN
# Size the initial hidden state from the batch that was actually loaded
# (dimension 1 of the time-major input) rather than the configured
# batch_size: a batch can hold fewer than batch_size sequences, and the GRU
# rejects a hidden state whose batch dimension does not match its input.
h0 = torch.zeros(n_layers, x_padded.shape[1], h_dim)
output_packed, h1 = rnn(x_packed, h0)

# Then unpack
output_padded, output_lengths = pad_packed_sequence(output_packed,
                                                    batch_first=False)

In [9]:
# Dimensions of the unpacked GRU output.
output_padded.shape

torch.Size([501, 16, 65])

In [10]:
# Sequence length (dim 0) and batch size (dim 1) must be unchanged by the GRU.
for axis in (0, 1):
    assert x_padded.shape[axis] == output_padded.shape[axis]

Notice that the output object has the same sequence length and batch size (16) as the input; the only difference is the last dimension, which is now the GRU hidden size, set here to the number of codon symbols (65).