In [2]:
# Import dependencies and check whether GPU is available. { display-mode: "form" }
from transformers import T5EncoderModel, T5Tokenizer
import torch.nn.functional as F
import torch.nn as nn
import torch
import h5py
import time
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("Using {}".format(device))

Using cpu


In [2]:
def get_T5_model():
    model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc")
    model = model.to(device) # move model to GPU
    model = model.eval() # set model to evaluation model
    tokenizer = T5Tokenizer.from_pretrained('Rostlab/prot_t5_xl_half_uniref50-enc', do_lower_case=False)

    return model, tokenizer

In [1]:
import re

def preprocess_sequences(seqs):
    # this will replace all rare/ambiguous amino acids by X and introduce white-space between all amino acids
    return [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in seqs]


In [3]:
model, tokenizer = get_T5_model()
seqs = ['MLRNLLALRQIAQRTISTTSRRHFENKVPEKQKLFQEDNGMPVHLKGGASDALLYRATMA']

preprocess_sequences(seqs)
batch = list()

# add_special_tokens adds extra token at the end of each sequence
token_encoding = tokenizer.batch_encode_plus(seqs, add_special_tokens=True, padding="longest")
input_ids      = torch.tensor(token_encoding['input_ids']).to(device)
attention_mask = torch.tensor(token_encoding['attention_mask']).to(device)

with torch.no_grad():
    # returns: ( batch-size x max_seq_len_in_minibatch x embedding_dim )
    embedding_repr = model(input_ids, attention_mask=attention_mask)

print(embedding_repr)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[-0.0041, -0.1437, -0.0393,  ..., -0.0441, -0.0310, -0.0139],
         [-0.0003, -0.0969, -0.0192,  ..., -0.0392, -0.0178,  0.0131]]]), past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)


In [None]:
class ProteinEmbed(nn.Module):
    def __init__(self, **kwargs) -> None:
        super(ProteinEmbed, self).__init__()
        self.encoder, self.tokenizer = get_T5_model()
        self.decoder = torch.nn.Transformer(**kwargs)
        self.freeze_encoder()

    def freeze_encoder(self):
        for p in self.encoder.parameters():
            p.requires_grad = False

    def forward(self, input, hidden):
        emb = self.encoder(input)
        output, hidden = self.rnn(emb, hidden)
        decoded = self.decoder(output)
        decoded = decoded.view(-1, self.ntoken)
        return F.log_softmax(decoded, dim=1), hidden

In [None]:
from torch.utils.data import Dataset

class ProteinDataset(Dataset):
    def __init__(self) -> None:
        ... 
    

    def __getitem__(self, index) -> Any:
        ...

    def __len__(self):
        ...