In [1]:
import torch
import sentencepiece
from transformers import T5EncoderModel, T5Tokenizer

In [2]:
import re

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [4]:
tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50")
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_uniref50")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
def get_per_protein_embedding(sequence):
    # this will replace all rare/ambiguous amino acids by X and introduce white-space between all amino acids
    sequence = " ".join(list(re.sub(r"[UZOB]", "X", sequence)))

    # tokenize sequences and pad up to the longest sequence in the batch
    ids = tokenizer.batch_encode_plus(sequence, add_special_tokens=True, padding="longest")
    input_ids = torch.tensor(ids['input_ids']).to(device)
    attention_mask = torch.tensor(ids['attention_mask']).to(device)

    # generate embeddings
    with torch.no_grad():
        embedding_repr = model(input_ids=input_ids,attention_mask=attention_mask)

    # extract embeddings for the first ([0,:]) sequence in the batch while removing padded & special tokens ([0,:7]) 
    emb_0 = embedding_repr.last_hidden_state[0,:7] # shape (7 x 1024)

    # if you want to derive a single representation (per-protein embedding) for the whole protein
    emb_0_per_protein = emb_0.mean(dim=0) # shape (1024)
    return emb_0_per_protein

In [7]:
q = get_per_protein_embedding("MAGNIFICENT")
print(q)
print(q.shape)

tensor([ 0.0357, -0.1899, -0.1827,  ...,  0.0174, -0.2099, -0.0339])
torch.Size([1024])


In [11]:
import json
import torch
from torch import Tensor
from typing import Dict

# Load the records from the JSON file
with open('data/records.json', 'r') as f:
    records = json.load(f)

# Create an empty tensor dictionary to store the embeddings
embedding_dict: Dict[str, Tensor] = {}

# Loop through the records
for record in records:
    sequence = record['sequence']
    record_id = record['id']
    
    # Get the protein embedding for the sequence
    embedding = get_per_protein_embedding(sequence)
    
    # Save the embedding with its ID in the tensor dictionary
    embedding_dict[record_id] = embedding

# Save the tensor dictionary to a file
torch.save(embedding_dict, 'embeddings.pt')

KeyboardInterrupt: 