# Testing UniRef Model

In [15]:
import torch
from transformers import T5EncoderModel, T5Tokenizer
import re
import numpy as np

# Load model
tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50", do_lower_case=False )

In [16]:
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_uniref50")

Some weights of the model checkpoint at Rostlab/prot_t5_xl_uniref50 were not used when initializing T5EncoderModel: ['decoder.block.2.layer.0.layer_norm.weight', 'decoder.block.15.layer.1.EncDecAttention.o.weight', 'decoder.block.7.layer.0.SelfAttention.q.weight', 'decoder.block.15.layer.2.DenseReluDense.wo.weight', 'decoder.block.1.layer.1.layer_norm.weight', 'decoder.block.13.layer.2.DenseReluDense.wi.weight', 'decoder.block.16.layer.0.SelfAttention.q.weight', 'decoder.block.18.layer.1.EncDecAttention.v.weight', 'decoder.block.16.layer.1.layer_norm.weight', 'decoder.block.13.layer.1.EncDecAttention.q.weight', 'decoder.block.10.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.layer_norm.weight', 'decoder.block.4.layer.0.SelfAttention.k.weight', 'decoder.block.19.layer.2.layer_norm.weight', 'decoder.block.21.layer.1.EncDecAttention.k.weight', 'decoder.block.22.layer.2.layer_norm.weight', 'decoder.block.8.layer.1.layer_norm.weight', 'decoder.block.22.layer.1.EncDecAttention.

In [8]:
# Load into GPU if available
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
model = model.eval()

In [54]:
# Tokenize, encode, and load example sequences
ex_seq = ["A E T C Z A O","S K T Z P"]
ex_seq = [re.sub(r"[UZOB]", "X", sequence) for sequence in ex_seq]
ids = tokenizer.batch_encode_plus(ex_seq, add_special_tokens=True, padding=True)
input_ids = torch.tensor(ids['input_ids']).to(device)
attention_mask = torch.tensor(ids['attention_mask']).to(device)

In [55]:
# Extract sequence features
with torch.no_grad():
    embedding = model(input_ids=input_ids,attention_mask=attention_mask)
embedding = embedding.last_hidden_state.cpu().numpy()

In [56]:
# Remove padding and special tokens
features = [] 
for seq_num in range(len(embedding)):
    seq_len = (attention_mask[seq_num] == 1).sum()
    seq_emd = embedding[seq_num][:seq_len-1]
    features.append(seq_emd)

In [57]:
features

[array([[ 0.17167206, -0.14080106, -0.20465545, ...,  0.14517064,
          0.1475329 , -0.07327209],
        [ 0.09367212, -0.1169605 , -0.299749  , ...,  0.10004929,
         -0.22623877,  0.2256127 ],
        [ 0.19351344, -0.0951817 , -0.2920989 , ...,  0.0669644 ,
          0.03064488,  0.13165715],
        ...,
        [ 0.49161187,  0.00098973, -0.18922974, ...,  0.0872689 ,
          0.49303597,  0.14975198],
        [ 0.07876214, -0.07448181, -0.19791882, ..., -0.07003238,
          0.09184474, -0.24240708],
        [ 0.45733184,  0.00434612, -0.2867953 , ..., -0.00817765,
          0.287747  , -0.0178025 ]], dtype=float32),
 array([[ 0.16555811, -0.0929713 , -0.2260613 , ..., -0.07201447,
         -0.11815782,  0.15539673],
        [ 0.11265312, -0.12298614, -0.1173238 , ...,  0.05776755,
         -0.30057937,  0.19830096],
        [ 0.30789432, -0.10488601, -0.1604077 , ..., -0.06531572,
          0.10468011, -0.07694177],
        [ 0.3379021 , -0.20987576, -0.28793383, ...,

# Trying Real Protein Sequences

In [66]:
from Bio import SeqIO

# Use biopython to parse fasta file and append sequences to list
sequences = []
with open("sequence.fasta") as file:
    for seq in SeqIO.parse(file, 'fasta'):
        sequences.append(str(seq.seq))
sequences = sequences[0:10]
len(sequences)

10

In [67]:
# Tokenize, encode, pad
ids = tokenizer.batch_encode_plus(sequences, add_special_tokens=True, padding=True)
input_ids = torch.tensor(ids['input_ids']).to(device)
attention_mask = torch.tensor(ids['attention_mask']).to(device)

# Extract sequence features
with torch.no_grad():
    embedding = model(input_ids=input_ids,attention_mask=attention_mask)  #decoder_input_ids for decoder weights
embedding = embedding.last_hidden_state.cpu().numpy()

Not entirely sure what the cell below does, but it reduces array of 2 vectors for each protein sequence down to 1

In [68]:
# Remove padding and special tokens
features = [] 
for seq_num in range(len(embedding)):
    seq_len = (attention_mask[seq_num] == 1).sum()
    seq_emd = embedding[seq_num][:seq_len-1]
    features.append(seq_emd)

In [96]:
# Find cosine similarity between all embedding vectors
cos_matrix = np.zeros((len(features), len(features)))
for i, vec1 in enumerate(features):
    for j, vec2 in enumerate(vec1):
        cos_matrix[i][j] = np.dot(vec1,vec2)/(np.linalg.norm(vec1)*np.linalg.norm(vec2))
cos_matrix

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])