## Using ESM-1b (protein language model) embeddings as input for a classifier
## GitHub: https://github.com/facebookresearch/esm

In [1]:
import esm # this is a module already in the SE3 kernel on digs, but the github link has the information to pip install esm 


In [2]:
import numpy as np
import pandas as pd
import pickle

In [3]:
# Load the pickled test data into `test_tuple_data`.
# NOTE: the absolute path is specific to the original author's machine; adjust it before re-running.
# NOTE: pickle.load can execute arbitrary code, so only open files from a trusted source.
# Presumably a sequence of (name, sequence) tuples, since slices of it are later passed to
# get_ESM1b_embedding_batch -- TODO confirm against the file contents.
with open('/home/achazing/other/classes/data_science_class/ThermoDrift/data/test_tuple_data.pkl','rb') as file: 
    test_tuple_data = pickle.load(file)

In [4]:
import torch 

# Load the pretrained ESM-1b model together with its alphabet (tokenizer vocabulary).
model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()

# Device placement is left disabled. `device` is only defined in a later cell, so
# enabling these lines here would raise NameError on a fresh Restart & Run All.
# model.to(device=device)
# alphabet.to(device=device)

# Callable that turns a list of (label, sequence) tuples into a padded token tensor;
# it is the default `lm_converter` of the embedding helpers defined below.
batch_converter = alphabet.get_batch_converter() 


# Switch the model to evaluation mode for inference.
model.eval()  # inference mode 


ProteinBertModel(
  (embed_tokens): Embedding(33, 1280, padding_idx=1)
  (layers): ModuleList(
    (0): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
      )
      (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1280, out_features=5120, bias=True)
      (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
    (1): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj):

In [5]:
# Quick check of whether PyTorch can see a CUDA device (the recorded run shows False).
torch.cuda.is_available()

False

In [6]:
# Select the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)
print()

# GPU-only diagnostics: device name plus the current allocator statistics.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    for stat_label, stat_bytes in (('Allocated:', torch.cuda.memory_allocated(0)),
                                   ('Cached:   ', torch.cuda.memory_reserved(0))):
        # Bytes are converted with 1024**3 and rounded to one decimal place.
        print(stat_label, round(stat_bytes/1024**3,1), 'GB')

Using device: cpu



## single sequence embedding

In [7]:
# input: SINGLE sequence, LM_model (language model), lm_converter (data processor for the language model)
def get_ESM1b_embedding(input_sequence, LM_model = model, lm_converter = batch_converter, repr_layer = 33):
    """Embed one protein sequence with the language model.

    Parameters
    ----------
    input_sequence : str
        Amino-acid sequence; it is upper-cased before tokenization.
    LM_model : callable
        Language model, called as ``LM_model(tokens, repr_layers=[...], return_contacts=True)``.
        Defaults to the notebook-level ESM-1b ``model``.
    lm_converter : callable
        Batch converter returning ``(labels, strs, tokens)`` for a list of
        ``(label, sequence)`` tuples. Defaults to the notebook-level ``batch_converter``.
    repr_layer : int, default 33
        Layer whose hidden states are returned (33 is the layer the notebook uses for ESM-1b).

    Returns
    -------
    embedding : torch.Tensor
        Per-residue representations with the first and last token positions
        removed; shape ``(1, L, 1280)`` for ESM-1b.
    attention_map : torch.Tensor
        Attention weights of every layer and head, with the first and last token
        positions removed and layers/heads flattened into the last axis; shape
        ``(1, L, L, 660)`` for ESM-1b.

    NOTE(review): ESM-1b is documented as accepting at most 1022 residues;
    longer inputs presumably fail inside the model -- confirm before feeding
    long sequences.
    """
    data_ = [("prot", input_sequence.upper())]  # single-item batch in (label, sequence) form
    _, _, tokens = lm_converter(data_)
    # Kept from the original: prints the token tensor shape, (1, L + 2) for ESM-1b.
    print(tokens.shape)
    LM_model.eval()
    with torch.no_grad():
        output = LM_model(tokens, repr_layers=[repr_layer], return_contacts=True)  # run the language model
        # Drop the first and last token positions (start / end tokens).
        embedding = output['representations'][repr_layer][:, 1:-1, :]
        attention_map = output['attentions'][:, :, :, 1:-1, 1:-1]  # (1, layers, heads, L, L)
        # Merge the layer and head axes using the tensor's own sizes rather than a
        # hard-coded 33*20, then move that merged axis last: (1, L, L, layers*heads).
        attention_map = attention_map.flatten(1, 2).permute(0, 2, 3, 1)
        return embedding, attention_map

In [8]:
# Example protein sequence for the single-sequence demo
# (134 residues: the recorded run shows 136 tokens including the start and end tokens).
example_sequence = 'RQLALEAKGETPSAVTRLSVVAKSEPQDEQSRSQSPRRIILSRLKAGEVDLLEEELGHLTTLTDVVKGADSLSAILPGDIAEDDITAVLCFVIEADQITFETVEVSPKISTPPVLKLAAEQAPTGRVEREKTTR'



In [9]:
# Embed the example sequence using the default model and batch converter.
example_embedding, example_attention_map = get_ESM1b_embedding(example_sequence)

torch.Size([1, 136])


In [10]:
# Inspect the shapes of the per-residue embedding and the flattened attention map.
example_embedding.shape, example_attention_map.shape

(torch.Size([1, 134, 1280]), torch.Size([1, 134, 134, 660]))

In [11]:
# Scratch check: concatenating two (1, 1280) tensors along dim 0 stacks them row-wise.
test = torch.rand(1,1280)
torch.cat((test,test),dim=0).shape

torch.Size([2, 1280])

## Getting embeddings for a batch of sequences

In [12]:
# The batch of sequences has to be formatted in this manner:
# batch_sequences = [(protein_name_1,protein_sequence_1),(protein_name_2,protein_sequence_2),(protein_name_3,protein_sequence_3)]
# NOTE: allow_pickle=True unpickles the file, which can execute arbitrary code -- only load trusted data.
# NOTE: the absolute path is specific to the original author's machine.
train_data = np.load('/home/achazing/other/classes/data_science_class/ThermoDrift/data/train_tuple_data.pkl', allow_pickle=True)
train_data = np.array(train_data)

# Split the training data into 4 batches (4 is the number of batches, not the batch size).
# np.array_split is used instead of np.split because np.split raises ValueError when the
# number of rows is not divisible by 4; array_split lets the batches differ in size by one row.
train_data_batches = np.array_split(train_data, 4)
batch_sequences = train_data_batches[0]


In [13]:
# Small hand-written batch of (name, sequence) tuples with different lengths, used for the demo below.
# NOTE: this rebinds `batch_sequences`, replacing the training-data batch assigned in the previous cell.
batch_sequences = [('prot_1','RQLALEAKGETPSAVTRLSVVAKSEPQDEQSRSQSPRRIIL'),
                 ('prot_2','SAILPGDIAEDDITAVLCFVIEADQITFETVEVSPKISTPPVLKLAAEQAPTGRVEREKTTR'),
                 ('prot_3','SQSPRRIILSRLKAGEVDLLEEELGHLTTLTDVVKGADSLSAIL')]

In [14]:
# input: batch sequence, LM_model (language model), lm_converter (data processor for the language model)
def get_ESM1b_embedding_batch(sequence_batch, LM_model = model, lm_converter = batch_converter, average = False ):
    """Embed a batch of protein sequences with the language model.

    Parameters
    ----------
    sequence_batch : sequence of (name, sequence) pairs
        e.g. ``[('prot_1', 'RQLAL...'), ('prot_2', 'SAILP...')]``.
    LM_model : callable
        Language model, called as ``LM_model(tokens, repr_layers=[33], return_contacts=True)``.
        Defaults to the notebook-level ESM-1b ``model``.
    lm_converter : callable
        Batch converter returning ``(labels, strs, tokens)``; ``strs`` is assumed
        to hold the raw sequence strings in batch order (as the ESM batch
        converter does -- confirm for any custom converter).
    average : bool, default False
        When True, return one vector per sequence: the mean of that sequence's
        own residue embeddings.

    Returns
    -------
    embedding : torch.Tensor
        ``(B, L, 1280)`` per-position representations, where ``L`` is the padded
        length minus the first and last token positions; or ``(B, 1280)`` when
        ``average`` is True. In the non-averaged output, rows of sequences
        shorter than the longest one still contain their end-token and padding
        positions.
    attention_map : torch.Tensor
        ``(B, L, L, layers*heads)`` attention weights (660 channels for ESM-1b).
    """
    labels, strs, tokens = lm_converter(sequence_batch)
    LM_model.eval()
    with torch.no_grad():
        output = LM_model(tokens, repr_layers=[33], return_contacts=True)  # run the language model
        # Drop the first and last token positions of the padded batch.
        embedding = output['representations'][33][:, 1:-1, :]  # (B, L, 1280)
        attention_map = output['attentions'][:, :, :, 1:-1, 1:-1]  # (B, layers, heads, L, L)
        # Merge layer and head axes from the tensor's own sizes (no hard-coded 33*20),
        # then move the merged axis last: (B, L, L, layers*heads).
        attention_map = attention_map.flatten(1, 2).permute(0, 2, 3, 1)

        # Average along the sequence dimension, one vector per sequence.
        if average:
            # A plain mean over the padded length would mix end-token and padding
            # positions into shorter sequences, making the result depend on which
            # sequences share the batch. Average only over each sequence's residues.
            padded_len = embedding.shape[1]
            lengths = torch.tensor(
                [min(len(seq_str), padded_len) for seq_str in strs],
                device=embedding.device,
            )
            positions = torch.arange(padded_len, device=embedding.device)
            residue_mask = positions[None, :] < lengths[:, None]  # (B, L), True on real residues
            weights = residue_mask.unsqueeze(-1).to(embedding.dtype)  # (B, L, 1)
            # clamp avoids division by zero for an empty sequence (yields a zero vector).
            embedding = (embedding * weights).sum(1) / weights.sum(1).clamp(min=1)

        return embedding, attention_map
    
    

In [15]:
# Per-position embeddings for the demo batch.
batch_embedding, batch_attention_map = get_ESM1b_embedding_batch(batch_sequences)
# Sequence-averaged embeddings. This runs the model a second time and rebinds `batch_attention_map`.
batch_embedding_average, batch_attention_map = get_ESM1b_embedding_batch(batch_sequences, average=True)

In [16]:
# Compare shapes: per-position embedding, attention map, and sequence-averaged embedding.
batch_embedding.shape, batch_attention_map.shape, batch_embedding_average.shape

(torch.Size([3, 62, 1280]),
 torch.Size([3, 62, 62, 660]),
 torch.Size([3, 1280]))

## So now, when you are loading the sequences for the model:
## get them into the batch format, where input_data = [(name1, sequence1), (name2, sequence2), (name3, sequence3)]
## pass that into get_ESM1b_embedding_batch and get back the embedding
## I really like the idea of averaging the embedding along the sequence dimension -- that way you get a number_of_sequences x embedding_dimension matrix (2D instead of 3D)
## I'm not sure exactly how to use the attention maps -- but y'all can figure that out


In [23]:
# Get sequence-averaged embeddings for all the pickled test data.
test_batch_size = 3  # sequences per forward pass
# Seed with an empty float64 (0, 1280) array so the result keeps the original
# shape and dtype, even when there is no data at all.
test_embedding_chunks = [np.empty((0, 1280))]
try:
    for i in range(0, len(test_tuple_data), test_batch_size):
        print(f"i is {i}, and i plus {test_batch_size} is {i+test_batch_size}")
        test_data_embedding_average, test_data_attention_map = get_ESM1b_embedding_batch(
            test_tuple_data[i:i+test_batch_size], average=True
        )
        # Convert explicitly to NumPy instead of relying on implicit tensor coercion.
        test_embedding_chunks.append(test_data_embedding_average.cpu().numpy())
finally:
    # Concatenate once rather than re-copying the growing array on every iteration.
    # Doing it in `finally` keeps the batches finished so far if the run is interrupted.
    test_data_array = np.concatenate(test_embedding_chunks, axis=0)

i is 0, and i plus 3 is 3
i is 3, and i plus 3 is 6
i is 6, and i plus 3 is 9
i is 9, and i plus 3 is 12
i is 12, and i plus 3 is 15
i is 15, and i plus 3 is 18
i is 18, and i plus 3 is 21
i is 21, and i plus 3 is 24
i is 24, and i plus 3 is 27
i is 27, and i plus 3 is 30
i is 30, and i plus 3 is 33
i is 33, and i plus 3 is 36
i is 36, and i plus 3 is 39
i is 39, and i plus 3 is 42


KeyboardInterrupt: 

In [24]:
# Rows = number of sequences embedded so far; columns = 1280 (the embedding width).
test_data_array.shape

(39, 1280)