## Using esm1b (language model) embeddings as input for classifier 
## github: https://github.com/facebookresearch/esm

In [3]:
import esm # this is a module already in the SE3 kernel on digs, but the github link has the information to pip install esm 


In [4]:
import numpy as np
import pandas as pd

In [5]:
import torch 

# Load ESM-1b model
model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
batch_converter = alphabet.get_batch_converter() 
model.eval()  # inference mode 

ProteinBertModel(
  (embed_tokens): Embedding(33, 1280, padding_idx=1)
  (layers): ModuleList(
    (0): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
      )
      (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      (fc1): Linear(in_features=1280, out_features=5120, bias=True)
      (fc2): Linear(in_features=5120, out_features=1280, bias=True)
      (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    )
    (1): TransformerLayer(
      (self_attn): MultiheadAttention(
        (k_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
        (q_proj):

## single sequence embedding

In [6]:
# input: SINGLE sequence, LM_model (language model), lm_converter (data processor for the language model)
def get_ESM1b_embedding(input_sequence, LM_model = model, lm_converter = batch_converter ):
    data_ = [("prot", input_sequence.upper())] # process the data
    label, _, seq = lm_converter(data_)
    #print('thing1',thing1,'thing2',thing2,'seq',seq)
    print(seq.shape)
    B, L = seq.shape
    L = L - 2 # remove start and end token
    LM_model.eval()
    with torch.no_grad():
        output = LM_model(seq, repr_layers=[33], return_contacts=True) # get the output from the language model
        embedding = output['representations'][33][:,1:-1,:] # embedding size (1, L, 1280)
        attention_map = output['attentions'][:,:,:,1:-1,1:-1] # attention map size (1, 33, 20, L, L)
        attention_map = attention_map.reshape(B, 33*20, L, L).permute(0,2,3,1) # (1, L, L, 660)
        return embedding,attention_map

In [7]:
example_sequence = 'RQLALEAKGETPSAVTRLSVVAKSEPQDEQSRSQSPRRIILSRLKAGEVDLLEEELGHLTTLTDVVKGADSLSAILPGDIAEDDITAVLCFVIEADQITFETVEVSPKISTPPVLKLAAEQAPTGRVEREKTTR'



In [8]:
example_embedding, example_attention_map = get_ESM1b_embedding(example_sequence)

torch.Size([1, 136])


In [9]:
example_embedding.shape, example_attention_map.shape

(torch.Size([1, 134, 1280]), torch.Size([1, 134, 134, 660]))

# getting embeddings of a batch of sequences

In [1]:
# the batch of sequences has to be formatted in this manner: 
# batch_sequences = [(protein_name_1,protein_sequence_1),(protein_name_2,protein_sequence_2),(protein_name_3,protein_sequence_3)]
train_data = np.load('/home/achazing/other/classes/data_science_class/ThermoDrift/data/train_tuple_data.pkl', allow_pickle=True)
train_data = np.array(train_data)

#split training data into batches of size ___
train_data_batches = np.split(train_data, 4)
batch_sequences = train_data_batches[0]


NameError: name 'np' is not defined

In [11]:
# batch_sequences = [('prot_1','RQLALEAKGETPSAVTRLSVVAKSEPQDEQSRSQSPRRIIL'),
#                  ('prot_2','SAILPGDIAEDDITAVLCFVIEADQITFETVEVSPKISTPPVLKLAAEQAPTGRVEREKTTR'),
#                  ('prot_3','SQSPRRIILSRLKAGEVDLLEEELGHLTTLTDVVKGADSLSAIL')]

In [12]:
# input: batch sequence, LM_model (language model), lm_converter (data processor for the language model)
def get_ESM1b_embedding_batch(sequence_batch, LM_model = model, lm_converter = batch_converter, average = False ):
    labels, _, seq = lm_converter(sequence_batch)
    print(len(seq[1]))
    #print(labels)
    B, L = seq.shape
    L = L - 2 # remove start and end token
    LM_model.eval()
    with torch.no_grad():
        output = LM_model(seq, repr_layers=[33], return_contacts=True) # get the output from the language model
        embedding = output['representations'][33][:,1:-1,:] # embedding size (1, L, 1280)
        attention_map = output['attentions'][:,:,:,1:-1,1:-1] # attention map size (1, 33, 20, L, L)
        attention_map = attention_map.reshape(B, 33*20, L, L).permute(0,2,3,1) # (1, L, L, 660)
        
        # if you wanna average the embeddings along the sequence dimension -- i think this could be really cool too
        if (average): 
            embedding = embedding.mean(1)
            
        return embedding,attention_map
    
    

In [None]:
batch_embedding, batch_attention_map = get_ESM1b_embedding_batch(batch_sequences)
batch_embedding_average, batch_attention_map = get_ESM1b_embedding_batch(batch_sequences, average=True)

502


In [None]:
batch_embedding.shape, batch_attention_map.shape, batch_embedding_average.shape

## so now when you are loading in the sequences for the model 
## get them in the batches format, where input_data = [(name1, sequence1),(name2,sequence2),(name3,sequence3)]
## pass that into the get_ESM1b_embedding_batch and get back the embedding
## I really like the idea of averaging the embedding along the sequence dimension -- that way you would have number_of_sequence x embedding_dimension vector (2D instead of 3D)
## not sure how to exactly use attention maps -- but y'all can figure that out 
