In [None]:
%%time
#@title 1. installation
#@markdown
!pip install biopython
!pip install fair-esm
from Bio import SeqIO
import torch
import esm
import pandas as pd
from tqdm import tqdm
import math
from google.colab import files
import os

[Evolutionary Scale Modeling (esm)](https://github.com/facebookresearch/esm?tab=readme-ov-file#pre-trained-models-) is a set of protein language models that differ in number of layers, number of parameters, size (in GB), etc. Only the models that fit within the 16 GB of VRAM of the Tesla T4 GPU are listed in the following table. Choose a model based on your requirements: bigger models tend to perform better, but are slower to compute.

| model |  layers | parameters |  size (GB)|
|------------------------------|----|----------|---|
| `esm2_t36_3B_UR50D`          | 36 | 3B      | 5.3|
| `esm2_t33_650M_UR50D`        | 33 | 650M    | 2.4|
| `esm2_t30_150M_UR50D`        | 30 | 150M    | 0.56|
| `esm2_t12_35M_UR50D`         | 12 | 35M     | 0.12|
| `esm2_t6_8M_UR50D`           | 6  | 8M      | 0.02|
| `esm1b_t33_650M_UR50S`      | 33  | 650M      | 7.3|
| `esm1_t34_670M_UR50S`      | 34 | 670M      | 6.2|
| `esm1_t34_670M_UR50D`         | 34  | 670M      | 6.2|
| `esm1_t12_85M_UR50S`         | 12  | 85M      | 0.97|
| `esm1_t6_43M_UR50S`         | 6  | 43M      | 0.48|


In [None]:
%%time
#@title Choose a model
#@markdown

# Load the selected ESM protein language model and its alphabet.
protein_language_model  =  "esm2_t33_650M_UR50D" #@param ["esm2_t36_3B_UR50D","esm2_t33_650M_UR50D","esm2_t30_150M_UR50D","esm2_t12_35M_UR50D","esm2_t6_8M_UR50D","esm1b_t33_650M_UR50S","esm1_t34_670M_UR50S","esm1_t34_670M_UR50D","esm1_t12_85M_UR50S","esm1_t6_43M_UR50S"]

# Checkpoints this notebook supports; each name is also the name of the loader
# function exposed by `esm.pretrained`.
supported_models = (
    "esm2_t36_3B_UR50D",
    "esm2_t33_650M_UR50D",
    "esm2_t30_150M_UR50D",
    "esm2_t12_35M_UR50D",
    "esm2_t6_8M_UR50D",
    "esm1b_t33_650M_UR50S",
    "esm1_t34_670M_UR50S",
    "esm1_t34_670M_UR50D",
    "esm1_t12_85M_UR50S",
    "esm1_t6_43M_UR50S",
)

# Fail loudly on an unknown name instead of falling through and silently
# reusing a `model` left over from a previous run of this cell.
if protein_language_model not in supported_models:
    raise ValueError(
        f"Unsupported model {protein_language_model!r}; choose one of: "
        + ", ".join(supported_models)
    )

# Look the loader up by name rather than spelling out one branch per model.
model, alphabet = getattr(esm.pretrained, protein_language_model)()

# Switch to inference mode (disables dropout so embeddings are deterministic),
# then move the model to the GPU.
model = model.eval().cuda()
batch_converter = alphabet.get_batch_converter()

In [None]:
%%time
#@title ##Upload your sequences (fasta format)

# Ask the user for a FASTA file, store it under a fixed name, and parse it into
# a list of (sequence id, sequence string) tuples.
uploaded = files.upload()
if not uploaded:
    # A cancelled upload returns an empty mapping; report that explicitly
    # instead of failing with an opaque IndexError below.
    raise ValueError("No file was uploaded; please upload a FASTA file.")

# Only the first uploaded file is used.
file_name = next(iter(uploaded))
fasta_file = "seqs.fasta"
os.rename(file_name, fasta_file)

sequences = [(seq_record.id, str(seq_record.seq)) for seq_record in SeqIO.parse(fasta_file, "fasta")]
if not sequences:
    # SeqIO.parse yields nothing for an empty or non-FASTA file; stop here so
    # the following cells do not silently produce an empty result.
    raise ValueError(f"No FASTA records were found in {file_name!r}.")

In [None]:
%%time
#@title ## Generate embeddings
# Compute one mean-pooled embedding per uploaded sequence, running the model
# over the sequences in fixed-size batches.

# Number of sequences sent through the model per forward pass.
batch_size = 100
num_sequences = len(sequences)
num_batches = math.ceil(num_sequences / batch_size)

# Index of the last layer of each supported checkpoint (the "tNN" part of the
# name). This is loop-invariant, so it is resolved once before batching.
num_layers = {
    "esm2_t36_3B_UR50D": 36,
    "esm2_t33_650M_UR50D": 33,
    "esm2_t30_150M_UR50D": 30,
    "esm2_t12_35M_UR50D": 12,
    "esm2_t6_8M_UR50D": 6,
    "esm1b_t33_650M_UR50S": 33,
    "esm1_t34_670M_UR50S": 34,
    "esm1_t34_670M_UR50D": 34,
    "esm1_t12_85M_UR50S": 12,
    "esm1_t6_43M_UR50S": 6,
}
# Raises KeyError for a model name that is not in the table above.
layer = num_layers[protein_language_model]

# The alphabet decides which special tokens wrap each sequence. Not every
# alphabet appends an EOS token (fair-esm's ESM-1 alphabet sets
# append_eos=False), so a fixed [1:-1] slice would drop the last residue there.
first_residue = 1 if alphabet.prepend_bos else 0
eos_tokens = 1 if alphabet.append_eos else 0

# Make sure dropout is disabled so the embeddings are deterministic.
model.eval()

# One embedding tensor per sequence, in the same order as `sequences`.
all_sequence_representations = []

# The context manager closes the progress bar even if a batch fails.
with tqdm(total=num_batches, desc="Generating embeddings ...") as pbar:
    for i in range(0, num_sequences, batch_size):
        # Current slice of (id, sequence) tuples.
        batch_data = sequences[i:i + batch_size]

        # Tokenize and move the tokens to the GPU alongside the model.
        batch_labels, batch_strs, batch_tokens = batch_converter(batch_data)
        batch_tokens = batch_tokens.cuda()
        # Non-padding token count per sequence (special tokens included).
        batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

        # Per-token representations from the selected layer.
        with torch.no_grad():
            results = model(batch_tokens, repr_layers=[layer], return_contacts=False)
        token_representations = results["representations"][layer]

        # Average over residue positions only, skipping BOS/EOS and padding.
        for j, tokens_len in enumerate(batch_lens.tolist()):
            residue_representations = token_representations[j, first_residue:tokens_len - eos_tokens]
            all_sequence_representations.append(residue_representations.mean(0))

        pbar.update(1)

In [None]:
#@title ## Download results
# Convert each embedding tensor to a NumPy array on the CPU, then build a table
# with one row per sequence: its id, its residues, and its embedding.
embedding_arrays = [tensor.cpu().numpy() for tensor in all_sequence_representations]
df = pd.DataFrame(sequences, columns=['protein', 'sequence']).assign(embeddings=embedding_arrays)

# Write the table to a pickle file and trigger a browser download of it.
output_path = "embeddings.pkl"
df.to_pickle(output_path)
files.download(output_path)