In [1]:
%%time
#@title 1. installation
#@markdown
!pip install biopython
!pip install fair-esm
from Bio import SeqIO
import torch
import esm
import pandas as pd
from tqdm import tqdm
import math
from google.colab import files
import os

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83
Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fair-esm
Successfully installed fair-esm-2.0.0
CPU times: user 1.82 s, sys: 337 ms, total: 2.16 s
Wall time: 27.1 s


[Evolutionary Scale Modeling (ESM)](https://github.com/facebookresearch/esm?tab=readme-ov-file#pre-trained-models-) is a set of protein language models that differ in number of layers, number of parameters, size on disk (in GB), etc. Only the models that fit within the 16 GB of VRAM of the Tesla T4 GPU are listed in the following table. Choose a model based on your requirements. Bigger models tend to perform better, but are slower to compute.

| model |  layers | parameters |  size (GB)|
|------------------------------|----|----------|---|
| `esm2_t36_3B_UR50D`          | 36 | 3B      | 5.3|
| `esm2_t33_650M_UR50D`        | 33 | 650M    | 2.4|
| `esm2_t30_150M_UR50D`        | 30 | 150M    | 0.56|
| `esm2_t12_35M_UR50D`         | 12 | 35M     | 0.12|
| `esm2_t6_8M_UR50D`           | 6  | 8M      | 0.02|
| `esm1b_t33_650M_UR50S`      | 33  | 650M      | 7.3|
| `esm1_t34_670M_UR50S`      | 34 | 670M      | 6.2|
| `esm1_t34_670M_UR50D`         | 34  | 670M      | 6.2|
| `esm1_t12_85M_UR50S`         | 12  | 85M      | 0.97|
| `esm1_t6_43M_UR50S`         | 6  | 43M      | 0.48|


In [2]:
%%time
#@title Choose a model
#@markdown

# Load ESM-2 model
protein_language_model  =  "esm2_t33_650M_UR50D" #@param ["esm2_t36_3B_UR50D","esm2_t33_650M_UR50D","esm2_t30_150M_UR50D","esm2_t12_35M_UR50D","esm2_t6_8M_UR50D","esm1b_t33_650M_UR50S","esm1_t34_670M_UR50S","esm1_t34_670M_UR50D","esm1_t12_85M_UR50S","esm1_t6_43M_UR50S"]

# Names of the pretrained checkpoints this notebook supports. Each one is also
# the name of a loader function in `esm.pretrained`.
SUPPORTED_MODELS = (
    "esm2_t36_3B_UR50D",
    "esm2_t33_650M_UR50D",
    "esm2_t30_150M_UR50D",
    "esm2_t12_35M_UR50D",
    "esm2_t6_8M_UR50D",
    "esm1b_t33_650M_UR50S",
    "esm1_t34_670M_UR50S",
    "esm1_t34_670M_UR50D",
    "esm1_t12_85M_UR50S",
    "esm1_t6_43M_UR50S",
)

# Fail early with a clear message: without this check an unexpected name would
# leave `model` undefined and surface later as an unrelated NameError.
if protein_language_model not in SUPPORTED_MODELS:
    raise ValueError(
        f"Unsupported model {protein_language_model!r}. "
        f"Choose one of: {', '.join(SUPPORTED_MODELS)}"
    )

# Look the loader up by name; it returns the network and its token alphabet.
model, alphabet = getattr(esm.pretrained, protein_language_model)()

# The model is only used for inference, so switch it to evaluation mode
# (the ESM quick-start does this to disable dropout for deterministic results).
model.eval()

model = model.cuda()  # Move the model to the GPU
batch_converter = alphabet.get_batch_converter()

Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t33_650M_UR50D-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D-contact-regression.pt


CPU times: user 10.9 s, sys: 8.37 s, total: 19.3 s
Wall time: 31.4 s


In [3]:
%%time
#@title ##Upload your sequences (fasta format)

# Ask the user for a FASTA file. `files.upload()` returns a mapping keyed by
# the uploaded file names; it is empty when the dialog is cancelled.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and select a FASTA file.")

# Only the first uploaded file is used. It is stored under a fixed name so the
# rest of the notebook does not depend on what the user called it.
file_name = next(iter(uploaded))
fasta_file = "seqs.fasta"
os.replace(file_name, fasta_file)

# Parse every record into an (identifier, sequence string) pair.
sequences = [(seq_record.id, str(seq_record.seq)) for seq_record in SeqIO.parse(fasta_file, "fasta")]

# A file that is not FASTA parses to zero records; stop here instead of
# silently producing an empty result further down.
if not sequences:
    raise ValueError(f"No FASTA records were found in {file_name!r}. Is the file in FASTA format?")

Saving seqs.fasta to seqs.fasta
CPU times: user 59.1 ms, sys: 13.6 ms, total: 72.7 ms
Wall time: 7.47 s


In [5]:
%%time
#@title ## Generate embeddings
# Number of sequences sent through the model per forward pass. Lower this if
# the GPU runs out of memory (e.g. with long sequences or the larger models).
BATCH_SIZE = 100

# Index of the last transformer layer of each supported model; the embeddings
# are read from that layer. Built once, outside the batch loop.
num_layers = {
    "esm2_t36_3B_UR50D": 36,
    "esm2_t33_650M_UR50D": 33,
    "esm2_t30_150M_UR50D": 30,
    "esm2_t12_35M_UR50D": 12,
    "esm2_t6_8M_UR50D": 6,
    "esm1b_t33_650M_UR50S": 33,
    "esm1_t34_670M_UR50S": 34,
    "esm1_t34_670M_UR50D": 34,
    "esm1_t12_85M_UR50S": 12,
    "esm1_t6_43M_UR50S": 6,
}
if protein_language_model not in num_layers:
    raise ValueError(f"No layer count is known for model {protein_language_model!r}")
layer = num_layers[protein_language_model]

# Send the tokens to whichever device the model weights live on.
device = next(model.parameters()).device

# The batch converter adds special tokens around each sequence, and which ones
# depends on the alphabet: ESM-1b/ESM-2 add both a start and an end token,
# whereas the ESM-1 alphabets add only a start token. Reading the flags from
# the alphabet keeps the pooling window on the residues for every model
# (a fixed `1:len-1` slice drops the last residue when there is no end token).
num_leading_special = int(alphabet.prepend_bos)
num_trailing_special = int(alphabet.append_eos)

# Number of batches, used only to size the progress bar.
num_sequences = len(sequences)
num_batches = math.ceil(num_sequences / BATCH_SIZE)

# One mean-pooled embedding (a 1-D CPU tensor) per input sequence, in input order.
all_sequence_representations = []

# The context manager closes the progress bar even if a batch fails.
with tqdm(total=num_batches, desc="Generating embeddings") as pbar:
    for i in range(0, num_sequences, BATCH_SIZE):
        # Slice out the next batch of (identifier, sequence) pairs.
        batch_data = sequences[i:i + BATCH_SIZE]

        # Tokenize and pad the batch, then count the non-padding tokens per row.
        batch_labels, batch_strs, batch_tokens = batch_converter(batch_data)
        batch_tokens = batch_tokens.to(device)
        batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)

        # Forward pass without gradient tracking; keep only the chosen layer.
        with torch.no_grad():
            results = model(batch_tokens, repr_layers=[layer], return_contacts=False)
        token_representations = results["representations"][layer]

        # Average the per-residue vectors of each sequence, skipping the
        # special tokens and the padding. Results are moved to the CPU right
        # away so GPU memory does not grow with the number of sequences.
        for j, tokens_len in enumerate(batch_lens.tolist()):
            residue_representations = token_representations[
                j, num_leading_special:tokens_len - num_trailing_special
            ]
            all_sequence_representations.append(residue_representations.mean(0).cpu())

        pbar.update(1)


Generating embeddings ...:   0%|          | 0/1 [00:22<?, ?it/s]

Generating embeddings ...: 100%|██████████| 1/1 [00:03<00:00,  3.00s/it]

CPU times: user 2.07 s, sys: 369 ms, total: 2.43 s
Wall time: 3.01 s





In [9]:
#@title ## Download results
# Turn every per-sequence embedding tensor into a NumPy array (copied to host
# memory first, in case it still lives on the GPU).
embedding_arrays = [tensor.cpu().numpy() for tensor in all_sequence_representations]

# One row per input record: identifier, sequence string, and embedding vector.
df = pd.DataFrame(sequences, columns=['protein', 'sequence'])
df["embeddings"] = embedding_arrays

# Write the table to disk as a pickle and trigger a browser download of it.
output_path = "embeddings.pkl"
df.to_pickle(output_path)
files.download(output_path)

print("Your results looks like this. Load it with pandas using read_pickle")
df

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Your results looks like this. Load it with pandas read_pickle


Unnamed: 0,protein,sequence,embeddings
0,A0A009H0S0|PF00144(35...386)|Beta-lactamase-re...,MIKEILLADTHNYHGILDERFIDLAHQFSRLQDARTGQGGAALAVY...,"[0.015497196, -0.03650023, 0.0043984624, -0.00..."
1,A0A009H9I3|PF00144(36...378)|Beta-lactamase,MRFKKISCLLLPPLFIFSTSIYAGNTPKDREIKKLVDQNFKPLLDK...,"[0.020373486, -0.042232603, -0.07505516, 0.037..."
2,A0A009HSX7|PF00144(19...359)|Beta-lactamase-re...,MSEQQVQKIWKSIESLYKTGNYPLITFCLRRQGKILLNRSIGYAQG...,"[0.0051254537, 0.0006607542, 0.008669225, -0.0..."
3,A0A009I2X1|PF00144(35...386)|Beta-lactamase-re...,MIKEILLADTHNYHGILDERFIDLAHQFSRLQDARTGQGGAALAVY...,"[0.01897145, -0.046806958, -0.00091308745, 0.0..."
4,A0A009I909|PF00144(53...390)|Beta-lactamase-re...,MKIFSTNTCPVPDNIEQVIRQKDEVAAEQGGMSDHQIQKIWKSIEG...,"[0.0039020162, -0.015960833, 0.0074622002, 0.0..."
5,A0A009I9X2|PF00144(36...379)|Beta-lactamase,MRFKKISYLLLPSLFIFNTSIYAGNTSKDQEIKQLIDQNFKPLLEK...,"[0.031339962, -0.046714075, -0.07487045, 0.026..."
6,A0A009MI74|PF00144(36...380)|Beta-lactamase,MRFKKISCLLLPPLFIFSTSIYAGNTPKEQEVKKLVDQNFKPLLDK...,"[0.025961297, -0.046581198, -0.082172334, 0.03..."
7,A0A009NBF9|PF00144(35...386)|Beta-lactamase-re...,MIKEILLADTHNYHGILDERFIDLAHQFSRLQDARTGQGGAALAVY...,"[0.0074801487, -0.064207666, -0.030611333, 0.0..."
