<a href="https://colab.research.google.com/github/pikanaeri/plm-model-comparison/blob/main/extracting-embeddings/EsmC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Template Code for Extracting Averaged and Final Embeddings for ESM C
- Stores the embeddings as .pkl files
- Final embeddings are stored per family as an array of vectors (one mean-pooled vector per protein)
- Will need the PHROGs annotation table: https://storage.cloud.google.com/plm-model-comparison/PHROG_index.tsv
- Maximum sequence length 5096 (longer proteins are skipped); proteins are embedded one at a time (group size 1)

# Template Code for Extracting Averaged and Final Embeddings for each Model
- Stores the embeddings as .pkl files
- Final embeddings are stored in lists of vectors
- Will need the PHROGs annotation table: https://storage.cloud.google.com/plm-model-comparison/PHROG_index.tsv

## Setting Up Virtual Machine

In [None]:
#@title Tmux Setup (optional; so that your background processes save)
# Refresh the package index quietly, then install tmux without prompting.
sudo apt-get update -qq
sudo apt-get install -y tmux
# Open a tmux session; run the remaining commands inside it.
tmux

In [None]:
#@title Downloading Anaconda
#@markdown * If a popup menu appears, choose the "package maintainer's version" option
#@markdown * A menu containing all of Anaconda's terms of service will appear, go through these and select "yes" to download Anaconda to your computer
#@markdown * An older version of Anaconda (2022) will be downloaded, keep this in mind when checking for compatible packages
# Update the system and install the packages needed before running the installer.
# These commands are interactive (no -y), so answer the prompts as they appear.
sudo apt-get update
sudo apt-get install bzip2 libxml2-dev
sudo apt upgrade

# Fetch the Anaconda3 2022.05 installer, run it, then delete the downloaded script.
sudo apt-get install wget
wget https://repo.anaconda.com/archive/Anaconda3-2022.05-Linux-x86_64.sh
bash Anaconda3-2022.05-Linux-x86_64.sh
rm Anaconda3-2022.05-Linux-x86_64.sh

In [None]:
#@title Allocating Memory to Linux
#@markdown Code from: https://linuxize.com/post/create-a-linux-swap-file/
# Create a 15G swap file, make it readable by root only, format it and enable it.
sudo fallocate -l 15G /swapfile
sudo chmod 600 /swapfile
ls -lh /swapfile
sudo mkswap /swapfile
sudo swapon /swapfile


#@markdown The command below appends "/swapfile swap swap defaults 0 0" to /etc/fstab (skipped if an entry for /swapfile is already there) so the swap file is enabled again after a reboot
# The fstab entry is data, not a command, so it is written with tee instead of
# being left as a bare line in the cell; grep keeps re-runs from duplicating it.
grep -q '^/swapfile ' /etc/fstab || echo '/swapfile swap swap defaults 0 0' | sudo tee -a /etc/fstab

In [None]:
#@title Installing Git and pip
# git is needed for the clone step; python3-pip provides pip3 for the dependency installs.
sudo apt-get install git
sudo apt install python3-pip

# Print the installed versions as a quick sanity check.
python3 --version
pip3 --version

In [None]:
#@title Cloning PHROGs Database; files can also be found under the Fasta Files category at https://phrogs.lmge.uca.fr/
# The embedding code later reads per-family .faa files from the FAA_phrog/ folder of this clone.
git clone https://github.com/pikanaeri/Extracting-3Di-Embeddings-from-Protein-Sequences.git

In [None]:
#@title Installing Dependencies
pip install transformers
pip3 install torch torchvision torchaudio transformers sentencepiece accelerate --extra-index-url https://download.pytorch.org/whl/cu116

# Output folders for the per-protein and per-family-average pickles.
# -p keeps this cell re-runnable: plain mkdir errors out when the folder already exists.
mkdir -p final_embeddings
mkdir -p final_average_embeddings
# Start the Python interpreter; the remaining cells are Python code.
python3

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m82.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.9 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
#@title Installing ESM as a Python dependency
# Provides the esm.models.esmc and esm.sdk.api modules imported in the next cell.
pip install esm

In [None]:
#@title Importing Dependencies
import gc
import os
import pickle as pkl
import re
import time

import numpy as np
import torch
from esm.models.esmc import ESMC
from esm.sdk.api import ESMProtein, LogitsConfig

In [None]:
#@title Creating model and moving it to the GPU if available (CPU otherwise)
# Pick the device at runtime so the notebook also runs on machines without CUDA
# instead of failing in .to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"
client = ESMC.from_pretrained("esmc_600m").to(device)

In [None]:
#@title Embedding Families
def embed_family(Prev_dir, File_Name, max_prot=-1, max_sz=-1):
  """Embed the proteins of one family with ESM C and pickle the results.

  Reads ``Prev_dir + File_Name + ".faa"`` as two-line records (one header line
  followed by one sequence line; reading stops at the first empty header line).
  For every accepted sequence, ``embeddings[0]`` returned by the model is
  averaged over its first axis (presumably the token axis - TODO confirm) to
  give one vector per protein. Two files are written:

  * ``final_embeddings/<File_Name>.pkl`` - array of the per-protein vectors
  * ``final_average_embeddings/<File_Name>_averaged.pkl`` - mean of those vectors

  If no sequence is accepted, a notice is printed and nothing is written.

  Args:
    Prev_dir: Directory prefix of the input file. It is concatenated as-is, so
      it must end with a path separator.
    File_Name: File stem without the ``.faa`` extension; also used to name the
      output files.
    max_prot: Maximum number of proteins to embed; -1 means no limit.
    max_sz: Sequences longer than this are skipped; -1 means no limit.

  Uses the module-level ``client`` model.
  """
  # Create the output folders before any model work so a missing folder cannot
  # discard a finished family.
  os.makedirs("final_embeddings", exist_ok=True)
  os.makedirs("final_average_embeddings", exist_ok=True)

  per_protein = []
  with open(Prev_dir + File_Name + ".faa", "r") as reader:
    # Checking the cap before reading embeds exactly max_prot proteins.
    while max_prot == -1 or len(per_protein) < max_prot:
      name = reader.readline().strip()
      if name == '':
        break
      sequence = reader.readline().strip()
      sequence = sequence.replace('-', 'X')
      #sequence = sequence.replace('J', 'X') unsure if this is necessary-ESM C may support J AA token
      if max_sz != -1 and len(sequence) > max_sz:
        continue
      protein = ESMProtein(sequence=sequence)
      protein_tensor = client.encode(protein)
      logits_output = client.logits(
        protein_tensor, LogitsConfig(sequence=True, return_embeddings=True)
      )
      per_protein.append(np.mean(logits_output.embeddings[0].cpu().numpy(), axis=0))
      # Drop the references first so the cache clean-up below can release them.
      del protein_tensor, logits_output
      gc.collect()
      torch.cuda.empty_cache()

  if not per_protein:
    # Averaging an empty array would store NaN; skip the family instead.
    print("no sequences embedded for ", File_Name, "; nothing written")
    return

  final_embedding = np.array(per_protein)
  outp_dir = "final_embeddings/" + File_Name + ".pkl"
  with open(outp_dir, 'wb') as f:
    pkl.dump(final_embedding, f)
  outp_dir = "final_average_embeddings/" + File_Name + "_averaged.pkl"
  av_embedding = np.mean(final_embedding, axis=0)
  with open(outp_dir, 'wb') as f:
    pkl.dump(av_embedding, f)



#Navigate PHROGs directory here
def nav_dir(rg_st=-1, family_cap=-1, prot_cap=-1, sz_cap=-1, File_Name = "PHROG_index.tsv",
            faa_dir="Extracting-3Di-Embeddings-from-Protein-Sequences/FAA_phrog/"):
  """Walk the PHROG index table and embed each selected family.

  The first line of ``File_Name`` is treated as a header and skipped. For every
  following row, the family number is parsed from the ``phrog_<n>`` value in the
  first tab-separated column and ``embed_family`` is called for ``phrog_<n>``.
  Blank lines are ignored.

  Args:
    rg_st: Only families whose number is >= this value are processed.
    family_cap: Maximum number of families to process; -1 means no limit.
    prot_cap: Passed to ``embed_family`` as ``max_prot``.
    sz_cap: Passed to ``embed_family`` as ``max_sz``.
    File_Name: Path of the tab-separated PHROG index table.
    faa_dir: Directory prefix holding the ``phrog_<n>.faa`` files. It is
      concatenated as-is, so it must end with a path separator.
  """
  # Count processed families from zero so family_cap is independent of rg_st.
  families_done = 0
  with open(File_Name, "r") as reader:
    reader.readline()  # header row
    for line in reader:
      information = line.strip().split("\t")
      if information[0] == '':
        # Blank line (e.g. at the end of the file): nothing to parse.
        continue
      phrog_num = int(information[0].split("phrog_")[1])
      if phrog_num < rg_st:
        continue
      if family_cap != -1 and families_done >= family_cap:
        break
      print("starting ", str(phrog_num))
      tm_start = time.perf_counter()
      phrog_file = "phrog_" + str(phrog_num)
      embed_family(faa_dir, phrog_file, max_prot=prot_cap, max_sz=sz_cap)
      print(str(phrog_num), " extracted!\n")
      print("took ", str((time.perf_counter()-tm_start)/60), " minutes")
      families_done += 1

# Embed every family listed in PHROG_index.tsv; proteins longer than 5096 residues are skipped.
nav_dir(sz_cap=5096)

SyntaxError: unterminated string literal (detected at line 49) (<ipython-input-1-c88580dd61bd>, line 48)