<a href="https://colab.research.google.com/github/pikanaeri/plm-model-comparison/blob/main/extracting-embeddings/ProstT5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Template Code for Extracting Averaged and Final Embeddings for each Model
- Stores the embeddings as .pkl files
- Final embeddings are stored in lists of vectors
- Will need the PHROGs annotation table: https://storage.cloud.google.com/plm-model-comparison/PHROG_index.tsv

## Setting Up Virtual Machine

In [None]:
#@title Tmux Setup (optional; so that your background processes save)
# Refresh the package index quietly, then install tmux without prompting.
sudo apt-get update -qq
sudo apt-get install -y tmux
# Open a tmux session; run the remaining setup and extraction commands inside it.
tmux

In [None]:
#@title Downloading Anaconda
#@markdown * If a popup menu appears, choose the "package maintainer's version" option
#@markdown * A menu containing all of Anaconda's terms of service will appear; go through these and select "yes" to install Anaconda on your machine
#@markdown * An older version of Anaconda (2022) will be downloaded; keep this in mind when checking for compatible packages
# Update the system and install the prerequisite packages (these commands ask for confirmation).
sudo apt-get update
sudo apt-get install bzip2 libxml2-dev
sudo apt upgrade

# Fetch the Anaconda 2022.05 installer, run it interactively, then delete the installer script.
sudo apt-get install wget
wget https://repo.anaconda.com/archive/Anaconda3-2022.05-Linux-x86_64.sh
bash Anaconda3-2022.05-Linux-x86_64.sh
rm Anaconda3-2022.05-Linux-x86_64.sh

In [None]:
#@title Allocating Memory to Linux
#@markdown Code from: https://linuxize.com/post/create-a-linux-swap-file/
# Create a 15 GB swap file and restrict it to owner read/write (mode 600).
sudo fallocate -l 15G /swapfile
sudo chmod 600 /swapfile
ls -lh /swapfile
# Format the file as swap space and enable it for the current boot.
sudo mkswap /swapfile
sudo swapon /swapfile


#@markdown Register the swap file in /etc/fstab so that it is re-enabled after a reboot
# The fstab entry is data, not a command, so it is appended with tee instead of being
# typed as a bare line. The grep guard keeps re-runs from adding duplicate entries.
grep -q '^/swapfile ' /etc/fstab || echo '/swapfile swap swap defaults 0 0' | sudo tee -a /etc/fstab

In [None]:
#@title Installing Git and pip
# git is needed to clone the sequence repository; pip3 installs the Python dependencies.
sudo apt-get install git
sudo apt install python3-pip

# Print the installed versions as a quick sanity check.
python3 --version
pip3 --version

In [None]:
#@title Cloning PHROGs Database; files can also be found under the Fasta Files category at https://phrogs.lmge.uca.fr/
# NOTE(review): nav_dir() opens "phrog_<N>.faa" relative to the current working directory,
# so the .faa files presumably need to be copied out of (or Python started inside) the
# cloned folder that holds them - confirm the repository layout.
git clone https://github.com/pikanaeri/Extracting-3Di-Embeddings-from-Protein-Sequences.git

In [None]:
#@title Installing Dependencies
# Install the Hugging Face stack plus the CUDA 11.6 builds of PyTorch.
pip install transformers
pip3 install torch torchvision torchaudio transformers sentencepiece accelerate --extra-index-url https://download.pytorch.org/whl/cu116

# -p keeps these commands from failing when the directories already exist (e.g. on a re-run).
mkdir -p final_embeddings
mkdir -p final_average_embeddings
# Start an interactive Python session; the Python cells below are run inside it.
python3

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m31.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m82.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.9 MB/s[0m eta [36m0:00:00[0m
Col

# ProstT5 Embedding Extraction
- Maximum Sequence Size 5096

In [None]:
#@title Importing Dependencies
from transformers import T5Tokenizer, T5EncoderModel
import torch
import re
import time
import gc
import numpy as np
import pickle as pkl

In [None]:
#@title Creating model and setting device to GPU if available
# Prefer the first CUDA GPU; fall back to the CPU when no GPU is visible.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print("Using device: {}".format(device))
transformer_link = "Rostlab/ProstT5"
print("Loading: {}".format(transformer_link))
model = T5EncoderModel.from_pretrained(transformer_link)
# Use half precision on GPU only and keep full (float32) precision on CPU.
# `device` is a torch.device object, so its `.type` is compared instead of the object
# itself, and the full-precision cast on nn.Module is `.float()` (there is no `.full()`).
model = model.float() if device.type == 'cpu' else model.half()
model = model.to(device)
model = model.eval()

tokenizer = T5Tokenizer.from_pretrained(transformer_link, do_lower_case=False )

In [None]:
#@title Embedding Families

def _embed_batch(sequences):
  """Embed a batch of raw sequences and return one mean-pooled numpy vector per sequence.

  Uses the module-level `tokenizer`, `model` and `device`.
  """
  # Record residue counts BEFORE inserting spaces: the spaced string has 2*L-1
  # characters, so slicing by its length would pull the end-of-sequence token and
  # any padding positions into the mean.
  lengths = [len(seq) for seq in sequences]
  # Map rare/ambiguous residues to X and separate residues with spaces for the tokenizer.
  spaced = [" ".join(list(re.sub("[UZOB]", "X", seq))) for seq in sequences]
  # NOTE(review): the ProstT5 model card prepends an "<AA2fold>" prefix token to amino-acid
  # input and skips it when slicing embeddings; this code does not - confirm whether that
  # is intentional before changing it, since it would alter every embedding.
  ids = tokenizer.batch_encode_plus(spaced, add_special_tokens=True, padding="longest")
  input_ids = torch.tensor(ids['input_ids']).to(device)
  attention_mask = torch.tensor(ids['attention_mask']).to(device)
  with torch.no_grad():
    embedding_repr = model(input_ids=input_ids, attention_mask=attention_mask)

  pooled = []
  for row, n_res in enumerate(lengths):
    # Assumes one token per residue, so the first n_res positions are the residues.
    emb = embedding_repr.last_hidden_state[row, :n_res].mean(dim=0)
    pooled.append(emb.cpu().numpy())

  # Release the batch tensors once per batch rather than once per sequence.
  del embedding_repr, input_ids, attention_mask
  gc.collect()
  torch.cuda.empty_cache()
  return pooled


def embed_family(Prev_dir, File_Name, max_prot=-1, max_gsz=1, max_sz=-1):
  """Embed every protein of one family and pickle the per-protein and family-mean vectors.

  Reads `Prev_dir + File_Name + ".faa"` as alternating header/sequence lines (one
  sequence per line), mean-pools the model's last hidden state over each protein's
  residues, and writes two files into the current working directory:
  `File_Name + ".pkl"` (array with one row per protein) and
  `File_Name + "_averaged.pkl"` (mean of those rows).

  Args:
    Prev_dir: path prefix (including any trailing separator) of the .faa file.
    File_Name: family file name without the ".faa" extension; also the output stem.
    max_prot: stop after this many accepted proteins; -1 means no limit.
    max_gsz: number of sequences embedded per forward pass.
    max_sz: skip sequences longer than this; -1 means no limit.
  """
  final_embedding = []
  batch = []
  num_proteins = 0

  # The context manager closes the file even if tokenization or the model raises.
  with open(Prev_dir + File_Name + ".faa", "r") as reader:
    while True:
      name = reader.readline().strip()
      if name == '':
        break
      # Gap characters are treated as unknown residues.
      sequence = reader.readline().strip().replace('-', 'X')
      if max_sz == -1 or len(sequence) <= max_sz:
        batch.append(sequence)
        num_proteins += 1
        if max_prot != -1 and num_proteins >= max_prot:
          break  # the partially filled batch is flushed below
      if batch and len(batch) >= max_gsz:
        final_embedding.extend(_embed_batch(batch))
        batch = []

  # Flush whatever is left when the file ends or the protein cap is reached mid-batch.
  if batch:
    final_embedding.extend(_embed_batch(batch))

  final_embedding = np.array(final_embedding)
  with open(File_Name + ".pkl", 'wb') as f:
    pkl.dump(final_embedding, f)

  av_embedding = np.mean(final_embedding, axis=0)
  with open(File_Name + "_averaged.pkl", 'wb') as f:
    pkl.dump(av_embedding, f)




def nav_dir(rg_st=-1, family_cap=-1, prot_cap=-1, sz_cap=-1, File_Name = "PHROG_index.tsv"):
  """Embed every annotated PHROG family listed in the tab-separated index file.

  A family is processed when column 7 of its row is not 'unknown function' and its
  PHROG number (parsed from column 1, "phrog_<N>") is at least `rg_st`. Each selected
  family is passed to `embed_family`, which reads "phrog_<N>.faa" from the current
  working directory.

  Args:
    rg_st: lowest PHROG number to process; -1 processes all of them.
    family_cap: stop after this many families have been embedded; -1 means no limit.
    prot_cap: forwarded to `embed_family` as `max_prot`.
    sz_cap: forwarded to `embed_family` as `max_sz`.
    File_Name: path of the index file; its first line is treated as a header.
  """
  # Count families embedded in THIS call. Seeding the counter from rg_st made the
  # family_cap comparison depend on the starting PHROG number.
  cnt = 0
  with open(File_Name, "r") as reader:
    reader.readline()  # skip the header row
    for line in reader:
      if not line.strip():
        continue  # tolerate blank lines instead of failing on the column lookup
      information = line.strip().split("\t")
      phrog_num = int(information[0].split("phrog_")[1])
      if information[6] != 'unknown function' and phrog_num >= rg_st:
        tm_start = time.perf_counter()
        phrog_file = "phrog_" + str(phrog_num)
        embed_family("", phrog_file, max_prot=prot_cap, max_sz=sz_cap)
        cnt += 1
        # Report progress before the cap check so the last family is logged too.
        print(cnt, " extracted!")
        print("took ", str((time.perf_counter()-tm_start)/60), " minutes")
        if family_cap != -1 and cnt >= family_cap:
          break


# Run the extraction with the default settings: no caps, every family whose annotation
# is not 'unknown function', index read from "PHROG_index.tsv" in the working directory.
nav_dir()


Using device: cuda:0
Loading: Rostlab/ProstT5


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[[-0.06506   0.05774  -0.01139  ...  0.02162  -0.0612   -0.06143 ]
 [-0.061     0.07007   0.02899  ...  0.02339  -0.04446  -0.0517  ]
 [-0.04413   0.04855   0.02495  ...  0.01591  -0.04044  -0.0746  ]
 ...
 [-0.02856   0.02895   0.01654  ... -0.003574 -0.02275  -0.0258  ]
 [-0.013054  0.05884  -0.006424 ... -0.006264 -0.03583  -0.0313  ]
 [-0.07      0.07916   0.00418  ... -0.009865 -0.03995  -0.05136 ]]
669
1024
[-0.0336    0.03876   0.00395  ... -0.002712 -0.04007  -0.06192 ]
