# Generating per-residue embeddings of heavy and light chains using AbLang

Sequences are loaded into the notebook and then split into batches to avoid saturating the RAM. Per-residue (amino acid) embeddings are generated for each batch, and each batch of embeddings is saved to disk as a NumPy array for later use.


In [1]:
#@title Mount drive and load libraries
%%capture
!pip3 install torch torchvision torchaudio transformers sentencepiece accelerate --extra-index-url https://download.pytorch.org/whl/cu116
!python -m pip install ankh

import os, gc
import pandas as pd, numpy as np, torch
from transformers import AutoModel, AutoTokenizer
from google.colab import drive

# Mount Google Drive so the project files under /content/drive/ are reachable.
drive.mount('/content/drive/')
path = '/content/drive/MyDrive/msc-project-mbalmf01/msc-project-source-code-files-22-23-mbalmf01/notebooks'
# Make the notebooks directory the working directory; presumably this is what
# lets the local plm_manipulation module be imported below — TODO confirm.
os.chdir(path)

# Project-local helpers used by the embedding cells.
from plm_manipulation import chunks, get_aa_embedding

In [2]:
#@title Load in sequences for embedding
# Read the paired heavy/light sequence table; the 'Run' column is forced to str.
csv_path = '/content/drive/MyDrive/msc-project-mbalmf01/all_paired/230813_human_paired_seqs_MAPT_scores.csv'
df = pd.read_csv(csv_path, dtype={'Run': np.str_})
# Keep the column index as the final expression so the notebook displays it.
df.columns

Index(['Unnamed: 0', 'Unnamed: 1', 'sequence_id_heavy', 'ANARCI_status_heavy',
       'sequence_heavy', 'sequence_alignment_aa_heavy', 'v_call_heavy',
       'd_call_heavy', 'j_call_heavy', 'sequence_id_light',
       'ANARCI_status_light', 'sequence_light', 'v_call_light', 'j_call_light',
       'sequence_alignment_aa_light', 'Run', 'seq_id', 'scfv', 'Acidics Fv',
       'Basics Fv', 'Charge pH5 Fv', 'Charge pH5 H', 'Charge pH5 L',
       'Charge pH7 Fv', 'Charge pH7 H', 'Charge pH7 L', 'Filename',
       'Hydrophobic Surface Fv', 'Hydrophobic Surface H',
       'Hydrophobic Surface L', 'MAPSS IgG1 pH5.0', 'MAPSS IgG1 pH7.4',
       'MAPSS IgG4P pH5.0', 'MAPSS IgG4P pH7.4', 'Model Seq L'],
      dtype='object')

In [None]:
#@title Load the model and tokenizer from Ablang_heavy
%%capture
tokenizer = AutoTokenizer.from_pretrained('qilowoq/AbLang_heavy')
model = AutoModel.from_pretrained('qilowoq/AbLang_heavy', trust_remote_code=True)

In [4]:
#@title Heavy chain per amino acid embeddings
# Embed the heavy chain sequences batch by batch and write one .npy file per
# batch, so that only a single batch of embeddings is held in RAM at a time.
seqs = df['sequence_alignment_aa_heavy'].to_list()
# Longest heavy chain sequence; handed to get_aa_embedding for every sequence
# (presumably as the padding length — TODO confirm against plm_manipulation).
max_length = max(len(s) for s in seqs)

# Aim for roughly ten batches. The max(1, ...) guard keeps the batch size
# non-zero when there are fewer than ten sequences.
seqs_chunked = chunks(seqs, max(1, len(seqs) // 10))

for num, batch in enumerate(seqs_chunked):
    embeddings = [get_aa_embedding(s, model, tokenizer, max_length) for s in batch]
    # Join the per-sequence arrays along the first axis into one array.
    embeddings = np.concatenate(embeddings)
    # save to drive
    filepath = f'/content/drive/MyDrive/msc-project-mbalmf01/embeddings/230819_ablang_H_aa_embeddings_{num}.npy'
    np.save(filepath, embeddings)
    # clear the RAM before the next batch is embedded
    del embeddings
    gc.collect()

In [6]:
# Report the on-disk size of the first heavy chain embedding batch in GiB.
embedding_file = '/content/drive/MyDrive/msc-project-mbalmf01/embeddings/230819_ablang_H_aa_embeddings_0.npy'
bytes_per_gb = 1024 ** 3  # 1073741824
file_size = os.path.getsize(embedding_file)
size_in_gb = round(file_size / bytes_per_gb, 3)
print(f'file size of each embedding numpy array is {size_in_gb}GB')

file size of each embedding numpy array is 2.682GB


In [None]:
#@title Repeat for light chain embeddings
# Swap in the light chain tokenizer and model, then embed the light chain
# sequences batch by batch, writing one .npy file per batch.
tokenizer = AutoTokenizer.from_pretrained('qilowoq/AbLang_light')
model = AutoModel.from_pretrained('qilowoq/AbLang_light', trust_remote_code=True)

seqs = df['sequence_alignment_aa_light'].to_list()
# Longest light chain sequence; handed to get_aa_embedding for every sequence
# (presumably as the padding length — TODO confirm against plm_manipulation).
max_length = max(len(s) for s in seqs)

# Re-chunk using the light chain sequences. Without this the loop would reuse
# the heavy chain batches left over from the previous cell (or iterate over
# nothing, if chunks returns a generator that has already been consumed).
# The max(1, ...) guard keeps the batch size non-zero for fewer than ten sequences.
seqs_chunked = chunks(seqs, max(1, len(seqs) // 10))

for num, batch in enumerate(seqs_chunked):
    embeddings = [get_aa_embedding(s, model, tokenizer, max_length) for s in batch]
    # Join the per-sequence arrays along the first axis into one array.
    embeddings = np.concatenate(embeddings)
    # save to drive
    filepath = f'/content/drive/MyDrive/msc-project-mbalmf01/embeddings/230819_ablang_L_aa_embeddings_{num}.npy'
    np.save(filepath, embeddings)
    # clear the RAM before the next batch is embedded
    del embeddings
    gc.collect()