# Taking outputs from MAPT and embedding the associated antibody sequences with Ankh

1.  Import/install dependencies and clone repo
2.  Read in sequence information
3.  Batch embed either heavy chain or ScFv sequences
4.  Terminate session

In [1]:
#@title Mount drive and load libraries
%%capture
token = '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'
username = 'mbalmf01'
repo = 'msc-project-source-code-files-22-23-mbalmf01'
!git clone https://{token}@github.com/Birkbeck/msc-project-source-code-files-22-23-mbalmf01
!mkdir /content/heavy
!mkdir /content/scfv

!pip3 install torch torchvision torchaudio transformers sentencepiece accelerate --extra-index-url https://download.pytorch.org/whl/cu116
!python -m pip install ankh

import os, sys
import pandas as pd, numpy as np, torch

sys.path.append('/content/msc-project-source-code-files-22-23-mbalmf01/scripts')

from plm_manipulation import start_ankh, run_ankh, batch_embed

In [4]:
#@title Read in sequences for embeddings and create new ScFv column for heavy and light
linker = 'SGGSTITSYNVYYTKLSSSGT'
df = pd.read_parquet('/content/msc-project-source-code-files-22-23-mbalmf01/data_files/230816_aggpred_scores.parquet')

# ScFv = light chain + linker + heavy chain; the scalar linker is broadcast across every row
df['scfv'] = df['Model Seq L'] + linker + df['Model Seq H']

In [6]:
#@title Load model and tokenizer
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')

model, tokenizer = start_ankh(device=device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/31.2k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/849 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/7.52G [00:00<?, ?B/s]

In [None]:
#@title Batch embedding heavy chain sequences
prot_col='Model Seq H'
seq_id='Filename'
out_dir = '/content/heavy'
chunk_size = 5000  # rows embedded and saved per chunk file
batch_size = 100   # rows handed to batch_embed per batch

#Ankh embedding is GPU RAM intensive.
#need to split the job up into chunks of 5000, which in turn will be split into batches of 100
os.makedirs(out_dir, exist_ok=True)
chunk_paths = []
for i in range(0, df.shape[0], chunk_size):
  # iloc slicing clamps at the end of the frame, so the final (partial) chunk needs no special case
  tensor_df = batch_embed(df=df.iloc[i:i+chunk_size,:], prot_col=prot_col, seq_id=seq_id, batch_size=batch_size, model=model, tokenizer=tokenizer)
  chunk_path = os.path.join(out_dir, f'ankh_embedding_H{i}.parquet')
  # Write real parquet: the chunks are read back with pd.read_parquet below, which cannot parse CSV.
  # NOTE(review): assumes the frame returned by batch_embed is parquet-serialisable
  # (e.g. string column names on older pandas) — confirm.
  tensor_df.to_parquet(chunk_path)
  chunk_paths.append(chunk_path)
  torch.cuda.empty_cache()
  print(f'completed {i}')
print('completed ankh embedding')

# Read back exactly the chunks written above, in row order, by full path.
# os.listdir would return bare file names and, on a re-run, also the merged output file.
frames = [pd.read_parquet(parq) for parq in chunk_paths]
embeddings = pd.concat(frames)
embeddings.to_parquet(os.path.join(out_dir, 'ankh_embeddings_H.parquet'))


In [None]:
#@title Batch embedding ScFv sequences
prot_col='scfv'
seq_id='Filename'
out_dir = '/content/scfv'
chunk_size = 5000  # rows embedded and saved per chunk file
batch_size = 100   # rows handed to batch_embed per batch

#Ankh embedding is GPU RAM intensive.
#need to split the job up into chunks of 5000, which in turn will be split into batches of 100
os.makedirs(out_dir, exist_ok=True)
chunk_paths = []
for i in range(0, df.shape[0], chunk_size):
  # iloc slicing clamps at the end of the frame, so the final (partial) chunk needs no special case
  tensor_df = batch_embed(df=df.iloc[i:i+chunk_size,:], prot_col=prot_col, seq_id=seq_id, batch_size=batch_size, model=model, tokenizer=tokenizer)
  chunk_path = os.path.join(out_dir, f'ankh_embedding_scfv{i}.parquet')
  # Write real parquet: the chunks are read back with pd.read_parquet below, which cannot parse CSV.
  # NOTE(review): assumes the frame returned by batch_embed is parquet-serialisable
  # (e.g. string column names on older pandas) — confirm.
  tensor_df.to_parquet(chunk_path)
  chunk_paths.append(chunk_path)
  torch.cuda.empty_cache()
  print(f'completed {i}')
print('completed ankh embedding')

# Read back exactly the chunks written above, in row order, by full path.
# os.listdir would return bare file names and, on a re-run, also the merged output file.
frames = [pd.read_parquet(parq) for parq in chunk_paths]
embeddings = pd.concat(frames)
embeddings.to_parquet(os.path.join(out_dir, 'ankh_embeddings_scfv.parquet'))

In [None]:
#@title Run if session is left unattended to minimise wasted resource
from google.colab import runtime


def disconnect_and_delete_runtime() -> None:
  """Release the Colab runtime backing this session by calling runtime.unassign()."""
  runtime.unassign()


# Executed last so the runtime is released once every preceding cell has finished
disconnect_and_delete_runtime()