# Taking outputs from ABodyBuilder and MAPT, and embedding the associated antibody sequences with Ankh and Ablang.

Ablang heavy and light chain embeddings will be concatenated along the row axis, and the combination of features will be used for a regression task to predict hydrophobicity and MAPT score.

In [None]:
#@title Mount drive and load libraries
%%capture
!pip3 install torch torchvision torchaudio transformers sentencepiece accelerate --extra-index-url https://download.pytorch.org/whl/cu116
!python -m pip install ankh

import os
import pandas as pd, numpy as np
from google.colab import drive

# Mount Google Drive so the project files under /content/drive/ are reachable.
drive.mount('/content/drive/')

# Switch the working directory to the project's notebooks folder.
# NOTE(review): the relative paths and the plm_manipulation import used later
# presumably resolve against this directory - confirm.
path = '/content/drive/MyDrive/msc-project-mbalmf01/msc-project-source-code-files-22-23-mbalmf01/notebooks'
os.chdir(path)

import torch
from plm_manipulation import run_ankh, run_ablang, batch_embed, start_ankh

In [None]:
# Load the paired human sequences (first CSV column becomes the index) and the
# MAPT score table.
df = pd.read_csv(
    '/content/drive/MyDrive/msc-project-mbalmf01/all_paired/230716_human_paired_seqs.csv',
    dtype={'Run': np.str_},
    index_col=0,
)
mapt = pd.read_csv('MAPT/230811_MAPT_scores_abs.csv')

# Rebuild an scFv string for every MAPT row as
# 'Model Seq L' + linker + 'Model Seq H'; the scalar linker is applied to
# each row of the column.
linker = 'SGGSTITSYNVYYTKLSSSGT'
mapt['scfv'] = mapt['Model Seq L'] + linker + mapt['Model Seq H']

# Keep only the paired sequences whose scFv also appears in the MAPT table.
df = df[df['scfv'].isin(mapt['scfv'])]

# Persist the filtered subset without writing the index column.
df.to_csv(
    '/content/drive/MyDrive/msc-project-mbalmf01/all_paired/230811_human_paired_seqs_MAPT.csv',
    index=False,
)

In [None]:
# Reload the MAPT-filtered sequences from the CSV saved on Drive, keeping the
# 'Run' column as strings.
df = pd.read_csv('/content/drive/MyDrive/msc-project-mbalmf01/all_paired/230811_human_paired_seqs_MAPT.csv', dtype={'Run':np.str_})

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

model, tokenizer = start_ankh(device=device)

prot_col = 'scfv'
seq_id = 'seq_id'

# Ankh embedding is GPU RAM intensive.
# The job is split into chunks of `chunk_size` rows, each written to its own
# CSV, and every chunk is passed to batch_embed with batch_size=100.
chunk_size = 5000

for i in range(0, df.shape[0], chunk_size):
  # iloc slicing clamps at the end of the frame, so the final (possibly
  # shorter) chunk needs no special-case branch.
  chunk = df.iloc[i:i + chunk_size, :]
  tensor_df = batch_embed(df=chunk, prot_col=prot_col, seq_id=seq_id, batch_size=100, model=model, tokenizer=tokenizer)
  tensor_df.to_csv(f'/content/drive/MyDrive/msc-project-mbalmf01/all_paired/230811_human_paired_seqs_MAPT_ankh{i}.csv', index=None)
  # Release cached GPU memory before starting the next chunk.
  torch.cuda.empty_cache()
  # `>=` (not `>`) so the final message is also printed when the row count is
  # an exact multiple of chunk_size.
  if i + chunk_size >= df.shape[0]:
    print('completed ankh embedding')
  else:
    print(f'completed {i}')

In [None]:
from google.colab import runtime

def disconnect_and_delete_runtime() -> None:
  """Disconnects and deletes the current runtime.

  Thin wrapper around ``runtime.unassign()`` from ``google.colab``; takes no
  arguments and returns nothing.
  """
  runtime.unassign()

# Invoked immediately when this cell runs.
disconnect_and_delete_runtime()