# Implementation of AbLang from the Deane lab
Analysis of antibody heavy chains using AbLang embeddings, dimensionality-reduction techniques, and k-means clustering, to replicate findings from the AbLang paper and to further explore the embedding space.

In [1]:
#@title Mount drive and load libraries
%%capture
!pip3 install torch transformers scikit-learn
!pip install umap-learn[plot]

import os
from google.colab import drive
import torch
import pandas as pd, numpy as np

drive.mount('/content/drive/')
path = '/content/drive/MyDrive/msc-project-mbalmf01/msc-project-source-code-files-22-23-mbalmf01/notebooks'
os.chdir(path)

from plm_manipulation import start_ablang, process_seqs, batch_embed, redux_fit

In [None]:
#@title Load in model and tokenizer from Ablang Heavy model
# start_ablang comes from plm_manipulation; its result is unpacked as a
# (model, tokenizer) pair. On this run it downloaded the tokenizer/config files
# and the model weights (see the progress output below).
heavy_checkpoint = 'AbLang_heavy'
model, tokenizer = start_ablang(heavy_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/71.0 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.02k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/848 [00:00<?, ?B/s]

Downloading (…)ang_roberta_model.py:   0%|          | 0.00/1.70k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

In [6]:
#@title Subset all_paired dataframe to contain only those sequences modelled for MAPS score
# Paired sequences (first CSV column used as the index, 'Run' read as str) and
# the table of aggregation-prediction scores.
df = pd.read_csv('/content/drive/MyDrive/msc-project-mbalmf01/all_paired/230811_human_paired_seqs.csv', dtype={'Run':np.str_}, index_col=0)
mapt = pd.read_csv('/content/drive/MyDrive/msc-project-mbalmf01/MAPT/all_aggpred_scores_2023.08.16.csv')

# Rebuild the scFv string for every scored model as light + linker + heavy so it
# can be matched against df['scfv'].
linker = 'SGGSTITSYNVYYTKLSSSGT'
mapt['linker'] = linker  # a scalar broadcasts to every row; no per-row list needed
mapt['scfv'] = mapt['Model Seq L'] + mapt['linker'] + mapt['Model Seq H']

# Keep only the paired sequences that have a score row, then attach the score
# columns to them.
df = df[df['scfv'].isin(mapt['scfv'])]
df = df.merge(right=mapt, how='left', on='scfv')

# The merged frame has a fresh default index, so it is written without one.
df.to_csv('/content/drive/MyDrive/msc-project-mbalmf01/all_paired/230820_human_paired_seqs_MAPT.csv', index=False)

In [None]:
#@title Read in source antibody dataframe and the Ablang heavy chain embeddings
# The MAPT-filtered CSV is saved without an index column, so it is read back
# without index_col; index_col=0 would silently turn the first data column into
# the index and remove it from the frame's columns.
df = pd.read_csv('/content/drive/MyDrive/msc-project-mbalmf01/all_paired/230820_human_paired_seqs_MAPT.csv', dtype={'Run':np.str_})

# Embed the heavy-chain sequences in batches of 100. `model` and `tokenizer`
# must be the AbLang heavy pair loaded in the cell above.
tensor_df = batch_embed(df=df, prot_col='sequence_alignment_aa_heavy', seq_id='seq_id', batch_size=100, model=model, tokenizer=tokenizer)
tensor_df.to_csv('/content/drive/MyDrive/msc-project-mbalmf01/embeddings/230820_ablang_H_embeddings_mapt.csv')

In [None]:
#@title Load in model and tokenizer from Ablang Light model
# Light-chain objects get their own names so the heavy-chain `model`,
# `tokenizer` and `tensor_df` are not overwritten. Otherwise, re-running the
# heavy-chain embedding cell after this one would embed heavy chains with the
# light model and overwrite the heavy embeddings file.
light_model, light_tokenizer = start_ablang('AbLang_light')

# Embed the light-chain sequences of the same dataframe in batches of 100 and
# save them next to the heavy-chain embeddings.
light_tensor_df = batch_embed(df=df, prot_col='sequence_alignment_aa_light', seq_id='seq_id', batch_size=100, model=light_model, tokenizer=light_tokenizer)
light_tensor_df.to_csv('/content/drive/MyDrive/msc-project-mbalmf01/embeddings/230820_ablang_L_embeddings_mapt.csv')

Downloading (…)okenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/71.0 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.02k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/848 [00:00<?, ?B/s]

Downloading (…)ang_roberta_model.py:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

In [None]:
from google.colab import runtime


def disconnect_and_delete_runtime():
  """Disconnect from the current Colab runtime and delete it.

  Thin wrapper around ``google.colab.runtime.unassign()``; takes no arguments
  and returns nothing.
  """
  runtime.unassign()


# Called as the final step of the notebook, after both embedding files are written.
disconnect_and_delete_runtime()