#Taking outputs from ABodyBuilder and MAPT, and embedding associated antibody sequences in Ankh and Ablang.

Ablang heavy and light chain embeddings will be concatenated along the row axis, and the combination of features will be used for a regression task to predict hydrophobicity and MAPT score.

In [1]:
#@title Mount drive and load libraries
%%capture
!pip3 install torch torchvision torchaudio transformers sentencepiece accelerate --extra-index-url https://download.pytorch.org/whl/cu116
!python -m pip install ankh

import os
import pandas as pd, numpy as np
from google.colab import drive

drive.mount('/content/drive/')
path = '/content/drive/MyDrive/msc-project-mbalmf01'
os.chdir(path)

In [2]:
df = pd.read_csv('/content/drive/MyDrive/msc-project-mbalmf01/all_paired/230716_human_paired_seqs.csv', dtype={'Run':np.str_}, index_col=0)
print(df.columns)

Index(['Unnamed: 1', 'sequence_id_heavy', 'ANARCI_status_heavy',
       'sequence_heavy', 'sequence_alignment_aa_heavy', 'v_call_heavy',
       'd_call_heavy', 'j_call_heavy', 'sequence_id_light',
       'ANARCI_status_light', 'sequence_light', 'v_call_light', 'j_call_light',
       'sequence_alignment_aa_light', 'Run', 'seq_id', 'scfv'],
      dtype='object')


In [3]:
mapt = pd.read_csv('MAPT/all_aggpred_scores_2023.08.16.csv')
print(mapt.describe())
print(mapt.columns)

          Acidics Fv      Basics Fv  Charge pH5 Fv   Charge pH5 H  \
count  102496.000000  102496.000000  102496.000000  102434.000000   
mean       17.014664      18.178280       2.775484       1.995968   
std         2.518979       2.142618       2.668971       2.221373   
min         6.000000       5.000000     -12.000000     -11.000000   
25%        15.000000      17.000000       1.000000       1.000000   
50%        17.000000      18.000000       3.000000       2.000000   
75%        19.000000      20.000000       5.000000       3.000000   
max        29.000000      28.000000      14.000000      14.000000   

        Charge pH5 L  Charge pH7 Fv   Charge pH7 H   Charge pH7 L  \
count  102495.000000  102496.000000  102434.000000  102495.000000   
mean        0.780731       1.163616       1.124783       0.039514   
std         1.878264       2.511273       2.084882       1.796477   
min       -10.000000     -13.000000     -12.000000     -10.000000   
25%         0.000000       0.0000

In [11]:
linker = 'SGGSTITSYNVYYTKLSSSGT'
l = [linker]*mapt.shape[0]
mapt['linker'] = l
mapt['scfv'] = mapt['Model Seq L'] + mapt['linker'] + mapt['Model Seq H']

df = df[df['scfv'].isin(mapt['scfv'].to_list())]

df.to_csv('/content/drive/MyDrive/msc-project-mbalmf01/all_paired/230820_human_paired_seqs_MAPT.csv', index=None)

(98618, 17)

In [16]:
#@title PLM manipulation

from transformers import AutoModel, AutoTokenizer
import torch, pandas as pd, numpy as np
import re
import ankh, inspect

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

def start_ablang(ablang_model: str):
    tokenizer = AutoTokenizer.from_pretrained(f'qilowoq/{ablang_model}')
    model = AutoModel.from_pretrained(f'qilowoq/{ablang_model}', trust_remote_code=True)
    return model, tokenizer

def start_ankh(device):
    model, tokenizer = ankh.load_large_model()
    model.eval()
    model.to(device=device)
    return model, tokenizer

def chunks(l, n):
    '''
    splits a list into evenly sized chunks
    '''
    return [l[i:i + n] for i in range(0, len(l), n)]

def get_aa_embedding(sequence: str, model: str, tokenizer, max_length: int):
    '''
    Apply fun to a list of sequences using Ablang and generate per-residue embeddings
    '''
    sequence = ' '.join(sequence)
    encoded_input = tokenizer(sequence, padding='max_length', return_tensors='pt')
    model_output = model(**encoded_input)
    lhs = model_output.last_hidden_state
    return lhs.detach().numpy()

def process_seqs(seqs: list, model, tokenizer, device=device) -> list:
    seqs = [" ".join(list(re.sub(r"[UZOB]", "X", sequence))) for sequence in seqs]
    ids = tokenizer.batch_encode_plus(seqs, add_special_tokens=True, padding="longest")
    input_ids = torch.tensor(ids['input_ids']).to(device)
    attention_mask = torch.tensor(ids['attention_mask']).to(device)
    with torch.no_grad():
        embedding_repr = model(input_ids=input_ids,attention_mask=attention_mask)
    ember = embedding_repr.last_hidden_state
    ember = [i.mean(dim=0) for i in ember]
    return ember

def batch_embed(df: pd, prot_col: str, seq_id: str, batch_size: int, model, tokenizer) -> pd:
    l = []
    batch_size = 100
    for i in range(0, df.shape[0], batch_size):
        df1 = df.iloc[i:i + batch_size]
        batch = df1[prot_col].to_list()
        batch = process_seqs(seqs=batch, model=model, tokenizer=tokenizer)
        protein_embeddings_np = np.array([emb.cpu().numpy() for emb in batch])
        df_tensor = pd.DataFrame(protein_embeddings_np)
        df_tensor['seq_id'] = df1[seq_id].to_list()
        l.append(df_tensor)
    return pd.concat(l)

def get_sequence_embeddings(encoded_input, model_output):
    '''
    Taken from Ablang paper - may not work on other pLMs
    '''
    mask = encoded_input['attention_mask'].float()
    d = {k: v for k, v in torch.nonzero(mask).cpu().numpy()} # dict of sep tokens
    # make sep token invisible
    for i in d:
        mask[i, d[i]] = 0
    mask[:, 0] = 0.0 # make cls token invisible
    mask = mask.unsqueeze(-1).expand(model_output.last_hidden_state.size())
    sum_embeddings = torch.sum(model_output.last_hidden_state * mask, 1)
    sum_mask = torch.clamp(mask.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

def redux_fit(model, components: int, randstate: int, data: pd, **kwargs) -> pd:
    np.random.seed(randstate)
    model_args = inspect.signature(model).parameters
    if 'method' in model_args:
        method = kwargs.pop('method', None)
        if method:
            redux = model(n_components=components, method=method).fit_transform(data)
        else:
            redux = model(n_components=components).fit_transform(data)
    else:
        redux = model(n_components=components).fit_transform(data)
    X=redux[:, 0]
    y=redux[:, 1]
    new_df = pd.DataFrame([X, y]).transpose()
    new_df.rename(columns={0:'X', 1:'y'}, inplace=True)
    return new_df

def run_ankh(df: pd, prot_col: str, seq_id: str) -> pd:
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("Using device: {}".format(device))

    model, tokenizer = start_ankh(device=device)
    tensor_df = batch_embed(df=df, prot_col=prot_col, seq_id=seq_id, batch_size=100, model=model, tokenizer=tokenizer)
    return tensor_df

def run_ablang(df: pd, prot_col: str, seq_id: str, ablang_model: str) -> pd:
    device = 'cpu'
    model, tokenizer = start_ablang(ablang_model=ablang_model)
    tensor_df = batch_embed(df=df, prot_col=prot_col, seq_id=seq_id, batch_size=100, model=model, tokenizer=tokenizer)
    return tensor_df

In [None]:
path = '/content/drive/MyDrive/msc-project-mbalmf01/msc-project-source-code-files-22-23-mbalmf01/notebooks'
os.chdir(path)

import torch
#from plm_manipulation import run_ankh, run_ablang, batch_embed

In [17]:
import time
def get_seq_embeddings(sequence: str, model: str, tokenizer, max_length: int):
    '''
    Apply fun to a list of sequences using Ablang and generate per-residue embeddings
    '''
    sequence = ' '.join(sequence)
    encoded_input = tokenizer(sequence, padding='max_length', return_tensors='pt')
    model_output = model(**encoded_input)
    mask = encoded_input['attention_mask'].float()
    d = {k: v for k, v in torch.nonzero(mask).cpu().numpy()} # dict of sep tokens
    # make sep token invisible
    for i in d:
        mask[i, d[i]] = 0
    mask[:, 0] = 0.0 # make cls token invisible
    mask = mask.unsqueeze(-1).expand(model_output.last_hidden_state.size())
    sum_embeddings = torch.sum(model_output.last_hidden_state * mask, 1)
    sum_mask = torch.clamp(mask.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

def start_ablang(ablang_model: str):
    tokenizer = AutoTokenizer.from_pretrained(f'qilowoq/{ablang_model}')
    model = AutoModel.from_pretrained(f'qilowoq/{ablang_model}', trust_remote_code=True)
    return model, tokenizer

model, tokenizer = start_ablang('AbLang_Heavy')

sequences = df['sequence_alignment_aa_heavy'].to_list()
max_length = max([len(i) for i in sequences])
start = time.time()
embeddings = [get_seq_embeddings(seq, model, tokenizer, max_length) for seq in sequences]
end = time.time() - start
print(end)

Downloading (…)okenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/71.0 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.02k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/848 [00:00<?, ?B/s]

Downloading (…)ang_roberta_model.py:   0%|          | 0.00/1.70k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/qilowoq/AbLang_Heavy:
- AbLang_roberta_model.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

OSError: ignored

In [21]:
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('qilowoq/AbLang_heavy')
model = AutoModel.from_pretrained('qilowoq/AbLang_heavy', trust_remote_code=True, from_tf=True)

sequence_Example = ' '.join("EVQLQESGPGLVKPSETLSLTCTVSGGPINNAYWTWIRQPPGKGLEYLGYVYHTGVTNYNPSLKSRLTITIDTSRKQLSLSLKFVTAADSAVYYCAREWAEDGDFGNAFHVWGQGTMVAVSSASTKGPSVFPLAPSSKSTSGGTAALGCL")
encoded_input = tokenizer(sequence_Example, return_tensors='pt')
model_output = model(**encoded_input)


OSError: ignored

In [None]:
device = 'cpu'
tensor_df = run_ablang(df=df, prot_col='sequence_alignment_aa_heavy', seq_id='seq_id', ablang_model='AbLang_heavy')
tensor_df.to_csv('/content/drive/MyDrive/msc-project-mbalmf01/all_paired/230820_human_paired_seqs_MAPT_ablang_heavy.csv', index=None)
tensor_df = run_ablang(df=df, prot_col='sequence_alignment_aa_light', seq_id='seq_id', ablang_model='AbLang_light')
tensor_df.to_csv('/content/drive/MyDrive/msc-project-mbalmf01/all_paired/230811_human_paired_seqs_MAPT_ablang_light.csv', index=None)

Using device: cpu


Downloading (…)okenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/71.0 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.02k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/848 [00:00<?, ?B/s]

Downloading (…)ang_roberta_model.py:   0%|          | 0.00/1.70k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/qilowoq/AbLang_heavy:
- AbLang_roberta_model.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

Using device: cpu


Downloading (…)okenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/71.0 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/3.02k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/848 [00:00<?, ?B/s]

Downloading (…)ang_roberta_model.py:   0%|          | 0.00/1.77k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/qilowoq/AbLang_light:
- AbLang_roberta_model.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

In [None]:
from google.colab import runtime

def disconnect_and_delete_runtime():
  """Disconnects and deletes the current runtime."""
  runtime.unassign()

disconnect_and_delete_runtime()