In [2]:
import pandas as pd
import numpy as np
from Levenshtein import distance
from abnumber import Chain

In [3]:
def lev_identity(seq1, seq2):
    """Return the length-normalised Levenshtein identity of two sequences.

    The identity is ``1 - distance(seq1, seq2) / max(len(seq1), len(seq2))``,
    so identical sequences score 1.0 and the score decreases as more edits
    are needed to turn one sequence into the other.

    Args:
        seq1: First sequence (string).
        seq2: Second sequence (string).

    Returns:
        The normalised identity as a float. Two empty sequences are treated
        as identical and return 1.0.
    """
    max_len = max(len(seq1), len(seq2))
    if max_len == 0:
        # Both sequences are empty: there is nothing to edit, and dividing by
        # the length would raise ZeroDivisionError.
        return 1.0

    return 1 - distance(seq1, seq2) / max_len

In [4]:
import os

# Load the sequence table; the first CSV column is used as the index.
# NOTE(review): presumably the generated antibody sequences, judging by the
# 'Antigen' / 'VH' / 'CDR3.H' columns read further down — confirm.
seq_df = pd.read_csv('RBD_1K_sequences_24-03-16.csv', index_col=0)

In [8]:
# Apply the minimum-value filters one after another, reporting the table
# shape before the first filter and after each one.
print(seq_df.shape)
for column, cutoff in (('OASis Percentile', 0.7), ('v_identity.H', 0.85)):
    keep_rows = seq_df[column] >= cutoff
    seq_df = seq_df[keep_rows]
    print(seq_df.shape)

(3689, 51)
(3401, 51)
(3392, 51)


In [9]:
# Show how many rows of seq_df belong to each value of the 'Antigen' column.
seq_df['Antigen'].value_counts()

Antigen
RBD     969
MPV     903
Ang2    853
VEGF    667
Name: count, dtype: int64

In [10]:
# Keep only the gene part of each V-gene call by dropping everything from the
# first '*' onwards (the text after '*' is presumably the allele — confirm).
# `assign` builds a new frame instead of writing into one that was produced
# by boolean filtering, and the `.str` accessor leaves missing values as NaN
# instead of raising on them.
seq_df = seq_df.assign(**{
    gene_col: seq_df[gene_col].str.split('*').str[0]
    for gene_col in ('v_gene.H', 'v_gene.L')
})

Selecting based on similarity to known Abs

In [11]:
# Reference tables read from CSV; the first column of each file is the index.
# `sabdab` is filtered by antigen similarity in the selection loop further down.
sabdab= pd.read_csv('cleaned_SAbdab_seqs_24-01-11.csv', index_col=0)
# NOTE(review): `sabdab_meta` is not referenced in the cells visible here —
# confirm it is still needed.
sabdab_meta = pd.read_csv('sabdab_meta_data_24-01-11.csv', index_col=0)

def compare_to_sabdab(sabdab_seqs, seq):
    """Return the highest identity between ``seq`` and any reference sequence.

    Args:
        sabdab_seqs: pandas Series of reference sequences.
        seq: Query sequence that is compared against every reference.

    Returns:
        The maximum ``lev_identity(reference, seq)`` over all non-missing
        references, or NaN when there is no reference to compare against.
    """
    # Missing references cannot be scored, so leave them out.
    references = sabdab_seqs.dropna()
    if references.empty:
        # max() of an empty collection raises ValueError; report "no match"
        # as NaN instead.
        return float('nan')

    identities = references.apply(lambda reference: lev_identity(reference, seq))
    return max(identities)
    
# `training_seqs` is used below to exclude generated CDRH3s that already occur
# in it; `covabdab_rbd` is appended to the SAbDab reference set when the
# antigen is RBD. The first CSV column of each file is the index.
training_seqs = pd.read_csv('./annot_input_seqs_v1-24-03-12.csv', index_col=0)
covabdab_rbd = pd.read_csv('annotated_covabdab_rbd_24-01-30.csv', index_col=0)

In [12]:
# Antigen sequences keyed by antigen name. The selection loop compares each
# SAbDab antigen ('detagged_ant') against the entry for the current antigen.
antigen_seqs = dict(
    RBD='RVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNF',
)

In [20]:
# Annotate the MPV training antibodies with their heavy-chain CDR3 and the
# heavy/light V-gene calls (text from the first '*' onwards is dropped).
# NOTE(review): `mpv_training_seqs` is not defined in any cell visible here —
# confirm the cell that loads it runs before this one.

# Number every heavy chain once and reuse the Chain object for both derived
# columns, instead of running the numbering twice per sequence.
mpv_heavy_chains = [
    Chain(heavy_seq, scheme='imgt', allowed_species='human', assign_germline=True)
    for heavy_seq in mpv_training_seqs['HV_ab']
]
mpv_training_seqs['CDR3.H'] = [chain.cdr3_seq for chain in mpv_heavy_chains]
mpv_training_seqs['v_gene.H'] = [chain.v_gene.split('*')[0] for chain in mpv_heavy_chains]

mpv_light_chains = [
    Chain(light_seq, scheme='imgt', allowed_species='human', assign_germline=True)
    for light_seq in mpv_training_seqs['LV_ab']
]
mpv_training_seqs['v_gene.L'] = [chain.v_gene.split('*')[0] for chain in mpv_light_chains]

In [22]:
# Candidate selection. For every processed antigen, `top_similar_abs[antigen]`
# ends up as a dict of three DataFrames:
#   'rand'  - two sequences (20th / 80th percentile of 'v_identity.H') from
#             each of the five most frequent heavy V genes
#   'CDRH3' - the five remaining sequences with the highest CDRH3 identity to
#             the reference antibodies
#   'VH'    - the five remaining sequences with the highest VH identity to
#             the reference antibodies
top_similar_abs = {}

# Only RBD is processed here; the loop over every antigen is kept for reference.
# for antigen in antigen_seqs.keys():
for antigen in ['RBD']:

    # NOTE(review): these three dicts are never filled or read in the cells
    # visible here; kept only in case a later cell relies on the names.
    top_vh_ab_dict = {}
    top_cdr3_abs_dict = {}
    rand_selected_candidates = {}

    top_dict = {}

    ### Selecting 2 candidates (20th / 80th percentile of V identity) for the top 5 V genes
    selected_candidates_all_antigens = []

    # Select generated sequences for this antigen
    ab_df = seq_df[seq_df['Antigen'] == antigen]

    # Remove CDRH3 identical to training data
    ab_df = ab_df[~ab_df['CDR3.H'].isin(training_seqs['CDR3.H'])]

    # Remove germline; copy so the columns added below are written to an
    # independent frame rather than to a filtered slice of seq_df
    ab_df = ab_df[ab_df['v_identity.H'] < 1].copy()

    top5_vgenes = ab_df['v_gene.H'].value_counts().head(5).index

    for vgene in top5_vgenes:
        vgene_df = ab_df[ab_df['v_gene.H'] == vgene].sort_values(by='v_identity.H')

        # Subtracting 1 for zero indexing. Clamp at 0: for groups with fewer
        # than 5 rows the raw value is -1, which iloc would silently read as
        # "last row" (the highest identity, not the 20th percentile).
        idx20 = max(int(len(vgene_df) * 0.20) - 1, 0)
        idx80 = max(int(len(vgene_df) * 0.80) - 1, 0)

        # Drop a repeated position (tiny groups) while keeping the order, so
        # the same antibody is not selected twice.
        positions = list(dict.fromkeys([idx20, idx80]))

        selected_candidates_all_antigens.append(vgene_df.iloc[positions].index)

    selected_candidates_all_antigens = np.concatenate(selected_candidates_all_antigens)

    ### Selecting the top 5 by CDRH3 identity and the top 5 by VH identity to known binding sequences

    # Get matching antigens from SAbDab
    sabdab_seqs = sabdab.copy()
    sabdab_seqs['test_ID'] = sabdab['detagged_ant'].apply(lambda x: lev_identity(x, antigen_seqs[antigen]))
    # Copy so the annotation columns below are not assigned on a slice
    sabdab_seqs = sabdab_seqs[sabdab_seqs['test_ID'] > 0.90].copy()

    # Annotate SAbDab seqs for comparison; each chain is numbered once and the
    # Chain object is reused for every column derived from it
    heavy_chains = [
        Chain(heavy_seq, scheme='imgt', allowed_species='human', assign_germline=True)
        for heavy_seq in sabdab_seqs['HV_ab']
    ]
    sabdab_seqs['CDR3.H'] = [chain.cdr3_seq for chain in heavy_chains]
    sabdab_seqs['v_gene.H'] = [chain.v_gene.split('*')[0] for chain in heavy_chains]

    light_chains = [
        Chain(light_seq, scheme='imgt', allowed_species='human', assign_germline=True)
        for light_seq in sabdab_seqs['LV_ab']
    ]
    sabdab_seqs['v_gene.L'] = [chain.v_gene.split('*')[0] for chain in light_chains]

    # Extend the reference set with antigen-specific antibody tables
    if antigen == 'MPV':
        sabdab_seqs = pd.concat([sabdab_seqs, mpv_training_seqs])[['HV_ab', 'LV_ab', 'CDR3.H', 'v_gene.H', 'v_gene.L']]

    if antigen == 'RBD':
        sabdab_seqs = pd.concat([sabdab_seqs, covabdab_rbd])[['HV_ab', 'LV_ab', 'CDR3.H', 'v_gene.H', 'v_gene.L']]

    # Compare generated CDRH3s to SAbDab sequences, getting max identity for each generated sequence
    ab_df['CDRH3_sabdab_identity'] = ab_df['CDR3.H'].apply(lambda x: compare_to_sabdab(sabdab_seqs['CDR3.H'], x))
    # Compare generated VH / VL to SAbDab sequences, getting max identity for each generated sequence
    ab_df['VH_sabdab_identity'] = ab_df['VH'].apply(lambda x: compare_to_sabdab(sabdab_seqs['HV_ab'], x))
    ab_df['VL_sabdab_identity'] = ab_df['VL'].apply(lambda x: compare_to_sabdab(sabdab_seqs['LV_ab'], x))

    # Store the percentile picks, then remove them so we don't select redundant Abs
    top_dict['rand'] = ab_df.loc[selected_candidates_all_antigens]
    ab_df = ab_df.drop(selected_candidates_all_antigens)

    # Remove Abs that are highly similar to known sequences
    # (both CDRH3 and VH identity at or above 0.90)
    print(antigen)
    print(len(ab_df))
    ab_df = ab_df[~((ab_df['CDRH3_sabdab_identity'] >= 0.90) & (ab_df['VH_sabdab_identity'] >= 0.90))]
    print(len(ab_df))

    # Select top 5 antibodies by CDRH3 identity
    top_cdr3_abs = ab_df.sort_values(by='CDRH3_sabdab_identity', ascending=False).head(5).index
    top_dict['CDRH3'] = ab_df.loc[top_cdr3_abs]

    # Remove these so we don't select redundant Abs
    ab_df = ab_df.drop(top_cdr3_abs)

    # Select top 5 antibodies by VH identity
    top_vh_abs = ab_df.sort_values(by='VH_sabdab_identity', ascending=False).head(5).index

    top_dict['VH'] = ab_df.loc[top_vh_abs]
    top_similar_abs[antigen] = top_dict

RBD
722
711


In [24]:
from abnumber import Chain

def compare_to_df(seq_df, seq):
    """Return the highest identity between ``seq`` and any entry of ``seq_df``.

    Args:
        seq_df: pandas Series of sequences to compare against (the call sites
            pass a single DataFrame column).
        seq: Query sequence.

    Returns:
        The maximum ``lev_identity(entry, seq)`` over all non-missing entries,
        or NaN when there is no entry to compare against.
    """
    # Missing entries cannot be scored, so leave them out.
    references = seq_df.dropna()
    if references.empty:
        # max() of an empty collection raises ValueError; report NaN instead.
        return float('nan')

    identities = references.apply(lambda reference: lev_identity(reference, seq))
    return max(identities)

# Same as above, but return NaN if comparing to self
def compare_to_self(seq_df, seq):
    """Return the highest identity between ``seq`` and the *other* entries.

    Entries equal to ``seq`` are excluded first, so a sequence is never
    scored against itself (or against exact duplicates of itself).

    Args:
        seq_df: pandas Series of sequences that contains ``seq``.
        seq: Query sequence.

    Returns:
        The maximum ``lev_identity(entry, seq)`` over the remaining
        non-missing entries, or NaN when nothing other than ``seq`` is left.
    """
    others = seq_df[seq_df != seq].dropna()
    if others.empty:
        # Every entry was `seq` itself (or missing); max() of an empty
        # collection would raise ValueError, so report NaN instead.
        return float('nan')

    identities = others.apply(lambda other: lev_identity(other, seq))
    return max(identities)

In [26]:
# Keep only the training rows that have a CDR3.H value.
has_cdr3 = training_seqs['CDR3.H'].notna()
training_seqs = training_seqs.loc[has_cdr3]

In [47]:
# Annotate the selected candidates of every antigen with their maximum
# identity to the training set, to each other, and to CoV-AbDab, plus the
# light-chain type. The per-antigen frames are collected and concatenated so
# `candidate_df` holds all antigens, not just the last one processed (with a
# single antigen the result is the same frame as before).
# NOTE(review): `covabdab` (columns 'CDRH3', 'VHorVHH') is not defined in the
# cells visible here — only `covabdab_rbd` is; confirm it is loaded elsewhere.
candidate_frames = []

for antigen in top_similar_abs.keys():
    # Stack the per-strategy frames; the dict keys become the outer index level
    antigen_candidates = pd.concat(top_similar_abs[antigen])

    # Highest identity to any training sequence
    antigen_candidates['max_training_CDRH3_id'] = antigen_candidates['CDR3.H'].apply(lambda x: compare_to_df(training_seqs['CDR3.H'], x))
    antigen_candidates['max_training_VH_id'] = antigen_candidates['VH'].apply(lambda x: compare_to_df(training_seqs['VH_AA'], x))

    # Highest identity to any other selected candidate of the same antigen
    antigen_candidates['max_self_CDRH3_id'] = antigen_candidates['CDR3.H'].apply(lambda x: compare_to_self(antigen_candidates['CDR3.H'], x))
    antigen_candidates['max_self_VH_id'] = antigen_candidates['VH'].apply(lambda x: compare_to_self(antigen_candidates['VH'], x))

    # Highest identity to any CoV-AbDab entry
    antigen_candidates['all_cov_CDRH3_id'] = antigen_candidates['CDR3.H'].apply(lambda x: compare_to_df(covabdab['CDRH3'], x))
    antigen_candidates['all_cov_VH_id'] = antigen_candidates['VH'].apply(lambda x: compare_to_df(covabdab['VHorVHH'], x))

    # Text between 'IG' and the first following 'V' of the light V-gene name,
    # e.g. 'IGKV1-39' -> 'K'
    antigen_candidates['LC_type'] = antigen_candidates['v_gene.L'].apply(lambda x: x.split('IG')[1].split('V')[0])

    candidate_frames.append(antigen_candidates)

candidate_df = pd.concat(candidate_frames)

In [None]:
# Write candidate_df to a CSV file in the current working directory.
candidate_df.to_csv('selected_abs_for_testing.csv')