In [678]:
import pandas as pd
import numpy as np

# CoV-AbDab filtering criteria
* Selected length(VH) and length(VL) > 100 AA
* Removed non-human origins 
* For antibody clones with identical VH genes and CDRH3s, randomly selected one entry
* For RBD-binding antibodies, only selected neutralizing
* If >3 variants listed, randomly selected 3
* Included any NTD binders

Need to make sure that held-out test data does not include our original training data

In [679]:
# Empty container; NOTE(review): nothing in the visible part of this notebook reads or fills it — remove if unused.
processed_data = {}

In [680]:
# Load the database export and label every row "<Name>:<row position>" so labels stay unique
covabdab = pd.read_csv('./CoV-AbDab_080224.csv')
row_position = covabdab.index.astype(str)
covabdab.index = covabdab['Name'] + ':' + row_position
print(covabdab.shape)

(12918, 23)


In [681]:
# Basic filtering: rows flagged 'Ab' that carry an epitope annotation
covabdab = covabdab[covabdab['Ab or Nb'] == 'Ab']
covabdab = covabdab[covabdab['Protein + Epitope'].notna()]

# Both chain sequences must be present before their lengths can be measured
covabdab = covabdab[covabdab['VHorVHH'].notna()]
covabdab = covabdab[covabdab['VL'].notna()]

# Keep only sequences longer than 100 characters on both chains
covabdab = covabdab[covabdab['VHorVHH'].map(len) > 100]
covabdab = covabdab[covabdab['VL'].map(len) > 100]

# The origin filter applied later needs a non-missing 'Origin'
covabdab = covabdab[covabdab['Origin'].notna()]

covabdab.shape

(10508, 23)

In [682]:
# Exact 'Origin' strings to exclude. Rows whose 'Origin' equals one of these values are removed
# by the isin() filter applied to covabdab later in the notebook. Matching is by exact string, so
# an origin spelled differently in another database export will NOT be caught by this list.
drop_origins = ['TBC', 'Transgenic Mice (Xenomouse)','Phage Display (scFv, chicken, immune - CoV1)',
 'B-cells; Transgenic Mouse (VelocImmune)','Chimeric (Human V, Mouse C)', 'Transgenic Mouse','Transgenic Mouse (Alloy GK)',
 'Phage Display (Ab, human, semisynthetic)', 'Hybridoma from Transgenic Mouse (VelocImmune)',
 'Phage Display (Humanised sdAbs, immune - CoV2)', 'Phage Library Engineered from SARS-CoV-1 binder', 'Phage Display (single-domain, human, non-immune)',
 'Humanised from Immunised Mouse', 'mRNA Display Library', 'Immunised Rhesus Macaques', 'Immunised mouse (RenMab)', 'Transgenic Mouse (H2L2)', 'Humanised from Mouse',
 'Immunised Camelids + Engineering', 'Unknown', 'Computational engineering from another binder', 'Transgenic Mice (VelocImmune)', 'Bispecific Therapeutic',
 'Engineered from CR3025', 'Engineered from CR3026', 'Engineered from CR3027','Engineered from CR3028', 'Engineered from CR3029', 'Engineered from CR3030', 'Engineered from Phage Display (antibody, non-immune)',
 'Engineered from CR3046', 'Humanised from R58', 'Phage Display Library (scFv, naive, non-immune)', 'Phage Display (Antibody, human, single-chain)', 'Immunised Mouse (H2L2)', 'Engineered from D27',
 'Humanised Mouse', 'Immunised Mouse (Balb/c)', 'Engineered from CR3024', 'Engineered from ADI-56046', 'Engineered from ADG-2', 'Transgenic Mice (HuMab)', 'Engineered from ADI-55688', 'Immunised mouse (TC-mAb)',
 'Immunised Humanised Mouse', 'Engineered from ADI-55689', 'Immunised Mouse', 'Immunised Transgenic Mouse', 'Engineered from CR3045', 'Engineered from CR3044', 'Engineered from CR3043', 'Engineered from CR3042',
 'Engineered from CR3041', 'Engineered from CR3023', 'Engineered from CR3040', 'Engineered from CR3039', 'Engineered from CR3037', 'Engineered from CR3036', 'Engineered from CR3035', 'Engineered from CR3034',
 'Engineered from CR3033', 'Engineered from CR3032', 'Engineered from CR3031', 'Engineered from CR3022', 'Computational Design', 'Engineered from CR3038']

In [683]:
# Keep rows whose heavy-chain V gene annotation contains the literal text 'Human'.
# str.contains(..., na=False) counts a missing annotation as "not human"; the previous
# `'Human' in x` lambda raised TypeError as soon as one value was NaN.
covabdab = covabdab[covabdab['Heavy V Gene'].str.contains('Human', regex=False, na=False)]

In [684]:
# Human antibodies only for now: yeast display is left in, fully engineered origins are removed
print(covabdab.shape)
from_dropped_origin = covabdab['Origin'].isin(drop_origins)
covabdab = covabdab[~from_dropped_origin]
print(covabdab.shape)

(10282, 23)
(10042, 23)


Clustering to limit the number of highly similar antibodies included

In [685]:
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, fcluster
import Levenshtein as lev

def levenshtein_distance(u, v):
    """Levenshtein distance between the first elements of two pdist rows.

    pdist passes one row per observation; hierarchical_clustering reshapes its
    input to a single column, so each row wraps exactly one string.
    """
    left, right = u[0], v[0]
    return lev.distance(left, right)

# def levenshtein_distance(str1: str, str2: str) -> float:
#     return Lev.distance(str1, str2)  / max(len(str1), len(str2))

def hierarchical_clustering(strings, max_distance):
    """Perform hierarchical clustering on strings based on Levenshtein distance."""
    # Convert the list of strings to a 2D NumPy array of object type for compatibility with pdist
    strings_array = np.array(strings, dtype=object).reshape(-1, 1)
    
    # Compute the condensed Levenshtein distance matrix
    condensed_distance_matrix = pdist(strings_array, levenshtein_distance)
    
    # Perform hierarchical clustering
    Z = linkage(condensed_distance_matrix, 'complete')
    
    # Form clusters based on the specified max Levenshtein distance
    clusters = fcluster(Z, max_distance, criterion='distance')
        
    return  pd.DataFrame({'CDRH3': strings, 'Cluster': clusters})

# NOTE(review): with a threshold of 0 only strings at distance 0 from each other (identical CDRH3s)
# can share a cluster, yet pdist still compares every pair of sequences. Grouping on the CDRH3
# string directly would presumably give the same partition much faster — confirm before changing,
# since cluster labels would be renumbered.
max_distance = 0  # Maximum Levenshtein distance within a cluster
clone_df = hierarchical_clustering(covabdab['CDRH3'], max_distance)

In [686]:
# Shuffle reproducibly so that the row kept per group below is a random one
shuffled_clones = clone_df.sample(frac=1, random_state=42)
shuffled_clones = shuffled_clones.assign(VH_gene=covabdab['Heavy V Gene'])

# One row per (V gene, CDRH3 cluster): clones sharing a CDRH3 but using different V genes all survive
clone_df = shuffled_clones.drop_duplicates(subset=['VH_gene', 'Cluster'], keep='first')

In [687]:
# Spot-check: same CDRH3, different V genes, so all rows are kept.
# The label 6430 is hard-coded from this run; fcluster labels may differ for other input data.
clone_df[clone_df['Cluster'] == 6430] # Looking at a cluster

Unnamed: 0,CDRH3,Cluster,VH_gene
BD55-1451:3082,ARDLVVYGMDV,6430,IGHV3-53 (Human)
ADI-75716:2727,ARDLVVYGMDV,6430,IGHV3-66 (Human)
P2C-1F11:9794,ARDLVVYGMDV,6430,IGHV3-11 (Human)


In [688]:
# Sanity check: the label-based .loc lookup into covabdab that follows relies on unique row labels
clone_df.index.is_unique

True

In [689]:
# Restrict the main table to the rows that survived clone de-duplication
representative_ids = clone_df.index
covabdab = covabdab.loc[representative_ids]
covabdab.shape

(9326, 23)

### Variant sequences from Toma

In [690]:
# Variant record name -> sequence lookups for the RBD and NTD regions.
# dict() over the loaded array requires it to iterate as (key, value) pairs —
# presumably an (n, 2) string array; TODO confirm against the .npy files.
variant_rbd_list = dict(np.load('./RBD_SA.npy'))
variant_ntd_list = dict(np.load('./NTD_SA.npy'))

In [691]:
# [i for i in list(variant_list.keys()) if 'BA.4' in i]

In [692]:
### SARS-2_WT RBD
# https://www.rcsb.org/structure/6m0j
wt_rbd = 'RVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNF'

In [693]:
# Database antigen name -> record key in variant_rbd_list.
# 'BA4/5' and 'BA5' intentionally point at the same record.
_rbd_record_by_name = {
    'SARS-CoV2_Alpha': 'Alpha_B.1.1.7_Alpha_baseline__EPI_ISL_1000001',
    'SARS-CoV2_Beta': 'Beta_B.1.351_Beta_baseline__EPI_ISL_1005538',
    'SARS-CoV2_Delta': 'Delta_B.1.617.2_Delta_baseline__EPI_ISL_10004745',
    'SARS-CoV2_Gamma': 'Gamma_P.1_Gamma_baseline__EPI_ISL_1000993',
    'SARS-CoV2_Omicron-BA1': 'Omicron_BA.1_baseline__EPI_ISL_10000028',
    'SARS-CoV2_Omicron-BA2': 'Omicron_BA.2_Omicron_baseline__EPI_ISL_10000005',
    'SARS-CoV2_Omicron-BA3': 'Omicron_BA.3_baseline__EPI_ISL_10007901',
    'SARS-CoV2_Omicron-BA4/5': 'Omicron_BA.4_or_BA.5_baseline__EPI_ISL_11207535',
    'SARS-CoV2_Omicron-BA5': 'Omicron_BA.4_or_BA.5_baseline__EPI_ISL_11207535',
}

# Antigen name -> RBD sequence; the wild type comes from the literal defined above
ant_dict = {name: variant_rbd_list[record] for name, record in _rbd_record_by_name.items()}
ant_dict['SARS-CoV2_WT'] = wt_rbd

# https://www.uniprot.org/uniprotkb/P59594/entry#sequences
cov1_spike = 'MFIFLLFLTLTSGSDLDRCTTFDDVQAPNYTQHTSSMRGVYYPDEIFRSDTLYLTQDLFLPFYSNVTGFHTINHTFGNPVIPFKDGIYFAATEKSNVVRGWVFGSTMNNKSQSVIIINNSTNVVIRACNFELCDNPFFAVSKPMGTQTHTMIFDNAFNCTFEYISDAFSLDVSEKSGNFKHLREFVFKNKDGFLYVYKGYQPIDVVRDLPSGFNTLKPIFKLPLGINITNFRAILTAFSPAQDIWGTSAAAYFVGYLKPTTFMLKYDENGTITDAVDCSQNPLAELKCSVKSFEIDKGIYQTSNFRVVPSGDVVRFPNITNLCPFGEVFNATKFPSVYAWERKKISNCVADYSVLYNSTFFSTFKCYGVSATKLNDLCFSNVYADSFVVKGDDVRQIAPGQTGVIADYNYKLPDDFMGCVLAWNTRNIDATSTGNYNYKYRYLRHGKLRPFERDISNVPFSPDGKPCTPPALNCYWPLNDYGFYTTTGIGYQPYRVVVLSFELLNAPATVCGPKLSTDLIKNQCVNFNFNGLTGTGVLTPSSKRFQPFQQFGRDVSDFTDSVRDPKTSEILDISPCSFGGVSVITPGTNASSEVAVLYQDVNCTDVSTAIHADQLTPAWRIYSTGNNVFQTQAGCLIGAEHVDTSYECDIPIGAGICASYHTVSLLRSTSQKSIVAYTMSLGADSSIAYSNNTIAIPTNFSISITTEVMPVSMAKTSVDCNMYICGDSTECANLLLQYGSFCTQLNRALSGIAAEQDRNTREVFAQVKQMYKTPTLKYFGGFNFSQILPDPLKPTKRSFIEDLLFNKVTLADAGFMKQYGECLGDINARDLICAQKFNGLTVLPPLLTDDMIAAYTAALVSGTATAGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKQIANQFNKAISQIQESLTTTSTALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQAAPHGVVFLHVTYVPSQERNFTTAPAICHEGKAYFPREGVFVFNGTSWFITQRNFFSPQIITTDNTFVSGNCDVVIGIINNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYVWLGFIAGLIAIVMVTILLCCMTSCCSCLKGACSCGSCCKFDEDDSEPVLKGVKLHYT'

# NOTE(review): cov1_spike is the full-length spike entry cited above, whereas the SARS-CoV-2
# entries of ant_dict come from the RBD lookup — so RBD binders paired with 'SARS-CoV1' get a
# whole-spike antigen sequence. Confirm this is intended or trim to the RBD region.
ant_dict['SARS-CoV1'] = cov1_spike

# Omicron sub-lineages taken from the same RBD lookup as the baseline variants
ant_dict['SARS-CoV2_Omicron-BA2.12.1'] = variant_rbd_list['Omicron_BA.2.12.1__BA.2_add[L452Q.S704L]__EPI_ISL_10783322']
ant_dict['SARS-CoV2_Omicron-BA1.1'] = variant_rbd_list['Omicron_BA.1.1__BA.1_add[R346K]__EPI_ISL_10000001']

# https://www.rcsb.org/structure/8ASY
# NOTE(review): the C-terminal tail (...LNDIFEAQKIEWHE) looks like an expression/biotinylation tag
# from the crystallised construct rather than native RBD sequence — confirm and trim if so.
ant_dict['SARS-CoV2_Omicron-BA2.75'] = 'TNLCPFHEVFNATRFASVYAWNRKRISNCVADYSVLYNFAPFFAFKCYGVSPTKLNDLCFTNVYADSFVIRGNEVSQIAPGQTGNIADYNYKLPDDFTGCVIAWNSNKLDSKVSGNYNYLYRLFRKSKLKPFERDISTEIYQAGNKPCNGVAGFNCYFPLQSYGFRPTYGVGHQPYRVVVLSFELLHAPATVCGKKSLLNDIFEAQKIEWHE'

# https://www.uniprot.org/uniprotkb/A0A6G6A1M4/entry
ant_dict['Pangolin-GD'] = 'MFVFLFVLPLVSSQCVNLTTRTGIPPGYTNSSTRGVYYPDKVFRSSILHLTQDLFLPFFSNVTWFNTINYQGGFKKFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDARTQSLLIVNNATNVVIKVCEFQFCTDPFLGVYYHNNNKTWVENEFRVYSSANNCTFEYISQPFLMDLEGKQGNFKNLREFVFKNVDGYFKIYSKHTPIDLVRDLPRGFAALEPLVDLPIGINITRFQTLLALHRSYLTPGKLESGWTTGAAAYYVGYLQQRTFLLSYNQNGTITDAVDCSLDPLSETKCTLKSLTVEKGIYQTSNFRVQPTISIVRFPNITNLCPFGEVFNASKFASVYAWNRKRISNCVADYSVLYNSTSFSTFKCYGVSPTKLNDLCFTNVYADSFVVKGDEVRQIAPGQTGVIADYNYKLPDDFTGCVIAWNSVKQDALTGGNYGYLYRLFRKSKLKPFERDISTEIYQAGSTPCNGQVGLNCYYPLERYGFHPTTGVNYQPFRVVVLSFELLNGPATVCGPKLSTTLVKDKCVNFNFNGLTGTGVLTTSKKQFLPFQQFGRDISDTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPMAIHAEQLTPAWRVYSAGANVFQTRAGCLVGAEHVNNSYECDIPVGAGICASYHSMSSFRSVNQRSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSIECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHEGKAHFPREGVFVSNGTHWFITQRNFYEPQIITTDNTFVSGSCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIIMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT'

# https://www.uniprot.org/uniprotkb/A0A6B9WHD3/entry#sequences
ant_dict['RaTG13'] = 'MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSSTRGVYYPDKVFRSSVLHLTQDLFLPFFSNVTWFHAIHVSGTNGIKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPPGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTDSIVRFPNITNLCPFGEVFNATTFASVYAWNRKRISNCVADYSVLYNSTSFSTFKCYGVSPTKLNDLCFTNVYADSFVITGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSKHIDAKEGGNFNYLYRLFRKANLKPFERDISTEIYQAGSKPCNGQTGLNCYYPLYRYGFYPTDGVGHQPYRVVVLSFELLNAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNASNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSRSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGSCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIIMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT'
# 'RatG13' is a second spelling of the same antigen name; alias the 'RaTG13' entry
# instead of repeating the sequence literal so the two can never drift apart.
ant_dict['RatG13'] = ant_dict['RaTG13']

In [694]:
# Number of antigen names that can be mapped to a sequence
len(ant_dict)

17

### Getting binding against RBD

In [696]:
# RBD set: rows with a 'Neutralising Vs' entry whose epitope annotation is exactly 'S; RBD'
covabdab_binders = (
    covabdab.copy()
    .loc[lambda df: df['Neutralising Vs'].notna()]
    .loc[lambda df: df['Protein + Epitope'] == 'S; RBD']
)
print(covabdab_binders.shape)

(4513, 23)


In [697]:
# Long format: one row per (antibody, antigen listed in 'Neutralising Vs')
covabdab_binders = (
    covabdab_binders['Neutralising Vs']
    .str.split(';', expand=True)
    .stack()
    .to_frame('antigen_name')
)
# First index level is the antibody label; keep it as a regular column too
covabdab_binders['ID'] = covabdab_binders.index.get_level_values(0)

# Entries whose text contains 'weak' are discarded
weak_entry = covabdab_binders['antigen_name'].str.contains('weak', regex=False)
covabdab_binders = covabdab_binders[~weak_entry]
print(covabdab_binders.shape)

(12211, 2)


In [698]:
# Attach the antigen sequence for each name; names missing from ant_dict map to NaN and are dropped
covabdab_binders = covabdab_binders.assign(
    antigen_seq=covabdab_binders['antigen_name'].map(ant_dict)
).dropna()
covabdab_binders.shape

(10797, 3)

In [699]:
# Antibodies with fewer than three mapped antigens are set aside whole (no subsampling needed)
antigens_per_id = covabdab_binders['ID'].value_counts()
few_antigen_ids = antigens_per_id[antigens_per_id < 3].index
two_antigen_df2 = covabdab_binders.loc[few_antigen_ids]
print(two_antigen_df2.shape)

# What remains has at least three antigens per antibody
covabdab_binders.drop(two_antigen_df2.index, inplace=True)
print(covabdab_binders.shape)

(2728, 3)
(8069, 3)


In [700]:
# Randomly sample three antigens per antibody so that no single antibody is trained on many times.
# This is not perfect, but it limits the number of redundant antibodies.
sampled_names = covabdab_binders.groupby('ID')['antigen_name'].sample(n=3, random_state=42)
subsampled_n3 = sampled_names.index

subsampled_rbd_binders = covabdab_binders.loc[subsampled_n3]
subsampled_rbd_binders.shape

(4533, 3)

In [701]:
# Drop the second index level (position within the split list) so rows are labelled by antibody only
two_antigen_df2 = two_antigen_df2.droplevel(1)
subsampled_rbd_binders = subsampled_rbd_binders.droplevel(1)

In [702]:
# Final RBD set: antibodies with fewer than three antigens (kept whole) plus the three-antigen subsamples
rbd_binders = pd.concat([two_antigen_df2, subsampled_rbd_binders])

In [703]:
# Index-aligned assignment: each (repeated) antibody label looks up its epitope in covabdab
rbd_binders['Epitope'] = covabdab['Protein + Epitope']

In [704]:
# Sanity check: only the RBD epitope label should appear
rbd_binders['Epitope'].value_counts()

S; RBD    7261
Name: Epitope, dtype: int64

In [705]:
# Copy heavy/light chain sequences from covabdab, matched on the antibody label.
# NOTE(review): assigning a DataFrame to differently named columns has varied between pandas
# versions; the no-NaN assert on the combined table later on guards against a silent misalignment.
rbd_binders[['VH_AA', 'VL_AA']] = covabdab[['VHorVHH', 'VL']]
rbd_binders.shape

(7261, 6)

### Need to get NTD now

In [706]:
# Database antigen name -> record key in variant_ntd_list.
# 'BA4/5' and 'BA5' intentionally point at the same record.
_ntd_record_by_name = {
    'SARS-CoV2_Alpha': 'Alpha_B.1.1.7_Alpha_baseline__EPI_ISL_1000001',
    'SARS-CoV2_Beta': 'Beta_B.1.351_Beta_baseline__EPI_ISL_1005538',
    'SARS-CoV2_Delta': 'Delta_B.1.617.2_Delta_baseline__EPI_ISL_10004745',
    'SARS-CoV2_Gamma': 'Gamma_P.1_Gamma_baseline__EPI_ISL_1000993',
    'SARS-CoV2_Omicron-BA1': 'Omicron_BA.1_baseline__EPI_ISL_10000028',
    'SARS-CoV2_Omicron-BA2': 'Omicron_BA.2_Omicron_baseline__EPI_ISL_10000005',
    'SARS-CoV2_Omicron-BA3': 'Omicron_BA.3_baseline__EPI_ISL_10007901',
    'SARS-CoV2_Omicron-BA4/5': 'Omicron_BA.4_or_BA.5_baseline__EPI_ISL_11207535',
    'SARS-CoV2_Omicron-BA5': 'Omicron_BA.4_or_BA.5_baseline__EPI_ISL_11207535',
    'SARS-CoV2_WT': 'NC_045512_spike_surface_glycoprotein',
}

# Antigen name -> NTD sequence
ntd_ant_dict = {name: variant_ntd_list[record] for name, record in _ntd_record_by_name.items()}

In [707]:
# NTD set: rows with a 'Binds to' entry whose epitope annotation is exactly 'S; NTD'
covabdab_binders = (
    covabdab.copy()
    .loc[lambda df: df['Binds to'].notna()]
    .loc[lambda df: df['Protein + Epitope'] == 'S; NTD']
)
print(covabdab_binders.shape)

(546, 23)


In [708]:
# Long format: one row per (antibody, antigen listed in 'Binds to')
covabdab_binders = (
    covabdab_binders['Binds to']
    .str.split(';', expand=True)
    .stack()
    .to_frame('antigen_name')
)
# First index level is the antibody label; keep it as a regular column too
covabdab_binders['ID'] = covabdab_binders.index.get_level_values(0)

# Entries whose text contains 'weak' are discarded
weak_entry = covabdab_binders['antigen_name'].str.contains('weak', regex=False)
covabdab_binders = covabdab_binders[~weak_entry]
print(covabdab_binders.shape)

(1190, 2)


In [709]:
# Attach the NTD sequence for each name; names missing from ntd_ant_dict map to NaN and are dropped
covabdab_binders = covabdab_binders.assign(
    antigen_seq=covabdab_binders['antigen_name'].map(ntd_ant_dict)
).dropna()
covabdab_binders.shape

(1112, 3)

In [710]:
# Antibodies with fewer than three mapped NTD antigens are set aside whole (no subsampling needed)
antigens_per_id = covabdab_binders['ID'].value_counts()
few_antigen_ids = antigens_per_id[antigens_per_id < 3].index
two_antigen_df2 = covabdab_binders.loc[few_antigen_ids]
# Label the set-aside rows by antibody only
two_antigen_df2 = two_antigen_df2.droplevel(1)
print(two_antigen_df2.shape)

# Remove those antibodies from the pool that will be subsampled
covabdab_binders.drop(two_antigen_df2.index, inplace=True)
print(covabdab_binders.shape)

(443, 3)
(669, 3)


In [711]:
# Randomly sample three antigens per antibody so that no single antibody is trained on many times.
# This is not perfect, but it limits the number of redundant antibodies.
sampled_names = covabdab_binders.groupby('ID')['antigen_name'].sample(n=3, random_state=42)
subsampled_n3 = sampled_names.index

# Select the sampled rows, then label them by antibody only
subsampled_ntd_binders = covabdab_binders.loc[subsampled_n3].droplevel(1)
subsampled_ntd_binders.shape

(531, 3)

In [712]:
# Final NTD set: antibodies with fewer than three antigens (kept whole) plus the three-antigen subsamples
ntd_binders = pd.concat([two_antigen_df2, subsampled_ntd_binders])
ntd_binders.shape

(974, 3)

In [713]:
# Index-aligned lookup of each antibody's epitope; only the NTD label should appear in the counts
ntd_binders['Epitope'] = covabdab['Protein + Epitope']
ntd_binders['Epitope'].value_counts()

S; NTD    974
Name: Epitope, dtype: int64

In [714]:
# Copy heavy/light chain sequences from covabdab, matched on the antibody label
ntd_binders[['VH_AA', 'VL_AA']] = covabdab[['VHorVHH', 'VL']]
ntd_binders.shape

(974, 6)

### Getting full spike binders

In [715]:
# Full-spike set: rows with a 'Binds to' entry whose epitope is 'S; Unk' or 'S; non-RBD'
covabdab_binders = (
    covabdab.copy()
    .loc[lambda df: df['Binds to'].notna()]
    .loc[lambda df: df['Protein + Epitope'].isin(['S; Unk', 'S; non-RBD'])]
)
print(covabdab_binders.shape)

(2033, 23)


In [716]:
# Long format: one row per (antibody, antigen listed in 'Binds to')
covabdab_binders = (
    covabdab_binders['Binds to']
    .str.split(';', expand=True)
    .stack()
    .to_frame('antigen_name')
)
# First index level is the antibody label; keep it as a regular column too
covabdab_binders['ID'] = covabdab_binders.index.get_level_values(0)

# Entries whose text contains 'weak' are discarded
weak_entry = covabdab_binders['antigen_name'].str.contains('weak', regex=False)
covabdab_binders = covabdab_binders[~weak_entry]
print(covabdab_binders.shape)

(2146, 2)


In [None]:
sars_cov2_spike = 'MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSEPVLKGVKLHYT'
# Antigen name -> full-length spike sequence; only these two names survive the mapping step that uses this dict.
# NOTE(review): this cell shows no execution count (In [None]) although a later cell reads spike_dict —
# restart the kernel and run all cells to confirm the notebook does not rely on stale state.
spike_dict = {'SARS-CoV2_WT':sars_cov2_spike, 'SARS-CoV1':cov1_spike}

In [717]:
# Attach the spike sequence for each name; names missing from spike_dict map to NaN and are dropped
covabdab_binders = covabdab_binders.assign(
    antigen_seq=covabdab_binders['antigen_name'].map(spike_dict)
).dropna()
covabdab_binders.shape

(1881, 3)

In [719]:
# No subsampling here: spike_dict holds two antigens, so an antibody contributes at most two rows
full_spike_binders = covabdab_binders.copy()
# Label rows by antibody only, then pull epitope and chain sequences from covabdab by that label
full_spike_binders.index = full_spike_binders.index.droplevel(1)
full_spike_binders['Epitope'] = covabdab['Protein + Epitope']
full_spike_binders[['VH_AA', 'VL_AA']] = covabdab[['VHorVHH', 'VL']]
full_spike_binders.shape

(1881, 6)

### Bringing together all data

In [720]:
# Stack the three sets; they share the same six columns
all_data = pd.concat([rbd_binders, ntd_binders, full_spike_binders])

In [721]:
# Every cell must be populated: a NaN here would mean an index-aligned lookup above failed to match
assert not all_data.isna().to_numpy().any()

In [722]:
# Row label format: covabdab:<antigen name>:<epitope>:<antibody label>
all_data.index = 'covabdab:' +  all_data['antigen_name'] + ':' +  all_data['Epitope'] + ':' +  all_data.index

In [723]:
# Sanity check: each (antigen, epitope, antibody) combination should occur once
all_data.index.is_unique

True

In [724]:
# Different antigen names can map to the same sequence (the BA4/5 and BA5 entries share one record),
# so de-duplicate on the sequence rather than the name; the first occurrence per antibody is kept.
all_data = all_data.drop_duplicates(subset=['ID', 'antigen_seq'])

In [725]:
# Copies the (antigen, epitope) counts to the system clipboard for pasting elsewhere.
# NOTE(review): needs a clipboard backend, so this cell may fail on a headless machine.
all_data[['antigen_name', 'Epitope']].value_counts().to_clipboard()

In [726]:
# Final table size and a preview of the first rows
print(all_data.shape)
all_data.head()

(10043, 6)


Unnamed: 0,antigen_name,ID,antigen_seq,Epitope,VH_AA,VL_AA
covabdab:SARS-CoV2_WT:S; RBD:TAU-2310:2364,SARS-CoV2_WT,TAU-2310:2364,RVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVAD...,S; RBD,EVQLLESGGGLVQPGGSLRLSCAASGFTFSNYVMSWVRQAPGKGLE...,QSALTQPASVSGSPGQSITISCTGTSSDVGGYDYVSWYQQHPGKAP...
covabdab:SARS-CoV2_Alpha:S; RBD:TAU-2310:2364,SARS-CoV2_Alpha,TAU-2310:2364,NITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFST...,S; RBD,EVQLLESGGGLVQPGGSLRLSCAASGFTFSNYVMSWVRQAPGKGLE...,QSALTQPASVSGSPGQSITISCTGTSSDVGGYDYVSWYQQHPGKAP...
covabdab:SARS-CoV2_WT:S; RBD:BD56-557:3861,SARS-CoV2_WT,BD56-557:3861,RVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVAD...,S; RBD,QITLKESGPTLVKPTQTLTLTCTFSGFSLNTRGVCVGWIRQPPGKA...,SYELTQPPSVSVSPGQTARITCSGHAFPNQYAYWYQQKPGQAPVLV...
covabdab:SARS-CoV1:S; RBD:BD56-557:3861,SARS-CoV1,BD56-557:3861,MFIFLLFLTLTSGSDLDRCTTFDDVQAPNYTQHTSSMRGVYYPDEI...,S; RBD,QITLKESGPTLVKPTQTLTLTCTFSGFSLNTRGVCVGWIRQPPGKA...,SYELTQPPSVSVSPGQTARITCSGHAFPNQYAYWYQQKPGQAPVLV...
covabdab:SARS-CoV2_WT:S; RBD:R616-1F10:10096,SARS-CoV2_WT,R616-1F10:10096,RVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVAD...,S; RBD,QVQLVESGGGVVQPGRSLRLSCAASRFPFSTYGMHWARRAPGKGLE...,QSVLTQPPSASATPGQRVTISCSGSSSNIGSNPVNWYQQLPGTAPK...


In [727]:
# all_data.to_csv('covabdab_input_seqs_v2_24-03-12.csv')