# Unlocking denovo antibody design
* https://www.biorxiv.org/content/10.1101/2023.01.08.523187v4.full.pdf

References for Trastuzamab:
* https://www.genome.jp/entry/D03257
* https://www.rcsb.org/structure/5xhg



In [37]:
import pandas as pd
import numpy as np
# from abnumber import Chain

In [38]:
trast_hc = 'EVQLVESGGGLVQPGGSLRLSCAASGFNIKDTYIHWVRQAPGKGLEWVARIYPTNGYTRYADSVKGRFTISADTSKNTAYLQMNSLRAEDTAVYYCSRWGGDGFYAMDYWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSCDKTH'
trast_lc = 'DIQMTQSPSSLSASVGDRVTITCRASQDVNTAVAWYQQKPGKAPKLLIYSASFLYSGVPSRFSGSRSGTDFTLTISSLQPEDFATYYCQQHYTTPPTFGQGTKVEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC'
her2 = 'TQVCTGTDMKLRLPASPETHLDMLRHLYQGCQVVQGNLELTYLPTNASLSFLQDIQEVQGYVLIAHNQVRQVPLQRLRIVRGTQLFEDNYALAVLDNGDPLNNTTPVTGASPGGLRELQLRSLTEILKGGVLIQRNPQLCYQDTILWKDIFHKNNQLALTLIDTNRSRACHPCSPMCKGSRCWGESSEDCQSLTRTVCAGGCARCKGPLPTDCCHEQCAAGCTGPKHSDCLACLHFNHSGICELHCPALVTYNTDTFESMPNPEGRYTFGASCVTACPYNYLSTDVGSCTLVCPLHNQEVTAEDGTQRCEKCSKPCARVCYGLGMEHLREVRAVTSANIQEFAGCKKIFGSLAFLPESFDGDPASNTAPLQPEQLQVFETLEEITGYLYISAWPDSLPDLSVFQNLQVIRGRILHNGAYSLTLQGLGISWLGLRSLRELGSGLALIHHNTHLCFVHTVPWDQLFRNPHQALLHTANRPEDECVGEGLACHQLCARGHCWGPGPTQCVNCSQFLRGQECVEECRVLQGLPREYVNARHCLPCHPECQPQNGSVTCFGPEADQCVACAHYKDPPFCVARCPSGVKPDLSYMPIWKFPDEEGACQPCPINCTHSCVDLDDKGCPAEQRASPLT'

In [39]:
binders = pd.read_csv('./zero-shot-binders_denovo.csv', index_col = 0)

In [41]:
binders = binders[binders['KD (nM)'] < 100]
binders.shape

(337, 7)

In [42]:
# On GitHub, they provide the CDR sequence for Trastuzumab:
hcdr3 = 'SRWGGDGFYAMDY'

In [43]:
# Since they only provide CDRH3s in the csv file, I am loading in Trastuzumab (the template antibody) to get the rest of the sequence
binders = binders.reset_index()
binders['HC'] = trast_hc
binders['LC'] = trast_lc

In [44]:
binders['HC'] = binders.apply(lambda x: x['HC'].replace(hcdr3, x['HCDR3']), axis=1)

In [31]:
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, fcluster
import Levenshtein as lev

def levenshtein_distance(u, v):
    return lev.distance(u[0], v[0])

# def levenshtein_distance(str1: str, str2: str) -> float:
#     return Lev.distance(str1, str2)  / max(len(str1), len(str2))

def hierarchical_clustering(strings, max_distance):
    """Perform hierarchical clustering on strings based on Levenshtein distance."""
    # Convert the list of strings to a 2D NumPy array of object type for compatibility with pdist
    strings_array = np.array(strings, dtype=object).reshape(-1, 1)
    
    # Compute the condensed Levenshtein distance matrix
    condensed_distance_matrix = pdist(strings_array, levenshtein_distance)
    
    # Perform hierarchical clustering
    Z = linkage(condensed_distance_matrix, 'complete')
    
    # Form clusters based on the specified max Levenshtein distance
    clusters = fcluster(Z, max_distance, criterion='distance')
        
    return  pd.DataFrame({'CDRH3': strings, 'Cluster': clusters})

In [32]:
# Clustering the CDRH3s to remove highly similar antibodies
max_distance = 2  # Maximum Levenshtein distance within a cluster
clone_df = hierarchical_clustering(binders['HCDR3'], max_distance)

In [33]:
binders['Clone ID'] = clone_df['Cluster']

Maximum edit distance of 2 within clusters

In [34]:
strongest_binders_by_clone = binders.groupby(['Clone ID']).apply(lambda x: x['KD (nM)'].idxmin()).values
subset_binders = binders.loc[strongest_binders_by_clone]
subset_binders.index = 'absci:' + subset_binders.index.astype(str)

In [35]:
seq_df = subset_binders[['HC', 'LC']]
seq_df.columns = ['VH_AA', 'VL_AA']
seq_df['antigen_seq'] = her2 
seq_df['antigen_name'] = 'HER2'
seq_df['source'] = 'Absci_denovo'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seq_df['antigen_seq'] = her2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seq_df['antigen_name'] = 'HER2'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seq_df['source'] = 'Absci_denovo'


In [46]:
seq_df.to_csv('absci_denovo-HER2_binders_cleaned_24-03-05.csv')