## Find atomic contacts between antibody and antigen pairs

We want to create a DataFrame that contains atomic contacts for all the structures in our cleaned summary file.

The desired result is a DataFrame with columns

- `pdb_id`, e.g. '9ds1'
- `ab_chain`, e.g. 'H' 
- `ab_chaintype`, 'heavy' or 'light'
- `ag_resnum`
- `ab_resnumi`, including icode, e.g. '52A'
- `ab_resname`
- `ab_atom`
- `ag_chain`, e.g. 'G'
- `ag_resnum`
- `ag_resnumi`, e.g. '13'
- `ag_resname`, e.g. 'TYR'
- `ag_atom`



We can accomplish this using our `atomic_contact_points` function from notebook 07. Copy this function from the previous notebook into a code cell.

- read the cleaned summary file into a pandas DataFrame
- for each row of the summary DataFrame
    - read the PDB file
    - find atomic contact points for Hchain
    - find atomic contact points for Lchain
    - add columns pbs_id, ab_chaintype ('heavy' and 'light')

and concatenate to a single DataFrame.

Import required libraries

In [74]:
import os.path
from Bio.PDB import PDBParser, NeighborSearch
import pandas as pd

In [75]:
def atomic_contact_points(ab_chain, ag_chain, distance):
    res = []
    ns = NeighborSearch(list(ag_chain.get_atoms()))
    for ab_atom in ab_chain.get_atoms():
        ab_res = ab_atom.get_parent()
        close_ag_atoms = ns.search(ab_atom.coord, distance)
        for ag_atom in close_ag_atoms:
            ag_res = ag_atom.get_parent()
            if ab_res.id[0] == ' ' and ag_res.id[0] == ' ':
                tmp = dict(ab_resnum = ab_res.id[1],
                           ab_icode = ab_res.id[2],
                           ab_resname = ab_res.get_resname(),
                           ab_atom = ab_atom.id[0],
                           ag_resnum = ag_res.id[1],
                           ag_icode = ag_res.id[2],
                           ag_resname = ag_res.get_resname(),
                           ag_atom = ag_atom.id[0])
                res.append(tmp)

    return pd.DataFrame(res)

In [76]:
def residue_occurrence(chain):
    results = []
    for res in chain.get_residues():
        if res.id[0] == ' ' and res.id[1] <= 128:
            tmp = dict(ab_resnum = res.id[1],
                       ab_icode = res.id[2],
                       ab_resname = res.get_resname())
            results.append(tmp)

    return pd.DataFrame(results)

Define constants

In [77]:
SUMMARY_FILE = '../generated/preprocess/summary_pdb_clusters_deduplicated.tsv'
PDB_DIR = '../data/pdbs'


In [78]:
summary = pd.read_csv(SUMMARY_FILE, sep='\t')
summary.head()

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,...,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid,Hcluster,Lcluster,duplicated
0,3hi6,H,L,0,A,protein,,integrin alpha-l,cell adhesion/immune system,05/19/09,...,IGKV1,Kappa,4.7e-06,-7.26,SPR,,19805116.0,856,219,False
1,2ny3,D,C,0,A,protein,,envelope glycoprotein gp120,VIRAL PROTEIN/IMMUNE SYSTEM,11/20/06,...,IGKV3,Kappa,8.15e-07,,SPR,,17301785.0,146,188,False
2,3bdy,H,L,0,V,protein,,vascular endothelial growth factor a,HORMONE,11/15/07,...,IGKV1,Kappa,3e-07,-8.89,SPR,,19299620.0,555,130,False
3,3mac,H,L,0,A,protein,,transmembrane glycoprotein,IMMUNE SYSTEM,03/23/10,...,IGLV3,Lambda,2e-07,-9.23,ITC,28.0,21085615.0,982,318,False
4,2wuc,H,L,0,A,protein,,hepatocyte growth factor activator long chain,HYDROLASE/HYDROLASE INHIBITOR,10/01/09,...,IGKV1,Kappa,1.61e-07,-9.26,SPR,25.0,20004165.0,1116,147,False


In [79]:
import warnings
import sys
from Bio.PDB.PDBExceptions import PDBConstructionWarning
warnings.simplefilter("error", PDBConstructionWarning)


contacts = pd.DataFrame()
residues = pd.DataFrame()

for i, row in summary.iterrows():
    pdb_id = row['pdb']
    if pdb_id == '7mtb':
        continue
    hchain = row['Hchain']
    lchain = row['Lchain']
    antigen_chain = row['antigen_chain']

    try:
        if pd.isna(antigen_chain) or len(antigen_chain) > 1:
            continue

    
        filename = os.path.join(PDB_DIR, f'{pdb_id}_chothia.pdb')
        parser = PDBParser(PERMISSIVE=1)
        structure = parser.get_structure(pdb_id, filename)

        acph = atomic_contact_points(structure[0][hchain], structure[0][antigen_chain], 4.0)
        
        acph.insert(loc = 0, column = 'ag_chain', value = antigen_chain)
        acph.insert(loc = 0, column = 'ab_chain', value = hchain)
        acph.insert(loc = 0, column = 'chain_type', value = 'heavy')
        acph.insert(loc = 0, column = 'pdb_id', value = pdb_id)

        resh = residue_occurrence(structure[0][hchain])
        resh.insert(loc = 0, column = 'ab_chain', value = hchain)
        resh.insert(loc = 0, column = 'chain_type', value = 'heavy')
        resh.insert(loc = 0, column = 'pdb_id', value = pdb_id)


        acpl = atomic_contact_points(structure[0][lchain], structure[0][antigen_chain], 4.0)
        
        acpl.insert(loc = 0, column = 'ag_chain', value = antigen_chain)
        acpl.insert(loc = 0, column = 'ab_chain', value = lchain)
        acpl.insert(loc = 0, column = 'chain_type', value = 'light')
        acpl.insert(loc = 0, column = 'pdb_id', value = pdb_id)

        resl = residue_occurrence(structure[0][lchain])
        resl.insert(loc = 0, column = 'ab_chain', value = lchain)
        resl.insert(loc = 0, column = 'chain_type', value = 'light')
        resl.insert(loc = 0, column = 'pdb_id', value = pdb_id)

        residues = pd.concat([residues, resh, resl])
        contacts = pd.concat([contacts, acph, acpl])

    except Exception as e:
        print(e)
        print(row)
        sys.exit(1)




In [80]:
contacts.to_csv("../generated/contacts/atomic_contacts.tsv", sep='\t', index = False)
contacts.head()

Unnamed: 0,pdb_id,chain_type,ab_chain,ag_chain,ab_resnum,ab_icode,ab_resname,ab_atom,ag_resnum,ag_icode,ag_resname,ag_atom
0,3hi6,heavy,H,A,30.0,,SER,O,269.0,,GLU,C
1,3hi6,heavy,H,A,30.0,,SER,O,269.0,,GLU,O
2,3hi6,heavy,H,A,31.0,,ARG,C,241.0,,GLU,O
3,3hi6,heavy,H,A,31.0,,ARG,C,241.0,,GLU,O
4,3hi6,heavy,H,A,31.0,,ARG,C,241.0,,GLU,O


In [81]:
residues.to_csv("../generated/contacts/residues.tsv", sep='\t', index = False)
residues.head()

Unnamed: 0,pdb_id,chain_type,ab_chain,ab_resnum,ab_icode,ab_resname
0,3hi6,heavy,H,1,,GLU
1,3hi6,heavy,H,2,,VAL
2,3hi6,heavy,H,3,,GLN
3,3hi6,heavy,H,4,,LEU
4,3hi6,heavy,H,5,,LEU
