## Find atomic contacts between antibody and antigen pairs

We want to create a DataFrame that contains atomic contacts for all the structures in our cleaned summary file.

The desired result is a DataFrame with columns

- `pdb_id`, e.g. '9ds1'
- `ab_chain`, e.g. 'H' 
- `ab_chaintype`, 'heavy' or 'light'
- `ag_resnum`
- `ab_resnumi`, including icode, e.g. '52A'
- `ab_resname`
- `ab_atom`
- `ag_chain`, e.g. 'G'
- `ag_resnum`
- `ag_resnumi`, e.g. '13'
- `ag_resname`, e.g. 'TYR'
- `ag_atom`



We can accomplish this using our `atomic_contact_points` function from notebook 07. Copy this function from the previous notebook into a code cell.

- read the cleaned summary file into a pandas DataFrame
- for each row of the summary DataFrame
    - read the PDB file
    - find atomic contact points for Hchain
    - find atomic contact points for Lchain
    - add columns pbs_id, ab_chaintype ('heavy' and 'light')

and concatenate to a single DataFrame.

Import required libraries

In [1]:
import os.path
from Bio.PDB import PDBParser, NeighborSearch
import pandas as pd

In [52]:
def atomic_contact_points(ab_chain, ag_chain, distance):
    res = []
    ns = NeighborSearch(list(ag_chain.get_atoms()))
    for ab_atom in ab_chain.get_atoms():
        ab_res = ab_atom.get_parent()
        close_ag_atoms = ns.search(ab_atom.coord, distance)
        for ag_atom in close_ag_atoms:
            ag_res = ag_atom.get_parent()
            if ab_res.id[0] == ' ' and ag_res.id[0] == ' ':
                tmp = dict(ab_resnum = ab_res.id[1],
                           ab_resnumi = str(ab_res.id[1]) + ab_res.id[2],
                           ab_resname = ab_res.get_resname(),
                           ab_atom = ab_atom.id[0],
                           ag_resnum = ag_res.id[1],
                           ag_resnumi = str(ag_res.id[1]) + ag_res.id[2],
                           ag_resname = ag_res.get_resname(),
                           ag_atom = ag_atom.id[0])
                res.append(tmp)

    return pd.DataFrame(res)

In [56]:
def residue_occurrence(chain):
    results = []
    for res in chain.get_residues():
        if res.id[0] == ' ' and res.id[1] <= 128:
            tmp = dict(ab_resnum = str(res.id[1]) + res.id[2],
                   ab_resname = res.get_resname())
            results.append(tmp)

    return pd.DataFrame(results)

Define constants

In [53]:
SUMMARY_FILE = '../generated/preprocess/summary_pdb_clusters_deduplicated.tsv'
PDB_DIR = '../data/pdbs'


In [54]:
summary = pd.read_csv(SUMMARY_FILE, sep='\t')
summary.head()

Unnamed: 0,index,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,...,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid,Hcluster,Lcluster,duplicated
0,0,9ds2,G,I,0,D | C,protein | protein,NA | NA,hemagglutinin ha2 chain | hemagglutinin ha1 chai,VIRAL PROTEIN/IMMUNE SYSTEM,...,IGKV3,Kappa,,,,,,405.0,259.0,False
1,3,8uzp,H,L,0,M,protein,,stem_mimetic_01,IMMUNE SYSTEM,...,IGLV1,Lambda,,,,,,695.0,187.0,False
2,5,8veb,G,I,0,E,protein,,hemagglutinin,IMMUNE SYSTEM/VIRAL PROTEIN,...,IGKV1,Kappa,,,,,,126.0,723.0,False
3,8,8ved,H,L,0,A,protein,,hemagglutinin,IMMUNE SYSTEM/VIRAL PROTEIN,...,IGKV2,Kappa,,,,,,183.0,74.0,False
4,14,8vef,H,L,0,A,protein,,hemagglutinin,IMMUNE SYSTEM/VIRAL PROTEIN,...,IGKV1,Kappa,,,,,,127.0,412.0,False


In [57]:
contacts = pd.DataFrame()
residues = pd.DataFrame()

for i, row in summary.iterrows():
    pdb_id = row['pdb']
    hchain = row['Hchain']
    lchain = row['Lchain']
    antigen_chain = row['antigen_chain']

    try:
        if pd.isna(antigen_chain) or len(antigen_chain) > 1:
            continue

    
        filename = os.path.join(PDB_DIR, f'{pdb_id}_chothia.pdb')
        parser = PDBParser(PERMISSIVE=1)
        structure = parser.get_structure(pdb_id, filename)

        acph = atomic_contact_points(structure[0][hchain], structure[0][antigen_chain], 4.0)
        
        acph.insert(loc = 0, column = 'ag_chain', value = antigen_chain)
        acph.insert(loc = 0, column = 'ab_chain', value = hchain)
        acph.insert(loc = 0, column = 'chain_type', value = 'heavy')
        acph.insert(loc = 0, column = 'pdb_id', value = pdb_id)

        resh = residue_occurrence(structure[0][hchain])
        resh.insert(loc = 0, column = 'ab_chain', value = hchain)
        resh.insert(loc = 0, column = 'chain_type', value = 'heavy')
        resh.insert(loc = 0, column = 'pdb_id', value = pdb_id)


        acpl = atomic_contact_points(structure[0][lchain], structure[0][antigen_chain], 4.0)
        
        acpl.insert(loc = 0, column = 'ag_chain', value = antigen_chain)
        acpl.insert(loc = 0, column = 'ab_chain', value = lchain)
        acpl.insert(loc = 0, column = 'chain_type', value = 'light')
        acpl.insert(loc = 0, column = 'pdb_id', value = pdb_id)

        resl = residue_occurrence(structure[0][lchain])
        resl.insert(loc = 0, column = 'ab_chain', value = lchain)
        resl.insert(loc = 0, column = 'chain_type', value = 'light')
        resl.insert(loc = 0, column = 'pdb_id', value = pdb_id)

        residues = pd.concat([residues, resh, resl])
        contacts = pd.concat([contacts, acph, acpl])

    except Exception as e:
        print(e)
        print(row)
        continue




In [58]:
contacts.to_csv("../generated/contacts/atomic_contacts.tsv", sep='\t', index = False)
contacts.head()

Unnamed: 0,pdb_id,chain_type,ab_chain,ag_chain,ab_resnum,ab_resnumi,ab_resname,ab_atom,ag_resnum,ag_resnumi,ag_resname,ag_atom
0,8uzp,heavy,H,M,28.0,28,THR,C,95.0,95,ASN,O
1,8uzp,heavy,H,M,28.0,28,THR,C,95.0,95,ASN,N
2,8uzp,heavy,H,M,28.0,28,THR,O,95.0,95,ASN,O
3,8uzp,heavy,H,M,28.0,28,THR,O,95.0,95,ASN,N
4,8uzp,heavy,H,M,28.0,28,THR,O,95.0,95,ASN,C


In [59]:
residues.to_csv("../generated/contacts/residues.tsv", sep='\t', index = False)
residues.head()

Unnamed: 0,pdb_id,chain_type,ab_chain,ab_resnum,ab_resname
0,8uzp,heavy,H,1,GLN
1,8uzp,heavy,H,2,VAL
2,8uzp,heavy,H,3,GLN
3,8uzp,heavy,H,4,LEU
4,8uzp,heavy,H,5,VAL
