# Scripts for preparing metatdata for PDBBind-Opt and BioLiP2-Opt Dataset

In this script, we will create CSV files containing relevant metadata that will be used for the PDBBind-Opt workflow to process structures.

In [1]:
import pandas as pd
from tqdm import tqdm
import re
import requests
import math

from rdkit import Chem

In [2]:
def get_smiles_from_rcsb(comp_id: str):
    """
    Query ligand SMILES from RCSB

    Parameters
    ----------
    comp_id: str
        The ligand ID, usually a three-letter code
    
    Returns
    -------
    smi: str
        The SMILES of the query ligand. If fail to get, will return a vacant string
    """
    query = '''{chem_comp(comp_id: "%s") {
        rcsb_chem_comp_descriptor {
        SMILES_stereo SMILES InChI
        }
    }
    }''' % comp_id
    query = re.sub(r'\s+', ' ', query)
    try:
        res = requests.get('https://data.rcsb.org/graphql?query=' + query)
        smi = res.json()['data']['chem_comp']['rcsb_chem_comp_descriptor']['SMILES_stereo']
        if smi is None:
            smi = res.json()['data']['chem_comp']['rcsb_chem_comp_descriptor']['SMILES']
        if smi is None:
            m = Chem.MolFromInchi(res.json()['data']['chem_comp']['rcsb_chem_comp_descriptor']['InChI'])
            smi = Chem.MolToSmiles(m)
        assert smi is not None, "No reference smiles"
        return smi
    except:
        return ""


def regularize_binding_data(typ, sign, number, unit):
    
    # handle number that have uncertainty
    if '+-' in number:
        number = number.split('+-')[0]
    number = float(number)
    # handle sign
    sign = sign[1] + sign[0] if sign in ['=>', '=<'] else sign
    # convert Ka/Kb to Kd
    typ = typ.lower()
    if typ == 'ka' or typ == 'kb':
        typ = 'kd'
        assert unit.endswith('^-1'), f'Incorrect unit for Ka/Kb: {unit}'
        unit = unit.rstrip('^-1')
        number = 1 / number
    
    if unit == 'M':
        lognum = math.log10(number)
    elif unit == 'mM':
        lognum = math.log10(number) - 3
    elif unit == 'uM':
        lognum = math.log10(number) - 6
    elif unit == 'nM':
        lognum = math.log10(number) - 9
    elif unit == 'pM':
        lognum = math.log10(number) - 12
    elif unit == 'fM':
        lognum = math.log10(number) - 15
    else:
        lognum = None

    return {
        "measurement": typ,
        "sign": sign,
        "value": number,
        "unit": unit,
        "logvalue": lognum
    }

## Parse Original PDBBind Data

In [3]:

def parse_pdbbind_metadata(index='../raw/index/INDEX_general_PL.2020'):
    data = []
    with open(index) as f:
        for line in f:
            if line.startswith('#'):
                continue
            if line:
                content = line.strip().split()
                if not content[6].endswith(')'):
                    ligand = content[6][1:]
                else:
                    ligand = content[6][1:-1]
                    
                data.append({
                    "PDBID": content[0],
                    "Resolution": content[1],
                    "Year": content[2],
                    "Binding Affinity": content[3],
                    "Ligand": ligand.lstrip('_'),
                    "Note": ' '.join(content[7:])
                })
    data = pd.DataFrame(data)
    return data

pdbbind_data = parse_pdbbind_metadata('../raw_data_pdbbind/index/INDEX_general_PL.2020').set_index("PDBID").sort_index()
pdbbind_ids = pdbbind_data.index.unique()
print("Number of data in PDBBind v2020:", len(pdbbind_ids))
pdbbind_data

Number of data in PDBBind v2020: 19443


Unnamed: 0_level_0,Resolution,Year,Binding Affinity,Ligand,Note
PDBID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10gs,2.20,1998,Ki=0.4uM,VWW,
11gs,2.30,1999,Ki=1.5uM,2-mer,redundant to 3gss
13gs,1.90,1999,Ki=24uM,SAS,
16pk,1.60,1998,Ki=6uM,BIS,X-ray(Trypanosoma brucei PGK) while Ki(yeast PGK)
184l,1.80,1995,Kd=19uM,I4B,ligand is compound 39
...,...,...,...,...,...
966c,1.90,1999,Ki=23nM,RS2,
9abp,1.97,1992,Kd=0.01uM,GLA/GLB,isomer
9hvp,2.80,1992,Ki=4.5nM,0E9,
9icd,2.50,1991,Kd=125uM,NAP,


## Parse BioLiP

In [4]:
columns = [
    'PDBID',
    'Receptor chain',
    'Resolution',
    'Binding site',
    'Ligand CCD',
    'Ligand chain',
    'Ligand serial number',
    'Binding site residues',
    'Binding site residues renumbered',
    'Catalytic site residues',
    'Catalytic site residues renumbered',
    'EC number',
    'GO terms',
    'Binding affinity (manual)',
    'Binding affinity (Binding MOAD)',
    'Binding affinity (PDBbind-CN)',
    'Binding affinity (Binding DB)',
    'UniProt ID',
    'PubMed ID',
    'Ligand residue sequence number',
    'Receptor sequence'
]
binding_cols = [
    'Binding affinity (manual)',
    'Binding affinity (Binding MOAD)',
    'Binding affinity (PDBbind-CN)',
    'Binding affinity (Binding DB)',
]
ions = [
    'MN', 'MG', 'ZN', 'NA', 'CO', 'CA', 'CU', 'NI', 'FE', 
    'HG', 'CE', 'AG', 'CD', 'CL', 'BR', 'F', 'XE', 'KR', 'AR',
    'K', 'LA', 'BA', 'SB', 'TL', 'CS', 'SR', 'AU', 'YB', 'GA', 'CR',
    'PD', 'MO', 'SE', 'LU', 'SM', 'PB', 'EU', 'PT', 'TB', 'RH', 'LI',
    'RB', 'RU', 'DY', 'RE', 'PR', 'OS', 'V', 'IR', 'ND', 'AL'
    'O', 'OH'
]
raw_df = pd.read_csv('BioLiP.txt', sep='\t', names=columns, low_memory=False, keep_default_na=False, na_values=[None, ""])

## Prepare PDBBind-Opt

In [5]:
biolip_pdbbind = raw_df.query('PDBID in @pdbbind_ids')
biolip_pdbbind_dict = {pdbid: subdf for pdbid, subdf in biolip_pdbbind.groupby("PDBID")}

datas = {
    "sm": [],
    "poly": []
}
patt = re.compile(r"([a-zA-Z50]+)([~<>=]+)([\d.eE+-]+)([^\s,]+)")
for pdbid, row in tqdm(list(pdbbind_data.iterrows())):
    # determine if small molecule or polymers
    ligand_ccd = row['Ligand']
    if bool(re.search(r'[^a-zA-Z0-9]', ligand_ccd)):
        category = 'poly'
    else:
        category = 'sm'

    # parse binding data
    binding_string = str(row['Binding Affinity'])
    binding_data = regularize_binding_data(*tuple(re.findall(patt, binding_string))[0])
    binding_data['source'] = 'PDBBind'
    binding_data['origin'] = binding_string
    
    tmp = []
    if not pdbid in biolip_pdbbind_dict:
        tmp.append({"PDBID": pdbid, "Ligand CCD": ligand_ccd, 'Ligand chain': None, 'Ligand residue sequence number': None})
    else:
        biolip_record = biolip_pdbbind_dict[pdbid]
        for chain, subdf in biolip_record.groupby('Ligand chain'):
            if category == 'sm':
                for _, row in subdf.iterrows():
                    if row['Ligand CCD'] == ligand_ccd:
                        tmp.append({
                            "PDBID": pdbid, "Ligand CCD": ligand_ccd, 
                            'Ligand chain': row['Ligand chain'], 
                            'Ligand residue sequence number': row['Ligand residue sequence number'].replace(' ', '')
                        })
                        break # some records have the same ligand in two rows, only record one (because the ligand binds to two protein chains)
            else:
                record = {'PDBID': pdbid, 'Ligand chain': chain, 'Ligand CCD': [], 'Ligand residue sequence number': []}
                for _, row in subdf.iterrows():
                    if row['Ligand CCD'] in ['peptide', 'dna', 'rna']:
                        record.update({'Ligand CCD': row['Ligand CCD'], 'Ligand residue sequence number': row['Ligand residue sequence number']})
                        break
                    if row['Ligand CCD'] in ions:
                        continue
                    if all(str(row[col]) == 'nan' for col in binding_cols):
                        continue
                    record['Ligand CCD'].append(row['Ligand CCD'])
                    record['Ligand residue sequence number'].append(row['Ligand residue sequence number'])
                
                if isinstance(record['Ligand CCD'], list):
                    record['Ligand CCD'] = '-'.join(record['Ligand CCD'])
                if isinstance(record['Ligand residue sequence number'], list):
                    record['Ligand residue sequence number'] = '-'.join(record['Ligand residue sequence number'])
                
                record['Ligand residue sequence number'] = record['Ligand residue sequence number'].replace(' ', '')
                record['Ligand residue sequence number'] = record['Ligand residue sequence number'].replace('~', '-')
                tmp.append(record)

    for t in tmp:
        t.update(binding_data)           
        datas[category].append(t)

100%|██████████| 19443/19443 [00:16<00:00, 1192.89it/s]


In [6]:
for category in datas:
    datas[category] = pd.DataFrame(datas[category])
    datas[category].to_csv(f'PDBBind_{category}.csv', index=None)

## Prepare BioLiP2-Opt

In [7]:
print(f"BioLiP2: {raw_df['PDBID'].unique().shape[0]}")

biolip_sm = raw_df[~raw_df['Ligand CCD'].isin(['peptide', 'dna', 'rna'])]
biolip_poly = raw_df[raw_df['Ligand CCD'].isin(['peptide', 'dna', 'rna'])]
print(f"BioLiP2 with small molecule: {biolip_sm['PDBID'].unique().shape[0]}")

# Find entries with binding data
binding_df = biolip_sm.dropna(
    subset=[
        'Binding affinity (manual)', 'Binding affinity (Binding MOAD)', 'Binding affinity (PDBbind-CN)', 'Binding affinity (Binding DB)'
    ],
    how='all'
)
biolip_ids = binding_df["PDBID"].unique()
print(f"BioLiP2 with binding:", len(biolip_ids))

all_ids = set(biolip_ids).union(pdbbind_ids)
print('Total PDBIDs:', len(all_ids))

BioLiP2: 143054
BioLiP2 with small molecule: 128638
BioLiP2 with binding: 23606
Total PDBIDs: 26542


In [8]:
ions = [
    'MN', 'MG', 'ZN', 'NA', 'CO', 'CA', 'CU', 'NI', 'FE', 
    'HG', 'CE', 'AG', 'CD', 'CL', 'BR', 'F', 'XE', 'KR', 'AR',
    'K', 'LA', 'BA', 'SB', 'TL', 'CS', 'SR', 'AU', 'YB', 'GA', 'CR',
    'PD', 'MO', 'SE', 'LU', 'SM', 'PB', 'EU', 'PT', 'TB', 'RH', 'LI',
    'RB', 'RU', 'DY', 'RE', 'PR', 'OS', 'V', 'IR', 'ND', 'AL'
    'O', 'OH'
]
polymers = ['peptide', 'dna', 'rna']
select_df = raw_df.query('(PDBID in @biolip_ids) & (PDBID not in @pdbbind_ids) & (`Ligand CCD` not in @polymers) & (`Ligand CCD` not in @ions)')

# Select entries with only one CCD
one_sm_filter = []
for pdbid, subdf in select_df.groupby('PDBID'):
    if len(subdf['Ligand CCD'].unique()) == 1:
        one_sm_filter.append(pdbid)

select_df = select_df.query('PDBID in @one_sm_filter')

# Select entries with num_heavy_atoms >= 6
num_heavy_atoms_filter = []
for pdbid, subdf in tqdm(list(select_df.groupby('PDBID'))):
    ccds = subdf["Ligand CCD"].unique()
    if len(ccds) > 1:
        continue
    smi = get_smiles_from_rcsb(ccds[0])
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        continue
    num_heavy_atoms = mol.GetNumHeavyAtoms()
    if num_heavy_atoms < 6:
        continue
    num_heavy_atoms_filter.append(pdbid)
select_df = select_df.query('PDBID in @num_heavy_atoms_filter')

  9%|▉         | 368/3937 [00:15<02:28, 24.04it/s][18:37:17] Explicit valence for atom # 0 B, 4, is greater than permitted
 18%|█▊        | 704/3937 [00:29<02:14, 23.97it/s][18:37:31] Explicit valence for atom # 2 C, 5, is greater than permitted
 23%|██▎       | 911/3937 [00:38<02:05, 24.16it/s][18:37:40] Explicit valence for atom # 0 B, 4, is greater than permitted
 27%|██▋       | 1058/3937 [00:44<01:59, 24.06it/s][18:37:46] Explicit valence for atom # 13 Ga, 7, is greater than permitted
 41%|████      | 1598/3937 [01:07<01:37, 24.04it/s][18:38:09] Explicit valence for atom # 0 B, 5, is greater than permitted
[18:38:09] Explicit valence for atom # 0 B, 5, is greater than permitted
[18:38:09] Explicit valence for atom # 0 B, 4, is greater than permitted
 49%|████▉     | 1937/3937 [01:21<01:24, 23.58it/s][18:38:23] Explicit valence for atom # 2 C, 5, is greater than permitted
 51%|█████▏    | 2018/3937 [01:24<01:22, 23.37it/s][18:38:27] Explicit valence for atom # 0 B, 5, is greater th

In [9]:
def process_binding_data(row):
    cols = [
        'Binding affinity (Binding MOAD)',
        'Binding affinity (manual)',
        'Binding affinity (PDBbind-CN)',
        'Binding affinity (Binding DB)',
    ]
    sources = ['MOAD', 'BioLiP', "PDBBind", "BindingDB"]
    type_rank = {
        "ic50": 0,
        "ki": 1,
        "kd": 2,
    }
    patt = re.compile(r"([a-zA-Z50]+)([~<>=]+)([\d.eE+-]+)([^\s,]+)")
    binding_data = None
    for col, src in zip(cols, sources):
        string = str(row[col])
        if string == 'nan':
            continue
        for match in re.findall(patt, string):
            tmp = regularize_binding_data(*match)
            if tmp['measurement'] not in ['kd', 'ki', 'ic50']:
                continue
            if binding_data is None:
                binding_data = tmp
            elif (type_rank[tmp['measurement']] > type_rank[binding_data['measurement']]):
                binding_data = tmp
        if binding_data:
            binding_data['source'] = src
            binding_data['origin'] = string
            break 
    return binding_data


# filter out only ec50
biolip_binding_data = {}
for pdbid, subdf in select_df.groupby('PDBID'):
    for _, row in subdf.iterrows():
        binding_data = process_binding_data(row)
        if binding_data:
            break
    if binding_data:
        biolip_binding_data[pdbid] = binding_data

pdbid_with_binding = list(biolip_binding_data.keys())
select_df = select_df.query('PDBID in @pdbid_with_binding')

In [19]:
non_redundant = {}
for index, row in select_df.iterrows():
    sig = (row['PDBID'], row["Ligand chain"], row['Ligand CCD'], row['Ligand residue sequence number'])
    if sig not in non_redundant:
        non_redundant[sig] = index

biolip_df = select_df.loc[non_redundant.values(), ['PDBID', 'Ligand chain', 'Ligand CCD', 'Ligand residue sequence number']]
records = []
for record in biolip_df.to_dict('records'):
    record.update(biolip_binding_data[record['PDBID']])
    records.append(record)
biolip_df = pd.DataFrame(records)

# BioLiP has some non-valid binding data
biolip_df = biolip_df.query('logvalue < 3')
biolip_df['Ligand residue sequence number'] = biolip_df['Ligand residue sequence number'].apply(lambda x: x.rstrip())
biolip_df.to_csv('BioLiP_bind_sm.csv', index=None)