# 02. BindingDB Preprocessing

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

Download the November 2024 release of BindingDB and unzip it into `../data`: https://www.bindingdb.org/bind/downloads/BindingDB_All_202411_tsv.zip

In [2]:
# Fetch the BindingDB TSV dump and extract it next to the notebook's data.
# NOTE: the archive is re-downloaded on every run of this cell.
# Create the data directory if it does not exist yet (-p: no error if it does).
!mkdir -p ../data
# Download the zipped release to a fixed path (-O sets the output file).
!wget https://www.bindingdb.org/bind/downloads/BindingDB_All_202411_tsv.zip -O ../data/BindingDB_All_202411_tsv.zip
# Extract into ../data (-d), overwriting existing files without prompting (-o).
!unzip -o ../data/BindingDB_All_202411_tsv.zip -d ../data

--2024-11-12 15:55:31--  https://www.bindingdb.org/bind/downloads/BindingDB_All_202411_tsv.zip
Resolving www.bindingdb.org (www.bindingdb.org)... 137.110.139.247
Connecting to www.bindingdb.org (www.bindingdb.org)|137.110.139.247|:443... connected.
HTTP request sent, awaiting response... 200 200
Length: 495751730 (473M) [application/zip]
Saving to: ‘../data/BindingDB_All_202411_tsv.zip’


2024-11-12 15:57:51 (3.38 MB/s) - ‘../data/BindingDB_All_202411_tsv.zip’ saved [495751730/495751730]

Archive:  ../data/BindingDB_All_202411_tsv.zip
  inflating: ../data/BindingDB_All.tsv  


In [3]:
# Locations of the data directory and of the extracted BindingDB dump,
# both relative to the notebook's working directory.
DATA_DIR = Path('..') / 'data'
PATH_TO_BDB = DATA_DIR.joinpath('BindingDB_All.tsv')

In [4]:
# Columns of the BindingDB dump that the rest of the notebook uses:
# the ligand, the four measurement types, the target identifiers/sequence,
# and the chain count.
BDB_USECOLS = [
    'Ligand SMILES',
    'Ki (nM)',
    'IC50 (nM)',
    'Kd (nM)',
    'EC50 (nM)',
    'UniProt (SwissProt) Primary ID of Target Chain',
    'UniProt (TrEMBL) Primary ID of Target Chain',
    'BindingDB Target Chain Sequence',
    'Number of Protein Chains in Target (>1 implies a multichain complex)',
]

bdb = pd.read_csv(
    PATH_TO_BDB,
    sep='\t',
    usecols=BDB_USECOLS,
    on_bad_lines='skip',  # malformed rows are dropped instead of raising
    low_memory=False,     # read the file in one pass rather than in chunks
)

In [5]:
# Keep only rows whose target is a single protein chain (chain count == 1).
chain_count = bdb['Number of Protein Chains in Target (>1 implies a multichain complex)']
bdb = bdb.loc[chain_count.eq(1)]

In [6]:
# Map the verbose BindingDB headers to short snake_case column names.
# The chain-count column is intentionally left unrenamed here.
short_names = {
    'Ligand SMILES': 'smiles',
    'Ki (nM)': 'ki',
    'IC50 (nM)': 'ic50',
    'Kd (nM)': 'kd',
    'EC50 (nM)': 'ec50',
    'UniProt (SwissProt) Primary ID of Target Chain': 'uniprot_swissprot',
    'UniProt (TrEMBL) Primary ID of Target Chain': 'uniprot_trembl',
    'BindingDB Target Chain Sequence': 'sequence',
}
bdb = bdb.rename(columns=short_names)
bdb

Unnamed: 0,smiles,ki,ic50,kd,ec50,Number of Protein Chains in Target (>1 implies a multichain complex),sequence,uniprot_swissprot,uniprot_trembl
0,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CCCCCC(O)=...,0.24,,,,1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,P03367,
1,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,0.25,,,,1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,P03367,
2,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,0.41,,,,1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,P03367,
3,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,0.8,,,,1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,P03367,
4,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,0.99,,,,1,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,P03367,
...,...,...,...,...,...,...,...,...,...
2927604,CC[C@H]1CN(Cc2cc(C)cc(CC(O)=O)c2)CCN1c1nc2ccc(...,,,,670,1,MGETLGDSPVDPEHGAFADALPMSTSQEITMVDTEMPFWPTNFGIS...,P37238,
2927605,CC[C@@H]1CN(Cc2cc(C)cc(CC(O)=O)c2)CCN1c1nc2ccc...,,,,40,1,MGETLGDSPVDPEHGAFADALPMSTSQEITMVDTEMPFWPTNFGIS...,P37238,
2927606,CC(C)[C@@H]1CN(Cc2cc(C)cc(CC(O)=O)c2)CCN1c1nc2...,,,,230,1,MGETLGDSPVDPEHGAFADALPMSTSQEITMVDTEMPFWPTNFGIS...,P37238,
2927607,COc1ccc(cc1)N(C)c1nc(C)nc2[nH]ccc12,,2600,,,1,CVSASPSTLARLVSRSAMPAGSSTAWNTAFSPMARCQVTKTIGGGD...,,Q862L2


In [7]:
def _split_measurement(val):
    """Split a raw measurement cell into ``(sign, magnitude)``.

    Args:
        val: A cell value from one of the measurement columns. Typically a
            string such as ``'0.24'``, ``'>10000'`` or ``' <5'``, a float
            NaN for a missing value, or an already-numeric value.

    Returns:
        ``(None, None)`` for missing input (``None``, NaN, empty or
        whitespace-only string); otherwise ``(sign, magnitude)`` where
        ``sign`` is ``'<'``, ``'>'`` or ``'='`` and ``magnitude`` is the
        still-unparsed remainder (a string, or the original float).
    """
    # NaN is the only float that is not equal to itself.
    if val is None or (isinstance(val, float) and val != val):
        return None, None
    if isinstance(val, float):
        # A genuine number (e.g. when the whole column was inferred as
        # numeric) is an exact measurement, not a missing value.
        return '=', val
    text = str(val).strip()  # tolerate padding around the value
    if not text:
        return None, None
    if text[0] in ('<', '>'):
        return text[0], text[1:]
    return '=', text


def extract_sign(val):
    """Return the inequality qualifier of a measurement cell.

    Args:
        val: Raw cell value (string, number, NaN or ``None``).

    Returns:
        ``'<'`` or ``'>'`` if the value carries that prefix, ``'='`` for a
        plain value, and ``None`` when the cell is missing or blank.
    """
    return _split_measurement(val)[0]


def extract_val(val):
    """Return the numeric magnitude of a measurement cell.

    Args:
        val: Raw cell value (string, number, NaN or ``None``).

    Returns:
        The value as a ``float`` with any leading ``'<'``/``'>'`` removed,
        or ``None`` when the cell is missing or blank.

    Raises:
        ValueError: If the remaining text cannot be parsed as a float.
    """
    magnitude = _split_measurement(val)[1]
    if magnitude is None:
        return None
    return float(magnitude)

In [8]:
# For each measurement type, record the qualifier ('<', '>', '=') in a
# companion `<name>_sign` column first, then overwrite the raw column with
# its numeric value. The sign must be read before the column is replaced.
for measure in ('ki', 'ic50', 'kd', 'ec50'):
    raw_values = bdb[measure]
    bdb[f'{measure}_sign'] = raw_values.map(extract_sign)
    bdb[measure] = raw_values.map(extract_val)

In [9]:
def get_uniprot_id(row):
    """Pick a single UniProt identifier for a row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``'uniprot_swissprot'`` and ``'uniprot_trembl'``.

    Returns:
        The SwissProt identifier when it is present, otherwise whatever is
        stored under ``'uniprot_trembl'`` (which may itself be missing).
    """
    swissprot_id = row['uniprot_swissprot']
    if pd.isna(swissprot_id):
        # No SwissProt entry: fall back to the TrEMBL identifier.
        return row['uniprot_trembl']
    return swissprot_id

In [10]:
# Drop rows that have neither a SwissProt nor a TrEMBL identifier, and
# renumber the remaining rows from 0.
bdb = bdb.dropna(subset=['uniprot_swissprot', 'uniprot_trembl'], how='all').reset_index(drop=True)

# Single identifier per row: SwissProt when present, TrEMBL otherwise.
# This is the vectorised equivalent of `bdb.apply(get_uniprot_id, axis=1)`;
# it avoids a Python-level call per row and also behaves on an empty frame.
bdb['uniprot_id'] = bdb['uniprot_swissprot'].fillna(bdb['uniprot_trembl'])

# The two source identifier columns and the chain count (already used for
# filtering) are no longer needed. Dropping columns leaves the row index
# untouched, so no further reset_index is required.
bdb = bdb.drop(columns=[
    'uniprot_swissprot',
    'uniprot_trembl',
    'Number of Protein Chains in Target (>1 implies a multichain complex)',
])

# Tag every row with the dataset it came from.
bdb['source'] = 'bdb'
bdb

Unnamed: 0,smiles,ki,ic50,kd,ec50,sequence,ki_sign,ic50_sign,kd_sign,ec50_sign,uniprot_id,source
0,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CCCCCC(O)=...,0.24,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
1,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,0.25,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
2,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,0.41,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
3,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,0.80,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
4,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,0.99,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
...,...,...,...,...,...,...,...,...,...,...,...,...
2759438,CC[C@H]1CN(Cc2cc(C)cc(CC(O)=O)c2)CCN1c1nc2ccc(...,,,,670.0,MGETLGDSPVDPEHGAFADALPMSTSQEITMVDTEMPFWPTNFGIS...,,,,=,P37238,bdb
2759439,CC[C@@H]1CN(Cc2cc(C)cc(CC(O)=O)c2)CCN1c1nc2ccc...,,,,40.0,MGETLGDSPVDPEHGAFADALPMSTSQEITMVDTEMPFWPTNFGIS...,,,,=,P37238,bdb
2759440,CC(C)[C@@H]1CN(Cc2cc(C)cc(CC(O)=O)c2)CCN1c1nc2...,,,,230.0,MGETLGDSPVDPEHGAFADALPMSTSQEITMVDTEMPFWPTNFGIS...,,,,=,P37238,bdb
2759441,COc1ccc(cc1)N(C)c1nc(C)nc2[nH]ccc12,,2600.0,,,CVSASPSTLARLVSRSAMPAGSSTAWNTAFSPMARCQVTKTIGGGD...,,=,,,Q862L2,bdb


In [11]:
# Write the processed table to the data directory, without the row index.
bdb_csv_path = DATA_DIR.joinpath('bdb.csv')
bdb.to_csv(bdb_csv_path, index=False)