In [3]:
! wget https://zhanggroup.org/BioLiP/download/BioLiP.txt.gz

--2024-09-05 12:11:06--  https://zhanggroup.org/BioLiP/download/BioLiP.txt.gz
Resolving zhanggroup.org (zhanggroup.org)... 141.213.137.249
Connecting to zhanggroup.org (zhanggroup.org)|141.213.137.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 71064745 (68M) [application/x-gzip]
Saving to: ‘BioLiP.txt.gz.1’


2024-09-05 12:11:08 (43.4 MB/s) - ‘BioLiP.txt.gz.1’ saved [71064745/71064745]



In [4]:
! gunzip BioLiP.txt.gz

gzip: BioLiP.txt already exists; do you wish to overwrite (y or n)? 

In [15]:
import pandas as pd
from tqdm import tqdm
import re

# Compare with PDBBind
def parse_pdbbind_metadata(index='../raw/index/INDEX_general_PL.2020'):
    data = []
    with open(index) as f:
        for line in f:
            if line.startswith('#'):
                continue
            if line:
                content = line.strip().split()
                if not content[6].endswith(')'):
                    ligand = content[6][1:]
                else:
                    ligand = content[6][1:-1]
                    
                data.append({
                    "PDBID": content[0],
                    "Resolution": content[1],
                    "Year": content[2],
                    "Binding Affinity": content[3],
                    "Ligand": ligand,
                    "Note": ' '.join(content[7:])
                })
    data = pd.DataFrame(data)
    return data

pdbbind_data = parse_pdbbind_metadata('../raw_data_pdbbind/index/INDEX_general_PL.2020')
pdbbind_ids = pdbbind_data['PDBID'].unique()
print("PDBBind v2020:", len(pdbbind_ids))


columns = [
    'PDBID',
    'Receptor chain',
    'Resolution',
    'Binding site',
    'Ligand CCD',
    'Ligand chain',
    'Ligand serial number',
    'Binding site residues',
    'Binding site residues renumbered',
    'Catalytic site residues',
    'Catalytic site residues renumbered',
    'EC number',
    'GO terms',
    'Binding affinity (manual)',
    'Binding affinity (Binding MOAD)',
    'Binding affinity (PDBbind-CN)',
    'Binding affinity (Binding DB)',
    'UniProt ID',
    'PubMed ID',
    'Ligand residue sequence number',
    'Receptor sequence'
]
raw_df = pd.read_csv('BioLiP.txt', sep='\t', names=columns, low_memory=False, keep_default_na=False, na_values=[None, ""])
print(f"BioLiP2: {raw_df['PDBID'].unique().shape[0]}")

biolip_sm = raw_df[~raw_df['Ligand CCD'].isin(['peptide', 'dna', 'rna'])]
biolip_poly = raw_df[raw_df['Ligand CCD'].isin(['peptide', 'dna', 'rna'])]
print(f"BioLiP2 with small molecule: {biolip_sm['PDBID'].unique().shape[0]}")

# Find entries with binding data
binding_df = biolip_sm.dropna(
    subset=[
        'Binding affinity (manual)', 'Binding affinity (Binding MOAD)', 'Binding affinity (PDBbind-CN)', 'Binding affinity (Binding DB)'
    ],
    how='all'
)
biolip_ids = binding_df["PDBID"].unique()
print(f"BioLiP2 with binding:", len(biolip_ids))

all_ids = set(biolip_ids).union(pdbbind_ids)
print('Total PDBIDs:', len(all_ids))


# # Exclude covalent binders
# cov_df = pd.read_csv('CovBinderInPDB_2022Q4_AllRecords.csv')
# cov = [c.lower() for c in cov_df['pdb_id'].unique()]

# binding_df = binding_df.query('PDBID not in @cov')
# print(f'BioLiP2 with binding & non-covalent: {binding_df["PDBID"].unique().shape[0]}')

# binding_df_noncov_all = set(binding_df["PDBID"].unique())

# # get rid of ions
# ions = [
#     'MN', 'MG', 'ZN', 'NA', 'CO', 'CA', 'CU', 'NI', 'FE', 
#     'HG', 'CE', 'AG', 'CD', 'CL', 'BR', 'F', 'XE', 'KR', 'AR',
#     'K', 'LA', 'BA', 'SB', 'TL', 'CS', 'SR', 'AU', 'YB', 'GA', 'CR',
#     'PD', 'MO', 'SE', 'LU', 'SM', 'PB', 'EU', 'PT', 'TB', 'RH', 'LI',
#     'RB', 'RU', 'DY', 'RE', 'PR', 'OS', 'V', 'IR', 'ND', 'AL'
#     'O', 'OH'
# ]
# binding_df = binding_df[[ccd not in ions for ccd in binding_df['Ligand CCD']]]

# # Exclude DNA/RNA/Peptide binders
# non_sm = ['rna', 'dna', 'peptide']
# non_sm_mask = [ccd not in non_sm for ccd in binding_df['Ligand CCD']]
# binding_df = binding_df[non_sm_mask]

# # Exclude polymers
# polymers = []
# for pdb_id, subdf in binding_df.groupby('PDBID'):
#     ligands = subdf['Ligand CCD'].unique()
#     if len(ligands) > 1:
#         polymers.append(pdb_id)
# binding_df = binding_df.query('PDBID not in @polymers')

# print(f'BioLiP2 with binding & non-covalent & small molecules: {binding_df["PDBID"].unique().shape[0]}')

# args = []
# for pdb_id, subdf in binding_df.groupby('PDBID'):
#     args.append((pdb_id, subdf['Ligand CCD'].unique()[0]))

# metadata = pd.DataFrame(args, columns=['PDBID', 'Ligand CCD'])
# metadata.to_csv('./BioLiP_bind_sm.csv', index=None)

PDBBind v2020: 19443
BioLiP2: 143054
BioLiP2 with small molecule: 128638
BioLiP2 with binding: 23606
Total PDBIDs: 26542


In [20]:
ions = [
    'MN', 'MG', 'ZN', 'NA', 'CO', 'CA', 'CU', 'NI', 'FE', 
    'HG', 'CE', 'AG', 'CD', 'CL', 'BR', 'F', 'XE', 'KR', 'AR',
    'K', 'LA', 'BA', 'SB', 'TL', 'CS', 'SR', 'AU', 'YB', 'GA', 'CR',
    'PD', 'MO', 'SE', 'LU', 'SM', 'PB', 'EU', 'PT', 'TB', 'RH', 'LI',
    'RB', 'RU', 'DY', 'RE', 'PR', 'OS', 'V', 'IR', 'ND', 'AL'
    'O', 'OH'
]
polymers = ['peptide', 'dna', 'rna']

select_pdbids = []
select_df = raw_df.query('(PDBID in @biolip_ids) & (PDBID not in @pdbbind_ids) & (`Ligand CCD` not in @polymers) & (`Ligand CCD` not in @ions)')
for pdbid, subdf in select_df.groupby('PDBID'):
    select = True
    for chain, subsubdf in subdf.groupby('Ligand chain'):
        if subsubdf.shape[0] > 1:
            select = False
            break
    if select:
        select_pdbids.append(pdbid)
print(len(select_pdbids))
select_df.query('PDBID in @select_pdbids')[['PDBID', 'Ligand CCD', 'Ligand chain', 'Ligand residue sequence number']].to_csv('BioLiP_bind_sm.csv', index=None)

3546


In [66]:
columns_subset = [
    'PDBID',
    'Ligand chain',
    'Ligand CCD',
    'Ligand residue sequence number',
    'Binding affinity (manual)',
    'Binding affinity (Binding MOAD)',
    'Binding affinity (PDBbind-CN)',
    'Binding affinity (Binding DB)'
]
biolip_data_sm_dict = {pdbid: subdf for pdbid, subdf in tqdm(biolip_sm[columns_subset].groupby('PDBID'))}
biolip_data_dict = {pdbid: subdf for pdbid, subdf in tqdm(raw_df[columns_subset].groupby('PDBID'))}
pdbbind_data_dict = pdbbind_data.set_index('PDBID').to_dict('index')

100%|██████████| 128638/128638 [00:03<00:00, 39919.64it/s]
100%|██████████| 143054/143054 [00:03<00:00, 36685.81it/s]


In [72]:
ions = [
    'MN', 'MG', 'ZN', 'NA', 'CO', 'CA', 'CU', 'NI', 'FE', 
    'HG', 'CE', 'AG', 'CD', 'CL', 'BR', 'F', 'XE', 'KR', 'AR',
    'K', 'LA', 'BA', 'SB', 'TL', 'CS', 'SR', 'AU', 'YB', 'GA', 'CR',
    'PD', 'MO', 'SE', 'LU', 'SM', 'PB', 'EU', 'PT', 'TB', 'RH', 'LI',
    'RB', 'RU', 'DY', 'RE', 'PR', 'OS', 'V', 'IR', 'ND', 'AL'
    'O', 'OH'
]
binding_cols = ['Binding affinity (manual)', 'Binding affinity (Binding MOAD)', 'Binding affinity (PDBbind-CN)', 'Binding affinity (Binding DB)']

data_sm = []
data_poly = []
for pdbid in pdbbind_data_dict:
    ligand_ccd = pdbbind_data_dict[pdbid]['Ligand']
    if not bool(re.search(r'[^a-zA-Z0-9]', ligand_ccd)):
        if pdbid not in biolip_data_sm_dict:
            data_sm.append({'PDBID': pdbid, 'Ligand CCD': ligand_ccd, 'Ligand chain': None, 'Ligand residue sequence number': None})
        else:
            found = []
            for _, row in biolip_data_sm_dict[pdbid].iterrows():
                if row['Ligand CCD'] == ligand_ccd:
                    chain, resnum = row['Ligand chain'], row['Ligand residue sequence number']
                    if (chain, resnum) in found:
                        continue
                    data_sm.append({'PDBID': pdbid, 'Ligand CCD': ligand_ccd, 'Ligand chain': chain, 'Ligand residue sequence number': resnum.strip()})
                    found.append((chain, resnum))
    else:
        if pdbid not in biolip_data_dict:
            data_poly.append({'PDBID': pdbid, 'Ligand CCD': ligand_ccd, 'Ligand chain': None, 'Ligand residue sequence number': None})
        else:
            for chain, subdf in biolip_data_dict[pdbid].groupby('Ligand chain'):
                record = {'PDBID': pdbid, 'Ligand chain': chain, 'Ligand CCD': [], 'Ligand residue sequence number': []}
                for _, row in subdf.iterrows():
                    if row['Ligand CCD'] in ['peptide', 'dna', 'rna']:
                        record.update({'Ligand CCD': row['Ligand CCD'], 'Ligand residue sequence number': row['Ligand residue sequence number']})
                        break
                    if row['Ligand CCD'] in ions:
                        continue
                    if all(str(row[col]) == 'nan' for col in binding_cols):
                        continue
                    record['Ligand CCD'].append(row['Ligand CCD'])
                    record['Ligand residue sequence number'].append(row['Ligand residue sequence number'])
                
                if isinstance(record['Ligand CCD'], list):
                    record['Ligand CCD'] = '-'.join(record['Ligand CCD'])
                if isinstance(record['Ligand residue sequence number'], list):
                    record['Ligand residue sequence number'] = '-'.join(record['Ligand residue sequence number'])
                
                record['Ligand residue sequence number'] = record['Ligand residue sequence number'].replace(' ', '')
                record['Ligand residue sequence number'] = record['Ligand residue sequence number'].replace('~', '-')
                data_poly.append(record)


df_sm = pd.DataFrame(data_sm)
df_sm = df_sm.sort_values(by=['PDBID', 'Ligand chain', 'Ligand residue sequence number'])
df_sm.to_csv('PDBBind_sm.csv', index=None)

df_poly = pd.DataFrame(data_poly)
df_poly = df_poly.sort_values(by=['PDBID', 'Ligand chain', 'Ligand residue sequence number'])
df_poly.to_csv('PDBBind_poly.csv', index=None)

In [70]:
isinstance(np.nan, str)

False