In [3]:
! wget https://zhanggroup.org/BioLiP/download/BioLiP.txt.gz

--2024-09-05 12:11:06--  https://zhanggroup.org/BioLiP/download/BioLiP.txt.gz
Resolving zhanggroup.org (zhanggroup.org)... 141.213.137.249
Connecting to zhanggroup.org (zhanggroup.org)|141.213.137.249|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 71064745 (68M) [application/x-gzip]
Saving to: ‘BioLiP.txt.gz.1’


2024-09-05 12:11:08 (43.4 MB/s) - ‘BioLiP.txt.gz.1’ saved [71064745/71064745]



In [4]:
! gunzip BioLiP.txt.gz

gzip: BioLiP.txt already exists; do you wish to overwrite (y or n)? 

In [1]:
import pandas as pd
from tqdm import tqdm

columns = [
    'PDBID',
    'Receptor chain',
    'Resolution',
    'Binding site',
    'Ligand CCD',
    'Ligand chain',
    'Ligand serial number',
    'Binding site residues',
    'Binding site residues renumbered',
    'Catalytic site residues',
    'Catalytic site residues renumbered',
    'EC number',
    'GO terms',
    'Binding affinity (manual)',
    'Binding affinity (Binding MOAD)',
    'Binding affinity (PDBbind-CN)',
    'Binding affinity (Binding DB)',
    'UniProt ID',
    'PubMed ID',
    'Ligand residue sequence number',
    'Receptor sequence'
]
raw_df = pd.read_csv('BioLiP.txt', sep='\t', names=columns, low_memory=False, keep_default_na=False, na_values=[None, ""])
print(f"BioLiP2: {raw_df['PDBID'].unique().shape[0]}")

# Find entries with binding data
binding_df = raw_df.dropna(
    subset=[
        'Binding affinity (manual)', 'Binding affinity (Binding MOAD)', 'Binding affinity (PDBbind-CN)', 'Binding affinity (Binding DB)'
    ],
    how='all'
)
print(f'BioLiP2 with binding: {binding_df["PDBID"].unique().shape[0]}')

# Exclude covalent binders
cov_df = pd.read_csv('CovBinderInPDB_2022Q4_AllRecords.csv')
cov = [c.lower() for c in cov_df['pdb_id'].unique()]

binding_df = binding_df.query('PDBID not in @cov')
print(f'BioLiP2 with binding & non-covalent: {binding_df["PDBID"].unique().shape[0]}')

# get rid of ions
ions = [
    'MN', 'MG', 'ZN', 'NA', 'CO', 'CA', 'CU', 'NI', 'FE', 
    'HG', 'CE', 'AG', 'CD', 'CL', 'BR', 'F', 'XE', 'KR', 'AR',
    'K', 'LA', 'BA', 'SB', 'TL', 'CS', 'SR', 'AU', 'YB', 'GA', 'CR',
    'PD', 'MO', 'SE', 'LU', 'SM', 'PB', 'EU', 'PT', 'TB', 'RH', 'LI',
    'RB', 'RU', 'DY', 'RE', 'PR', 'OS', 'V', 'IR', 'ND', 'AL'
    'O', 'OH'
]
binding_df = binding_df[[ccd not in ions for ccd in binding_df['Ligand CCD']]]

# Exclude DNA/RNA/Peptide binders
non_sm = ['rna', 'dna', 'peptide']
non_sm_mask = [ccd not in non_sm for ccd in binding_df['Ligand CCD']]
binding_df = binding_df[non_sm_mask]

# Exclude polymers
polymers = []
for pdb_id, subdf in binding_df.groupby('PDBID'):
    ligands = subdf['Ligand CCD'].unique()
    if len(ligands) > 1:
        polymers.append(pdb_id)
binding_df = binding_df.query('PDBID not in @polymers')

print(f'BioLiP2 with binding & non-covalent & small molecules: {binding_df["PDBID"].unique().shape[0]}')

args = []
for pdb_id, subdf in binding_df.groupby('PDBID'):
    args.append((pdb_id, subdf['Ligand CCD'].unique()[0]))

metadata = pd.DataFrame(args, columns=['PDBID', 'Ligand CCD'])
metadata.to_csv('./BioLiP_bind_sm.csv', index=None)

BioLiP2: 143054
BioLiP2 with binding: 24635
BioLiP2 with binding & non-covalent: 23723
BioLiP2 with binding & non-covalent & small molecules: 22272


In [None]:
import numpy as np
import sys
sys.path.insert(0, '../utils')
from rcsb import get_smiles_from_rcsb
from tqdm import tqdm


for ccd in tqdm(raw_df['Ligand CCD'].unique()):
    if len(ccd) <= 2:
        smi = get_smiles_from_rcsb(ccd)

  0%|          | 0/39426 [00:00<?, ?it/s]

  3%|▎         | 1318/39426 [00:54<26:00, 24.41it/s]