# Scripts for preparing metatdata for PDBBind-Opt and BioLiP2-Opt Dataset

In this script, we will create CSV files containing relevant metadata that will be used for the PDBBind-Opt workflow to process structures.

In [63]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
import requests
import math

from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import RemoveHs
from rdkit.Chem import Descriptors
from rdkit.Chem import rdFingerprintGenerator
from tqdm.contrib.concurrent import process_map

import os
os.chdir('/data02/venus/AlloBind/PDBBind-Opt/pre_process')

In [17]:
def get_smiles_from_rcsb(comp_id: str):
    """
    Query ligand SMILES from RCSB

    Parameters
    ----------
    comp_id: str
        The ligand ID, usually a three-letter code
    
    Returns
    -------
    smi: str
        The SMILES of the query ligand. If fail to get, will return a vacant string
    """
    query = '''{chem_comp(comp_id: "%s") {
        rcsb_chem_comp_descriptor {
        SMILES_stereo SMILES InChI
        }
    }
    }''' % comp_id
    query = re.sub(r'\s+', ' ', query)
    try:
        res = requests.get('https://data.rcsb.org/graphql?query=' + query)
        smi = res.json()['data']['chem_comp']['rcsb_chem_comp_descriptor']['SMILES_stereo']
        if smi is None:
            smi = res.json()['data']['chem_comp']['rcsb_chem_comp_descriptor']['SMILES']
        if smi is None:
            m = Chem.MolFromInchi(res.json()['data']['chem_comp']['rcsb_chem_comp_descriptor']['InChI'])
            smi = Chem.MolToSmiles(m)
        assert smi is not None, "No reference smiles"
        return smi
    except:
        return ""


def remove_all_hs(mol):
    params = Chem.RemoveHsParameters()
    params.removeAndTrackIsotopes = True
    params.removeDefiningBondStereo = True
    params.removeDegreeZero = True
    params.removeDummyNeighbors = True
    params.removeHigherDegrees = True
    params.removeHydrides = True
    params.removeInSGroups = True
    params.removeIsotopes = True
    params.removeMapped = True
    params.removeNonimplicit = True
    params.removeOnlyHNeighbors = True
    params.removeWithQuery = True
    params.removeWithWedgedBond = True
    return RemoveHs(mol, params)

# Pre-process

## step1 & 2: filter colvalent and regular ligands

In [27]:
inter_columns = [
    'allosteric_pdb',
    'is_covalent',
    'modulator_alias',
    'modulator_chain',
    'modulator_class',
    'modulator_feature',
    'modulator_resi',
]

asd2023_df = pd.read_excel('ASD_PL_2023.xlsx', usecols=inter_columns)
# modulator_class "compound" is filtered out for the coordination between allo and ortho ligands
asd2023_filter_df = asd2023_df[(asd2023_df['is_covalent'].str.lower() != 'yes') & (asd2023_df['modulator_class'].str.contains('Lig'))]
asd2023_filter_df.to_excel('ASD_PL_2023_filter.xlsx', index=False, columns=['allosteric_pdb', 'modulator_chain', 'modulator_alias', 'modulator_resi'])

## step3: visualize check

## step4: strip the duplicates

In [15]:
asd2023_df = pd.read_excel('ASD_PL_2023_filter_r3_2661.xlsx')
print(f"duplicated: {len(asd2023_df)}")

asd2023_df_unique = asd2023_df.drop_duplicates(subset=['allosteric_pdb'], keep='first')
print(f"unique: {len(asd2023_df_unique)}")
asd2023_df_unique.to_excel(f'ASD_PL_2023_filter_r4_{len(asd2023_df_unique)}.xlsx', index=False)

duplicated: 2661
unique: 2557


## step5: cross-check with Q-BioLiP

### Parse BioLiP

In [81]:
columns = [
    'PDBID',
    'Receptor chain',
    'Resolution',
    'Binding site',
    'Ligand CCD',
    'Ligand chain',
    'Ligand serial number',
    'Binding site residues',
    'Binding site residues renumbered',
    'Catalytic site residues',
    'Catalytic site residues renumbered',
    'EC number',
    'GO terms',
    'Binding affinity (manual)',
    'Binding affinity (Binding MOAD)',
    'Binding affinity (PDBbind-CN)',
    'Binding affinity (Binding DB)',
    'UniProt ID',
    'PubMed ID',
    'Ligand residue sequence number',
    'Receptor sequence'
]
binding_cols = [
    'Binding affinity (manual)',
    'Binding affinity (Binding MOAD)',
    'Binding affinity (PDBbind-CN)',
    'Binding affinity (Binding DB)',
]
ions = [
    'MN', 'MG', 'ZN', 'NA', 'CO', 'CA', 'CU', 'NI', 'FE', 
    'HG', 'CE', 'AG', 'CD', 'CL', 'BR', 'F', 'XE', 'KR', 'AR',
    'K', 'LA', 'BA', 'SB', 'TL', 'CS', 'SR', 'AU', 'YB', 'GA', 'CR',
    'PD', 'MO', 'SE', 'LU', 'SM', 'PB', 'EU', 'PT', 'TB', 'RH', 'LI',
    'RB', 'RU', 'DY', 'RE', 'PR', 'OS', 'V', 'IR', 'ND', 'AL'
    'O', 'OH'
]
raw_df = pd.read_csv('BioLiP.txt', sep='\t', names=columns, low_memory=False, keep_default_na=False, na_values=[None, ""])
raw_df['PDBID'] = raw_df['PDBID'].str.upper()
raw_df['Ligand residue sequence number'] = raw_df['Ligand residue sequence number'].astype(str).str.strip()
raw_df.head()

Unnamed: 0,PDBID,Receptor chain,Resolution,Binding site,Ligand CCD,Ligand chain,Ligand serial number,Binding site residues,Binding site residues renumbered,Catalytic site residues,...,EC number,GO terms,Binding affinity (manual),Binding affinity (Binding MOAD),Binding affinity (PDBbind-CN),Binding affinity (Binding DB),UniProt ID,PubMed ID,Ligand residue sequence number,Receptor sequence
0,101M,A,2.07,BS01,HEM,A,1,F43 R45 V68 S92 H93 H97 I99 Y103,F44 R46 V69 S93 H94 H98 I100 Y104,,...,"1.11.1.-,1.7.-.-","0004601,0005344,0005737,0015671,0016491,001652...",,,,,P02185,,155,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
1,102M,A,1.84,BS01,HEM,A,1,F43 R45 T67 L89 S92 H93 H97 I99 Y103,F44 R46 T68 L90 S93 H94 H98 I100 Y104,,...,"1.11.1.-,1.7.-.-","0004601,0005344,0005737,0015671,0016491,001652...",,,,,P02185,,155,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
2,103M,A,2.07,BS01,HEM,A,1,F43 R45 S92 H93 H97 I99 Y103,F44 R46 S93 H94 H98 I100 Y104,,...,"1.11.1.-,1.7.-.-","0004601,0005344,0005737,0015671,0016491,001652...",,,,,P02185,,155,MVLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDR...
3,104M,A,1.71,BS01,HEM,A,1,F43 R45 V68 S92 H93 H97 I99 Y103 F138,F43 R45 V68 S92 H93 H97 I99 Y103 F138,,...,"1.11.1.-,1.7.-.-","0004601,0005344,0005737,0015671,0016491,001652...",,,,,P02185,,155,VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRF...
4,105M,A,2.02,BS01,HEM,A,1,F43 R45 H64 V68 L89 H93 H97 I99,F43 R45 H64 V68 L89 H93 H97 I99,,...,"1.11.1.-,1.7.-.-","0004601,0005344,0005737,0015671,0016491,001652...",,,,,P02185,,155,VLSEGEWQLVLHVWAKVEADVAGHGQDILIRLFKSHPETLEKFDRF...


### Parse Q-BioLiP

In [58]:
raw_df = pd.read_csv('Q-BioLiP_relevant.csv')
raw_df['PDB ID'] = raw_df['PDB ID'].str.upper()

  raw_df = pd.read_csv('Q-BioLiP_relevant.csv')


### Parse Allosteric

In [93]:
allo_df = pd.read_excel('./allo_pro_lig_filter_pre/ASD_PL_2023_filter_r5_2516.xlsx')
allo_df['modulator_resi'] = allo_df['modulator_resi'].fillna(0).astype(int).astype(str).str.strip()
allo_df

Unnamed: 0,allosteric_pdb,modulator_chain,modulator_alias,modulator_resi
0,2XCG,A,XCG,602
1,2XFN,A,XCG,1501
2,2XFO,A,XCG,1504
3,2XFP,A,XCG,602
4,2XFQ,A,XCG,602
...,...,...,...,...
2829,7OUL,A,1KE,1104
2830,8TGD,A,ZWE,1512
2831,8DSD,A,TIW,1003
2832,8PVP,A,FWU,401


### Map Allo to BioLiP to correct the Allo wrong recording

In [94]:
df = pd.merge(
    allo_df,
    raw_df,
    how='left', 
    left_on=['allosteric_pdb', 'modulator_chain', 'modulator_alias', 'modulator_resi'],
    right_on=['PDBID', 'Ligand chain', 'Ligand CCD', 'Ligand residue sequence number'],
    indicator=True
)

cols = ['allosteric_pdb', 'modulator_chain', 'modulator_alias', 'modulator_resi']

# both = df[df['_merge'] == 'both']
# both[cols].to_excel('./allo_pro_lig_filter_pre/r3_allo_biolip_both.xlsx', index=False)

unmatched = df[df['_merge'] == 'left_only']
unmatched_df = unmatched[cols]
print(len(unmatched_df))

df2 = pd.merge(
    unmatched_df,
    raw_df,
    how='left', 
    left_on=['allosteric_pdb', 'modulator_alias'],
    right_on=['PDBID', 'Ligand CCD'],
    indicator=True
)

unmatched2 = df2[df2['_merge'] == 'left_only']

save_cols = ['allosteric_pdb', 'modulator_chain', 'modulator_alias', 'modulator_resi', 'PDBID', 'Ligand chain', 'Ligand CCD', 'Ligand residue sequence number']
matched_pdbs = unmatched_df[~unmatched_df['allosteric_pdb'].isin(unmatched2['allosteric_pdb'])]['allosteric_pdb']
matched_df = df2[df2['allosteric_pdb'].isin(matched_pdbs)]
matched_df = matched_df[save_cols]
matched_df
matched_df.to_excel('./allo_pro_lig_filter_pre/r3_allo_biolip_unmatch_pdb.xlsx', index=False, columns=save_cols)

0


Unnamed: 0,allosteric_pdb,modulator_chain,modulator_alias,modulator_resi,PDBID,Receptor chain,Resolution,Binding site,Ligand CCD,Ligand chain,...,GO terms,Binding affinity (manual),Binding affinity (Binding MOAD),Binding affinity (PDBbind-CN),Binding affinity (Binding DB),UniProt ID,PubMed ID,Ligand residue sequence number,Receptor sequence,_merge


In [116]:
df = pd.merge(
    allo_df,
    raw_df,
    how='left', 
    left_on=['allosteric_pdb', 'modulator_chain', 'modulator_alias', 'modulator_resi'],
    right_on=['PDBID', 'Ligand chain', 'Ligand CCD', 'Ligand residue sequence number'],
    indicator=True
)

unmatched = df[df['_merge'] == 'left_only']

df['is_duplicate'] = df.duplicated(subset=['allosteric_pdb', 'modulator_chain', 'modulator_alias', 'modulator_resi'], keep=False)
duplicated_rows = df[df['is_duplicate']]

both = df[df['_merge'] == 'both']
duplicated_first = both.drop_duplicates(subset=['allosteric_pdb', 'modulator_chain', 'modulator_alias', 'modulator_resi'], keep='first')

save_cols = [
    'UniProt ID',
    'PDBID',
    'Receptor chain',
    'Resolution',
    'Ligand CCD',
    'Ligand chain',
    'Ligand residue sequence number',
    'Ligand serial number',
    'Binding site',
    'Binding affinity (manual)',
    'Binding affinity (Binding MOAD)',
    'Binding affinity (PDBbind-CN)',
    'Binding affinity (Binding DB)',
    'Binding site residues',
    'Binding site residues renumbered',
    'Receptor sequence',
]

duplicated_first.to_excel(f'./allo_pro_lig_filter_pre/ASD_PL_2023_filter_r6_{len(duplicated_first)}.xlsx', index=False, columns=save_cols)


# Formal-process

## step1: X-ray structure high resolution limit at most 2.5 Å

In [36]:
resolution_limit = 2.5

df = pd.read_excel('./allo_pro_lig_filter_pre/ASD_PL_2023_filter_r6_2381.xlsx')
df = df[df['Resolution'] <= resolution_limit]
df.to_excel(f'./allo_pro_lig_filter_formal/r1_{len(df)}.xlsx', index=False)


## step2: Ligand validation score

In [129]:
def _fetch_pdb_ligand_validation_info(pdb_id, nonpolymer_comp_id, auth_asym_id, auth_seq_id):
    """
    input: 4BZB_DGT_B_700
    output: (RSR, RSCC, completeness)
    """
    query = (
    """{
        entry(entry_id: "%s"){
            nonpolymer_entities {
                rcsb_nonpolymer_entity_container_identifiers {
                nonpolymer_comp_id
                rcsb_id
                }
                nonpolymer_entity_instances {
                    rcsb_nonpolymer_entity_instance_container_identifiers {
                        auth_seq_id
                        auth_asym_id
                        asym_id
                        entity_id
                        entry_id
                    }
                    rcsb_nonpolymer_instance_validation_score {
                        RSCC
                        RSR
                        alt_id
                        completeness
                        intermolecular_clashes
                        is_best_instance
                        mogul_angle_outliers
                        mogul_angles_RMSZ
                        mogul_bond_outliers
                        mogul_bonds_RMSZ
                        ranking_model_fit
                        ranking_model_geometry
                        score_model_fit
                        score_model_geometry
                        stereo_outliers
                        average_occupancy
                    }
                }
            }
        }
        }"""
        % pdb_id
    )

    query_url = f"https://data.rcsb.org/graphql?query={query}"
    response = requests.get(query_url)
    response.raise_for_status()
    info = response.json()

    rsr, rsrc, completeness = None, None, None
    for entity in info['data']['entry']['nonpolymer_entities']:
        if entity['rcsb_nonpolymer_entity_container_identifiers']['nonpolymer_comp_id'] == nonpolymer_comp_id:
            for instance in entity['nonpolymer_entity_instances']:
                identifiers = instance['rcsb_nonpolymer_entity_instance_container_identifiers']
                if identifiers['auth_asym_id'] == auth_asym_id and identifiers['auth_seq_id'] == str(auth_seq_id):
                    validation_scores = instance['rcsb_nonpolymer_instance_validation_score']
                    if validation_scores and len(validation_scores) > 0:
                        rsr = validation_scores[0]['RSR']
                        rsrc = validation_scores[0]['RSCC']
                        completeness = validation_scores[0]['completeness']
    return rsr, rsrc, completeness


In [151]:
relatively_loose_rsr = 0.4
relatively_loose_rsrc = 0.9

r1 = pd.read_excel('./allo_pro_lig_filter_formal/r1_1666.xlsx')
r2_tmp = pd.read_excel('./allo_pro_lig_filter_formal/r2_tmp.xlsx')
pdbids = r2_tmp['PDBID']
rsr = r2_tmp['RSR']
rsrc = r2_tmp['RSCC']
completeness = r2_tmp['completeness']
r2 = r2_tmp[((rsr.notna() & (rsr <= 0.4)) & (rsrc.notna() & (rsrc >= 0.9)) & (completeness >= 1))]
# find the pdbid in r2 and in r1, and add columns of rsr, rsrc, completeness
r2_pdbids = r2['PDBID']
r1_pdbids = r1['PDBID']
df = r1[r1['PDBID'].isin(r2_pdbids)]
df['RSR'] = r2['RSR']
df['RSCC'] = r2['RSCC']
df['completeness'] = r2['completeness']
df.to_excel(f'./allo_pro_lig_filter_formal/r2_{len(df)}.xlsx', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RSR'] = r2['RSR']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RSCC'] = r2['RSCC']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['completeness'] = r2['completeness']


## step3: PDBBind-Opt

### prepare input for PDBBind-Opt

In [96]:
r2 = pd.read_excel('./allo_pro_lig_filter_formal/r2_1030.xlsx')
r2['PDBID'] = r2['PDBID'].str.lower()

columns = ['PDBID', 'Ligand chain', 'Ligand CCD', 'Ligand residue sequence number']
r2.to_csv('./allo_pro_lig_filter_formal/r3_tmp.csv', index=False, columns=columns)

### check the error types

In [102]:
record_dir = "/data02/venus/AlloBind/PDBBind-Opt/raw_data/allo_r2_1030"
pdbids = r2['PDBID']


def check_error(pdbid):
    directory = os.path.join(record_dir, pdbid)
    if os.path.isfile(os.path.join(directory, 'done.tag')):
        return pdbid
    else:
        return None
        # ! for inference
        # if os.path.isfile(os.path.join(directory, 'err')):
        #     with open(os.path.join(directory, 'err'), 'r') as f:
        #         errors = f.readlines()
        #         for error in errors:
        #             if 'RuntimeError: HETATM' in error:
        #                 check_mol = error.strip().split(' ')[2]
        #                 # print(f"{pdbid}: {error.strip()}")
        #                 pass
        #             elif 'AssertionError:' in error:
        #                 # print(f"{pdbid}: {error.strip()}")
        #                 pass
        #             elif 'Error occurs when fixing' in error:
        #                 # ! this can be fixed by set find_connected_ligand_residues to False
        #                 # print(f"{pdbid}: {error.strip()}")
        #                 pass
        # else:
        #     print(f"{pdbid} not processed successfully")

In [101]:
r2 = pd.read_excel('./allo_pro_lig_filter_formal/r2_1030.xlsx')
r2['PDBID'] = r2['PDBID'].str.lower()

r2_rerun = r2[r2['PDBID'].isin(rerun_pdbids)]
r2_rerun.to_csv(f'./allo_pro_lig_filter_formal/r3_{len(r2_rerun)}.csv', index=False)

## step4: ligands more than 6 heavy atoms, molar weight between 100 and 900

In [20]:
r3 = pd.read_csv('./allo_pro_lig_filter_formal/r3_852.csv')
r3['PDBID'] = r3['PDBID'].str.lower()

workdir = "/data02/venus/AlloBind/PDBBind-Opt/raw_data/allo_r2_1030"
success_pdbids = []
for index, row in r3.iterrows():
    pdbid = row['PDBID']
    ligand_ccd = row["Ligand CCD"]
    ligand_chain = row["Ligand chain"]
    ligand_resi = row["Ligand residue sequence number"]
    name = f"{pdbid}_{ligand_ccd}_{ligand_chain}_{ligand_resi}"
    sdf_path = os.path.join(workdir, pdbid, name, f"{name}_ligand_refined.sdf")
    
    mol = Chem.SDMolSupplier(sdf_path)[0]
    mol_noH = remove_all_hs(mol)

    num_heavy_atoms = mol_noH.GetNumHeavyAtoms()
    if num_heavy_atoms < 6:
        print(f"{pdbid} nha {num_heavy_atoms}")
        continue

    mw = Descriptors.MolWt(mol_noH)
    if mw > 900 or mw < 100:
        print(f"{pdbid} mw {mw:.2f}")
        continue
    
    success_pdbids.append(pdbid)

print(len(success_pdbids))

3u18 mw 900.86
1nsg mw 928.21
3fap mw 980.31
4cll nha 4
4cm2 nha 4
4oyz nha 4
1zdq nha 3
2g50 mw 89.09
5w5r mw 87.05
6u26 mw 1077.24
8d4i mw 90.17
5sae mw 96.09
840


In [21]:
r4 = r3[r3['PDBID'].isin(success_pdbids)]
r4.to_csv(f'./allo_pro_lig_filter_formal/r4_{len(r4)}.csv', index=False)

## step5: sequence similarity by CD-hit

In [33]:
r4 = pd.read_csv('./allo_pro_lig_filter_formal/r4_840.csv')

fasta_file = './allo_pro_lig_filter_formal/r4_840.fasta'
f = open(fasta_file, 'w')

for index, row in r4.iterrows():
    pdbid = row['PDBID']
    ligand_ccd = row["Ligand CCD"]
    ligand_chain = row["Ligand chain"]
    ligand_resi = row["Ligand residue sequence number"]
    moad = row['Binding affinity (Binding MOAD)']
    pdbbind = row['Binding affinity (PDBbind-CN)']
    binding_db = row['Binding affinity (Binding DB)']
    affinity_flag = int(pd.notna(moad) | pd.notna(pdbbind) | pd.notna(binding_db))
    resolution = row['Resolution']
    
    name = f"{pdbid}_{ligand_ccd}_{ligand_chain}_{ligand_resi}_{affinity_flag}_{resolution}"
    sequence = row['Receptor sequence']

    f.write(f'>{name}\n')
    f.write(f'{sequence}\n')
f.close()

cd-hit -i r4_840.fasta -o r4_840_cluster -c 0.9 -n 5 -M 16000 -d 0 -T 64

In [34]:
def parse_cdhit_clusters(clstr_file):
    """
    Parse CD-HIT cluster file into a dictionary mapping cluster numbers to lists of sequence names
    """
    clusters = {}
    current_cluster = None
    
    with open(clstr_file, 'r') as f:
        for line in f:
            line = line.strip()
         
            if line.startswith('>Cluster'):
                current_cluster = int(line.split()[1])
                clusters[current_cluster] = []
            else:
                seq_name = line.split('>')[1].split('...')[0]
                clusters[current_cluster].append(seq_name)
                
    return clusters


clusters = parse_cdhit_clusters("./allo_pro_lig_filter_formal/r4_840_cluster.clstr")
clusters

{0: ['6jta_GLN_A_1302_0_1.75'],
 1: ['2a68_RBT_C_8001_0_2.5', '2a69_RPT_C_8001_0_2.5'],
 2: ['1kee_ORN_A_5011_0_2.1',
  '1m6v_ORN_A_4011_0_2.1',
  '1t36_ORN_A_1089_0_2.1'],
 3: ['1jqn_ASP_A_884_0_2.35'],
 4: ['5i3p_68T_A_1003_0_2.45',
  '5i3q_68E_A_1003_0_1.88',
  '5jjr_68E_A_1004_1_1.99',
  '5jjs_6L2_A_1004_1_1.65',
  '6xd1_V0J_A_1003_0_1.954'],
 5: ['2y0p_ACO_A_2228_0_2.4'],
 6: ['5nn4_SC2_A_1016_1_1.83'],
 7: ['3mrt_12E_A_920_1_1.98',
  '3mrx_17S_A_920_1_1.95',
  '3ms4_21N_A_920_1_2.07',
  '3mt7_16O_A_920_1_2.0',
  '3mt8_17T_A_920_1_2.0',
  '3mt9_18O_A_920_1_2.05',
  '3mta_22O_A_920_1_2.23',
  '3mtb_23V_A_920_1_1.95',
  '3mqf_20X_A_920_1_1.951',
  '3mtd_25E_A_844_1_2.096',
  '3ebo_57D_A_940_1_1.9',
  '3ebp_CPB_A_940_1_2.0',
  '3nc4_26O_A_920_1_2.07',
  '3ms2_18S_A_920_1_2.1',
  '3mrv_16F_A_920_1_1.94',
  '2gj4_2TH_A_949_1_1.6'],
 8: ['3dds_26B_A_905_1_1.8', '3cem_AVD_A_833_1_2.47', '3ddw_055_A_905_1_1.9'],
 9: ['5n69_2OW_A_904_1_2.45'],
 10: ['3hnc_TTP_A_802_0_2.41'],
 11: ['3bz7_BL

## step6: ligand similarity by rdkit

In [58]:
def extract_information(system):
    pdbid = system.split('_')[0]
    ligand_ccd = system.split('_')[1]
    ligand_chain = system.split('_')[2]
    ligand_resi = system.split('_')[3]
    affinity_flag = system.split('_')[4]
    resolution = system.split('_')[5]
    name = f"{pdbid}_{ligand_ccd}_{ligand_chain}_{ligand_resi}"
    return name, pdbid, ligand_ccd, ligand_chain, ligand_resi, affinity_flag, resolution


def calc_ecfp4_similarity(mol1_file, mol2_file):
    """
    Calculate ECFP4 fingerprint similarity between two molecules from SDF files
    """
    mol1 = Chem.SDMolSupplier(mol1_file)[0]
    mol2 = Chem.SDMolSupplier(mol2_file)[0]
    
    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2,fpSize=2048)

    fp1 = mfpgen.GetFingerprint(mol1)
    fp2 = mfpgen.GetFingerprint(mol2)

    similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
    
    return similarity


workdir = "/data02/venus/AlloBind/PDBBind-Opt/raw_data/allo_r2_1030"

clusters_ligand_diverse = {}
clusters_ligand_same = {}
for idx, cluster in clusters.items():
    if len(cluster) == 1:
        clusters_ligand_diverse[idx] = cluster
    else:
        sorted_cluter = sorted(cluster, key=lambda x: (-float(x.split('_')[-2]), float(x.split('_')[-1])))
        ref = sorted_cluter[0]
        ref_name, ref_pdbid, ref_ligand_ccd, ref_ligand_chain, ref_ligand_resi, ref_affinity_flag, ref_resolution = extract_information(ref)
        ref_sdf_path = os.path.join(workdir, ref_pdbid, ref_name, f"{ref_name}_ligand_refined.sdf")

        diverse_cluster = [ref, ]
        same_cluster = []
        for query in sorted_cluter[1:]:
            query_name, query_pdbid, query_ligand_ccd, query_ligand_chain, query_ligand_resi, query_affinity_flag, query_resolution = extract_information(query)
            query_sdf_path = os.path.join(workdir, query_pdbid, query_name, f"{query_name}_ligand_refined.sdf")

            # calculate similarity within diverse_cluster
            similarity_flag = 0
            for exist_ref in diverse_cluster:
                exist_ref_name, exist_ref_pdbid, exist_ref_ligand_ccd, exist_ref_ligand_chain, exist_ref_ligand_resi, exist_ref_affinity_flag, exist_ref_resolution = extract_information(exist_ref)
                exist_ref_sdf_path = os.path.join(workdir, exist_ref_pdbid, exist_ref_name, f"{exist_ref_name}_ligand_refined.sdf")

                similarity = calc_ecfp4_similarity(exist_ref_sdf_path, query_sdf_path)

                if similarity > 0.99:
                    similarity_flag = 1
                    pair = (exist_ref_name, query_name)
                    same_cluster.append(pair)
                
                elif similarity > 0.9:
                    similarity_flag = 2
            
            if not similarity_flag:
                diverse_cluster.append(query)
        
        # save different cluster
        clusters_ligand_diverse[idx] = diverse_cluster
        if len(same_cluster) > 0:
            clusters_ligand_same[idx] = same_cluster

diverse_num = len([len(j) for i in clusters_ligand_diverse.values() for j in i])
same_num = len([len(j) for i in clusters_ligand_same.values() for j in i])
print(f"diverse_num: {diverse_num}, same_num: {same_num}")

import json
with open(f'./allo_pro_lig_filter_formal/r6_clusters_ligand_diverse_{diverse_num}.json', 'w') as f:
    json.dump(clusters_ligand_diverse, f, indent=4)

with open(f'./allo_pro_lig_filter_formal/r6_clusters_ligand_same_{same_num}.json', 'w') as f:
    json.dump(clusters_ligand_same, f, indent=4)

diverse_num: 699, same_num: 134


only 4chp_IMV_A_1216 can be saved to 3nf6_IMV_A_230 (700)

## step7: different from pdbbind time-split

In [67]:
with open(f'./allo_pro_lig_filter_formal/r6_clusters_ligand_diverse_699.json', 'r') as f:
    clusters_ligand_diverse = json.load(f)

allo_pdbids = [j.split('_')[0] for i in clusters_ligand_diverse.values() for j in i]
allo_pdbids.append('4chp')
print('AlloBind pdbids:', len(allo_pdbids))

pdbbind_train = np.loadtxt('timesplit_no_lig_overlap_train.txt', dtype=str)
pdbbind_val = np.loadtxt('timesplit_no_lig_overlap_val.txt', dtype=str)
pdbbind_test = np.loadtxt('timesplit_test.txt', dtype=str)

allo_in_train = [i for i in allo_pdbids if i in pdbbind_train]
allo_in_val = [i for i in allo_pdbids if i in pdbbind_val]
allo_in_test = [i for i in allo_pdbids if i in pdbbind_test]

print('AlloBind in train:',len(allo_in_train))
print('AlloBind in val:', len(allo_in_val))
print('AlloBind in test:', len(allo_in_test))

allo_unique = [i for i in allo_pdbids if (i not in pdbbind_train) and (i not in pdbbind_val) and (i not in pdbbind_test)]
print('AlloBind unique:', len(allo_unique))

AlloBind pdbids: 700
AlloBind in train: 243
AlloBind in val: 18
AlloBind in test: 0
AlloBind unique: 439


In [68]:
r4 = pd.read_csv('./allo_pro_lig_filter_formal/r4_840.csv')
r7 = r4[r4['PDBID'].isin(allo_unique)]
r7.to_csv(f'./allo_pro_lig_filter_formal/r7_{len(r7)}.csv', index=False)

# Final collect the dataset

In [71]:
for idx, row in r7.iterrows():
    pdbid = row['PDBID']
    ligand_ccd = row['Ligand CCD']
    ligand_chain = row['Ligand chain']
    ligand_resi = row['Ligand residue sequence number']
    name = f"{pdbid}_{ligand_ccd}_{ligand_chain}_{ligand_resi}"

    os.makedirs(f"./allo_pro_lig_filter_formal/allobindv1_20250120_439/{pdbid}", exist_ok=True)
    ori_dir = f"/data02/venus/AlloBind/PDBBind-Opt/raw_data/allo_r2_1030/{pdbid}/{name}"
    os.system(f"cp -r {ori_dir}/{name}_ligand_refined.sdf ./allo_pro_lig_filter_formal/allobindv1_20250120_439/{pdbid}")
    os.system(f"cp -r {ori_dir}/{name}_protein_refined.pdb ./allo_pro_lig_filter_formal/allobindv1_20250120_439/{pdbid}")