In [1]:
import pandas as pd

In [2]:
# Names for the 21 tab-separated fields of the header-less BioLiP file,
# listed in file order.
columns = [
    "pdb_id",                      # 01: PDB ID
    "receptor_chain",              # 02: Receptor chain
    "resolution",                  # 03: Resolution (-1.00 indicates missing data)
    "binding_site_id",             # 04: Binding site number code
    "ligand_id",                   # 05: Ligand ID (CCD code)
    "ligand_chain",                # 06: Ligand chain
    "ligand_serial_number",        # 07: Ligand serial number
    "binding_residues_pdb",        # 08: Binding site residues (PDB numbering)
    "binding_residues_renum",      # 09: Binding site residues (renumbered from 1)
    "catalytic_residues_pdb",      # 10: Catalytic site residues (PDB numbering)
    "catalytic_residues_renum",    # 11: Catalytic site residues (renumbered from 1)
    "ec_number",                   # 12: EC number
    "go_terms",                    # 13: GO terms
    "binding_affinity_literature", # 14: Binding affinity (manual survey)
    "binding_affinity_moad",       # 15: Binding affinity (Binding MOAD)
    "binding_affinity_pdbbind",    # 16: Binding affinity (PDBbind-CN)
    "binding_affinity_bindingdb",  # 17: Binding affinity (BindingDB)
    "uniprot_id",                  # 18: UniProt ID
    "pubmed_id",                   # 19: PubMed ID
    "ligand_residue_seq_number",   # 20: Ligand residue sequence number
    "receptor_sequence"            # 21: Receptor sequence
]

# Load the raw table with the column names attached at parse time.
#
# * Identifier/sequence columns are forced to strings so they are never
#   reinterpreted as numbers.
# * By default pandas treats the literal text "NA" (among others) as missing,
#   which would silently erase an identifier spelled "NA" (e.g. the CCD code
#   for sodium, or a chain named "NA"). Here only empty fields become NaN.
#   NOTE(review): assumes the file marks missing values with empty fields
#   only -- confirm no other sentinel strings are used.
# * low_memory=False makes pandas infer each column's type from the whole
#   column instead of chunk by chunk, avoiding mixed-type columns.
df = pd.read_csv(
    '../data/raw/BioLiP_nr.txt',
    sep='\t',
    header=None,
    names=columns,
    dtype={
        "pdb_id": str,
        "receptor_chain": str,
        "ligand_id": str,
        "ligand_chain": str,
        "uniprot_id": str,
        "receptor_sequence": str,
    },
    keep_default_na=False,
    na_values=[''],
    low_memory=False,
)

  df = pd.read_csv('../data/raw/BioLiP_nr.txt', sep='\t', header=None)


In [3]:
# Preview the first rows to check that the assigned column names line up
# with the parsed fields.
df.head()

Unnamed: 0,pdb_id,receptor_chain,resolution,binding_site_id,ligand_id,ligand_chain,ligand_serial_number,binding_residues_pdb,binding_residues_renum,catalytic_residues_pdb,...,ec_number,go_terms,binding_affinity_literature,binding_affinity_moad,binding_affinity_pdbbind,binding_affinity_bindingdb,uniprot_id,pubmed_id,ligand_residue_seq_number,receptor_sequence
0,10mh,A,2.55,BS01,dna,B,0,I86 Q90 R209 G236 Q237 G256 G257,I86 Q90 R209 G236 Q237 G256 G257,C81 E119 R163 R165,...,2.1.1.37,00036770003886000816800093070032259,,,,,P05102,9925782.0,402 ~ 413,MIEIKDKQLTGLRFIDLFAGLGGFRLALESCGAECVYSNEWDKYAQ...
1,10mh,A,2.55,BS02,dna,C,0,F79 C81 S87 E119 R165 R228 Q237 R240 I249 T250...,F79 C81 S87 E119 R165 R228 Q237 R240 I249 T250...,C81 E119 R163 R165,...,2.1.1.37,00036770003886000816800093070032259,,,,,P05102,9925782.0,422 ~ 433,MIEIKDKQLTGLRFIDLFAGLGGFRLALESCGAECVYSNEWDKYAQ...
2,10mh,A,2.55,BS03,SAH,A,1,F18 A19 G20 L21 G23 E40 W41 D60 N304 S305,F18 A19 G20 L21 G23 E40 W41 D60 N304 S305,C81 E119 R163 R165,...,2.1.1.37,00036770003886000816800093070032259,,,,,P05102,9925782.0,328,MIEIKDKQLTGLRFIDLFAGLGGFRLALESCGAECVYSNEWDKYAQ...
3,11as,A,2.5,BS01,ASN,A,1,D46 A74 Q116 Y218 S251 R255,D43 A71 Q113 Y215 S248 R252,D46 R100 Q116 D235 E248 S251,...,6.3.1.1,"0004071,0005524,0005737,0005829,0006529,000697...",,,,,P00963,9437423.0,331,AYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLS...
4,11ba,A,2.06,BS01,UPA,A,1,K41 V43 N44 T45 N67 Q69 N71 A109 H119 F120,K41 V43 N44 T45 N67 Q69 N71 A109 H119 F120,H12 K41 H119 F120 D121,...,4.6.1.18,"0003676,0004519,0004522,0004540,0005576,001682...",,,,,P00669,10082366.0,125,KESAAAKFERQHMDSGNSPSSSSNYCNLMMCCRKMTQGKCKPVNTF...


In [4]:
# Load the residue-level positive annotations.
# pdb_id and chain_id are later used as lookup keys, so they are read as
# strings, and only empty fields are treated as missing: by default pandas
# would turn an identifier spelled "NA" into NaN, which can never match a key.
# NOTE(review): assumes missing values in this file are written as empty
# fields -- confirm against the notebook that produced it.
residue_label_positve_df = pd.read_csv(
    '../data/processed/BioLiP_positives_residue_level_with_duplicates.csv',
    dtype={"pdb_id": str, "chain_id": str},
    keep_default_na=False,
    na_values=[''],
)

In [5]:
# Lookup set holding one (pdb_id, chain_id, residue_number) tuple per row of
# the positives table; residue_number is cast to int before the tuples are
# built. Residues whose key is in this set are the ones to exclude.
positive_keys = set(
    residue_label_positve_df[["pdb_id", "chain_id", "residue_number"]]
    .astype({"residue_number": int})
    .itertuples(index=False, name=None)
)

In [None]:
# Collect one record per sequence position that is not in positive_keys.
#
# Every row of df is expanded independently, so a chain that appears in
# several rows contributes its residues once per row; source_index records
# which df row each record came from.
unlabeled_residues = []

# Walk the needed columns in lockstep instead of df.iterrows(): iterrows()
# builds a full Series for every row, which is very slow on a large table.
chain_rows = zip(
    df.index,
    df["pdb_id"],
    df["receptor_chain"],
    df["receptor_sequence"],
    df["resolution"],
    df["uniprot_id"],
)

for source_index, pdb_id, chain_id, sequence, resolution, uniprot_id in chain_rows:
    # Rows without a receptor sequence have no residues to enumerate.
    if pd.isna(sequence):
        continue

    # Positions are 1-based offsets into the sequence string.
    # NOTE(review): assumes residue_number in positive_keys uses this same
    # 1-based sequence numbering rather than PDB numbering -- confirm.
    for i, aa in enumerate(sequence, start=1):
        residue_key = (pdb_id, chain_id, i)
        if residue_key in positive_keys:
            continue  # This residue is already annotated as binding

        unlabeled_residues.append({
            "residue_id": f"{aa}{i}",
            "residue_name": aa,
            "residue_number": i,
            "pdb_id": pdb_id,
            "chain_id": chain_id,
            "uniprot_id": uniprot_id,
            "resolution": resolution,
            "source_index": source_index
        })

In [7]:
# Turn the collected records into a table and order it by structure, chain
# and residue position so it is easier to inspect. The original row labels
# are kept.
unlabeled_residue_df = (
    pd.DataFrame(unlabeled_residues)
    .sort_values(by=["pdb_id", "chain_id", "residue_number"])
)

In [8]:
# Preview the first rows of the sorted unlabeled-residue table.
unlabeled_residue_df.head()

Unnamed: 0,residue_id,residue_name,residue_number,pdb_id,chain_id,uniprot_id,resolution,source_index
0,M1,M,1,10mh,A,P05102,2.55,0
297,M1,M,1,10mh,A,P05102,2.55,1
594,M1,M,1,10mh,A,P05102,2.55,2
1,I2,I,2,10mh,A,P05102,2.55,0
298,I2,I,2,10mh,A,P05102,2.55,1


In [9]:
# Write the unlabeled-residue table to the processed-data folder, leaving
# the DataFrame index out of the file.
unlabeled_residue_df.to_csv(
    path_or_buf='../data/processed/BioLiP_unlabeled_residues_with_duplicates.csv',
    index=False,
)

In [10]:
# Raise the display limits so up to 1000 rows and 1000 columns are shown
# before pandas truncates the rendered output.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

In [11]:
# Show the first rows again; with the display limits raised above, every
# column is rendered instead of being elided.
df.head()

Unnamed: 0,pdb_id,receptor_chain,resolution,binding_site_id,ligand_id,ligand_chain,ligand_serial_number,binding_residues_pdb,binding_residues_renum,catalytic_residues_pdb,catalytic_residues_renum,ec_number,go_terms,binding_affinity_literature,binding_affinity_moad,binding_affinity_pdbbind,binding_affinity_bindingdb,uniprot_id,pubmed_id,ligand_residue_seq_number,receptor_sequence
0,10mh,A,2.55,BS01,dna,B,0,I86 Q90 R209 G236 Q237 G256 G257,I86 Q90 R209 G236 Q237 G256 G257,C81 E119 R163 R165,C81 E119 R163 R165,2.1.1.37,00036770003886000816800093070032259,,,,,P05102,9925782.0,402 ~ 413,MIEIKDKQLTGLRFIDLFAGLGGFRLALESCGAECVYSNEWDKYAQ...
1,10mh,A,2.55,BS02,dna,C,0,F79 C81 S87 E119 R165 R228 Q237 R240 I249 T250...,F79 C81 S87 E119 R165 R228 Q237 R240 I249 T250...,C81 E119 R163 R165,C81 E119 R163 R165,2.1.1.37,00036770003886000816800093070032259,,,,,P05102,9925782.0,422 ~ 433,MIEIKDKQLTGLRFIDLFAGLGGFRLALESCGAECVYSNEWDKYAQ...
2,10mh,A,2.55,BS03,SAH,A,1,F18 A19 G20 L21 G23 E40 W41 D60 N304 S305,F18 A19 G20 L21 G23 E40 W41 D60 N304 S305,C81 E119 R163 R165,C81 E119 R163 R165,2.1.1.37,00036770003886000816800093070032259,,,,,P05102,9925782.0,328,MIEIKDKQLTGLRFIDLFAGLGGFRLALESCGAECVYSNEWDKYAQ...
3,11as,A,2.5,BS01,ASN,A,1,D46 A74 Q116 Y218 S251 R255,D43 A71 Q113 Y215 S248 R252,D46 R100 Q116 D235 E248 S251,D43 R97 Q113 D232 E245 S248,6.3.1.1,"0004071,0005524,0005737,0005829,0006529,000697...",,,,,P00963,9437423.0,331,AYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLS...
4,11ba,A,2.06,BS01,UPA,A,1,K41 V43 N44 T45 N67 Q69 N71 A109 H119 F120,K41 V43 N44 T45 N67 Q69 N71 A109 H119 F120,H12 K41 H119 F120 D121,H12 K41 H119 F120 D121,4.6.1.18,"0003676,0004519,0004522,0004540,0005576,001682...",,,,,P00669,10082366.0,125,KESAAAKFERQHMDSGNSPSSSSNYCNLMMCCRKMTQGKCKPVNTF...


In [12]:
# List the column labels currently attached to df.
df.columns

Index(['pdb_id', 'receptor_chain', 'resolution', 'binding_site_id',
       'ligand_id', 'ligand_chain', 'ligand_serial_number',
       'binding_residues_pdb', 'binding_residues_renum',
       'catalytic_residues_pdb', 'catalytic_residues_renum', 'ec_number',
       'go_terms', 'binding_affinity_literature', 'binding_affinity_moad',
       'binding_affinity_pdbbind', 'binding_affinity_bindingdb', 'uniprot_id',
       'pubmed_id', 'ligand_residue_seq_number', 'receptor_sequence'],
      dtype='object')