# 04. Gather the datasets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import multiprocessing as mp

from standardize import standardize

[09:46:35] Initializing Normalizer


In [2]:
from rdkit.rdBase import BlockLogs
block = BlockLogs()

In [3]:
bdb = pd.read_csv('bdb.csv')
biolip = pd.read_csv('biolip.csv')
chembl = pd.read_csv('chembl.csv')

In [4]:
data = pd.concat([bdb, biolip, chembl], ignore_index=True)
data = data.dropna(subset=['ki', 'kd', 'ic50', 'ec50'], how='all')
data = data.dropna(subset=['smiles', 'sequence', 'uniprot_id'], how='any')
data

Unnamed: 0,smiles,ki,ic50,kd,ec50,sequence,ki_sign,ic50_sign,kd_sign,ec50_sign,uniprot_id,source
0,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CCCCCC(O)=...,0.24,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
1,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,0.25,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
2,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,0.41,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
3,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,0.80,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
4,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,0.99,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
...,...,...,...,...,...,...,...,...,...,...,...,...
3690158,COc1ccc(C2=N[C@@H](c3ccc(Cl)cc3)[C@@H](c3ccc(C...,,,45100.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl
3690159,O=c1nc2n(-c3ccc(Cl)cc3)c3ccc([N+](=O)[O-])cc3c...,,,55800.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl
3690160,O=C1Nc2ccccc2C12/C(=N\CC1CC1)NC(=S)N2CC1CC1,,62800.0,,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,=,,,Q00987,chembl
3690161,C[C@H](c1ccc(Cl)cc1N)N1C(=O)c2cc(I)ccc2N(CCCCC...,,77100.0,,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,=,,,Q00987,chembl


In [5]:
with mp.Pool(mp.cpu_count()) as pool:
    standard_smiles = list(
        tqdm(pool.imap(standardize, data['smiles']), total=len(data), desc="Standardizing SMILES")
    )

Standardizing SMILES: 100%|██████████| 3683103/3683103 [12:24<00:00, 4947.61it/s]


In [6]:
data['smiles'] = standard_smiles
data

Unnamed: 0,smiles,ki,ic50,kd,ec50,sequence,ki_sign,ic50_sign,kd_sign,ec50_sign,uniprot_id,source
0,O=C(O)CCCCCN1C(=O)N(CCCCCC(=O)O)[C@H](Cc2ccccc...,0.24,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
1,O=C1N(C/C=C/c2cn[nH]c2)[C@H](Cc2ccccc2)[C@H](O...,0.25,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
2,O=C1N(C/C=C/c2cn[nH]c2)[C@H](Cc2ccccc2)[C@H](O...,0.41,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
3,O=C1N(CCCCCCO)[C@H](Cc2ccccc2)[C@H](O)[C@@H](O...,0.80,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
4,O=C1N(CCCCCO)[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)...,0.99,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
...,...,...,...,...,...,...,...,...,...,...,...,...
3690158,COc1ccc(C2=N[C@@H](c3ccc(Cl)cc3)[C@@H](c3ccc(C...,,,45100.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl
3690159,O=c1nc2n(-c3ccc(Cl)cc3)c3ccc([N+](=O)[O-])cc3c...,,,55800.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl
3690160,O=C1Nc2ccccc2C12/C(=N\CC1CC1)NC(=S)N2CC1CC1,,62800.0,,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,=,,,Q00987,chembl
3690161,C[C@H](c1ccc(Cl)cc1N)N1C(=O)c2cc(I)ccc2N(CCCCC...,,77100.0,,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,=,,,Q00987,chembl


In [7]:
def filter_by_source(group):
    # Prefer bdb data over biolip and chembl
    if 'bdb' in group['source'].values:
        return group[group['source'] == 'bdb']
    return group

data_with_bdb = data.groupby(['smiles', 'sequence'], group_keys=False).apply(filter_by_source)
data_with_bdb

  data_with_bdb = data.groupby(['smiles', 'sequence'], group_keys=False).apply(filter_by_source)


Unnamed: 0,smiles,ki,ic50,kd,ec50,sequence,ki_sign,ic50_sign,kd_sign,ec50_sign,uniprot_id,source
994804,*#Cc1cnc2c(Cl)cc(N[C@@H](C3=CN(C4CC4)NN3)c3ccc...,,8.000,,,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,,=,,,P00533,bdb
1203252,*Bc1ccc(C)c(NC(=O)[C@@H]2C[C@@]3(CN4CC5(CCC5)C...,,1000.000,,,MHSWERLAVLVLLGAAACAAPPRGRILGGREAEAHARPYMASVQLN...,,<,,,P00746,bdb
1166508,*C(*)C1N=NC([C@H]2CC[C@H](Nc3cc(-c4ccc5cc(C#N)...,,0.639,,,MNKPITPSTYVRCLNVGLIRKLSDFIDPQEGWKKLAVAIKKPSGDD...,,=,,,Q9NWZ3,bdb
1130105,*C1CCN([C@@]2(N)CCCN(c3ccnc4[nH]cc(-c5cncnc5)c...,9.33,,,,MSAKVRLKKLEQLLLDGPWRNESALSVETLLDVLVCLYTECSHSAL...,=,,,,Q9Y5S2,bdb
1130107,*C1CCN([C@@]2(N)CCCN(c3ccnc4[nH]cc(-c5cncnc5)c...,495.00,,,,MSRPPPTGKMPGAPETAPGDGAGASRQRKLEALIRDPRSPINVESL...,=,,,,O75116,bdb
...,...,...,...,...,...,...,...,...,...,...,...,...
68403,c1nnnn1C12CC3CC(CC(C3)C1)C2,,100000.000,,,MPCIQAQYGTPAPSPGPRDHLASDPLTPEFIKPTMDLASPEAAPAA...,,>,,,P22736,bdb
1354123,c1nsnc1OC1CN2CCC1CC2,,833.000,,,MNTSVPPAVSPNITVLAPGKGPWQVAFIGITTGLLSLATVTGNLLV...,,=,,,P08482,bdb
1403576,c1nsnc1OC1CN2CCC1CC2,,537.000,,,MNTSVPPAVSPNITVLAPGKGPWQVAFIGITTGLLSLATVTGNLLV...,,=,,,P08482,bdb
2035072,c1scc2c1CCCC2CCCN1CCCCC1,1430.00,,,,MDVFSFGQGNNTTASQEPFGTGGNVTSISDVTFSYQVITSLLLGTL...,>,,,,P19327,bdb


In [8]:
data_no_asterisk = data_with_bdb[~data_with_bdb['smiles'].str.contains('\*', na=False)].reset_index(drop=True)
data_no_asterisk

Unnamed: 0,smiles,ki,ic50,kd,ec50,sequence,ki_sign,ic50_sign,kd_sign,ec50_sign,uniprot_id,source
0,BP(=O)(COCCn1cnc2c(N)ncnc21)OP(=O)(O)OP(=O)(O)O,,17000.0,,,PISPITVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTEMEKEGKI...,,=,,,Q9WKE8,bdb
1,BP(=O)(COCCn1cnc2c(N)ncnc21)OP(=O)(O)OP(=O)(O)O,,,6500.0,,PISPITVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTEMEKEGKI...,,,=,,Q9WKE8,bdb
2,BP(=O)(CO[C@H](C)Cn1cnc2c(N)ncnc21)OP(=O)(O)OP...,,14500.0,,,PISPITVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTEMEKEGKI...,,=,,,Q9WKE8,bdb
3,BP(=O)(CO[C@H](C)Cn1cnc2c(N)ncnc21)OP(=O)(O)OP...,,,33000.0,,PISPITVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTEMEKEGKI...,,,=,,Q9WKE8,bdb
4,BP(=O)(OCC1C=CC(n2cc(C)c(=O)[nH]c2=O)O1)OP(=O)...,45.0,,,,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,=,,,,Q72547,bdb
...,...,...,...,...,...,...,...,...,...,...,...,...
2888711,c1nnnn1C12CC3CC(CC(C3)C1)C2,,100000.0,,,MPCIQAQYGTPAPSPGPRDHLASDPLTPEFIKPTMDLASPEAAPAA...,,>,,,P22736,bdb
2888712,c1nsnc1OC1CN2CCC1CC2,,833.0,,,MNTSVPPAVSPNITVLAPGKGPWQVAFIGITTGLLSLATVTGNLLV...,,=,,,P08482,bdb
2888713,c1nsnc1OC1CN2CCC1CC2,,537.0,,,MNTSVPPAVSPNITVLAPGKGPWQVAFIGITTGLLSLATVTGNLLV...,,=,,,P08482,bdb
2888714,c1scc2c1CCCC2CCCN1CCCCC1,1430.0,,,,MDVFSFGQGNNTTASQEPFGTGGNVTSISDVTFSYQVITSLLLGTL...,>,,,,P19327,bdb


In [9]:
sequence_to_uniprot_id = data_no_asterisk[['sequence', 'uniprot_id']].drop_duplicates().set_index('sequence')['uniprot_id'].to_dict()

# Get PLINDER split

In [13]:
PATH_TO_PLINDER_ANNOTATION = "../../mol-finder/data/raw/plinder/annotation_table.parquet"
annotation_data = pd.read_parquet(PATH_TO_PLINDER_ANNOTATION)
annotation_data

Unnamed: 0,entry_pdb_id,entry_release_date,entry_oligomeric_state,entry_determination_method,entry_keywords,entry_pH,entry_resolution,entry_rfree,entry_r,entry_clashscore,...,ligand_interacting_ligand_chains_UniProt,system_ligand_chains_PANTHER,ligand_interacting_ligand_chains_Pfam,ligand_neighboring_ligand_chains_Pfam,ligand_interacting_ligand_chains_PANTHER,ligand_neighboring_ligand_chains_PANTHER,system_ligand_chains_SCOP2,system_ligand_chains_SCOP2B,pli_qcov__100__strong__component,protein_lddt_qcov_weighted_sum__100__strong__component
0,3grt,1997-02-12,dimeric,X-RAY DIFFRACTION,OXIDOREDUCTASE,8.0,2.50,,0.1700,12.90,...,,,,,,,,,c243140,c635
1,3grt,1997-02-12,dimeric,X-RAY DIFFRACTION,OXIDOREDUCTASE,8.0,2.50,,0.1700,12.90,...,,,,,,,,,c169758,c635
2,3grt,1997-02-12,dimeric,X-RAY DIFFRACTION,OXIDOREDUCTASE,8.0,2.50,,0.1700,12.90,...,,,,,,,,,c242976,c635
3,3grt,1997-02-12,dimeric,X-RAY DIFFRACTION,OXIDOREDUCTASE,8.0,2.50,,0.1700,12.90,...,,,,,,,,,c173553,c635
4,1grx,1993-10-01,monomeric,SOLUTION NMR,ELECTRON TRANSPORT,,,,,,...,,,,,,,,,c186761,c167274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1357899,4lpn,2013-07-16,24-meric,X-RAY DIFFRACTION,OXIDOREDUCTASE,9,1.66,0.1768,0.1586,3.34,...,,,,,,,,,,
1357900,2lp3,2012-01-31,dimeric,SOLUTION NMR,METAL BINDING PROTEIN,,,,,,...,,,,,,,,,,
1357901,2lp3,2012-01-31,dimeric,SOLUTION NMR,METAL BINDING PROTEIN,,,,,,...,,,,,,,,,,
1357902,2lp3,2012-01-31,dimeric,SOLUTION NMR,METAL BINDING PROTEIN,,,,,,...,,,,,,,,,,


In [14]:
system_to_uniprot = annotation_data.set_index('system_id')['system_pocket_UniProt'].to_dict()

In [15]:
PATH_TO_SPLIT = "../../mol-finder/data/raw/plinder/split.parquet"
split_data = pd.read_parquet(PATH_TO_SPLIT)
split_data

Unnamed: 0,system_id,uniqueness,split,cluster,cluster_for_val_split,system_pass_validation_criteria,system_pass_statistics_criteria,system_proper_num_ligand_chains,system_proper_pocket_num_residues,system_proper_num_interactions,system_proper_ligand_max_molecular_weight,system_has_binding_affinity,system_has_apo_or_pred
0,101m__1__1.A__1.C_1.D,101m__A__C_D_c188899,train,c14,c0,True,True,1,27,20,616.177293,False,False
1,102m__1__1.A__1.C,102m__A__C_c237197,train,c14,c0,True,True,1,26,20,616.177293,False,True
2,103m__1__1.A__1.C_1.D,103m__A__C_D_c252759,train,c14,c0,False,True,1,26,16,616.177293,False,False
3,104m__1__1.A__1.C_1.D,104m__A__C_D_c274687,train,c14,c0,False,True,1,27,21,616.177293,False,False
4,105m__1__1.A__1.C_1.D,105m__A__C_D_c221688,train,c14,c0,False,True,1,28,20,616.177293,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
409721,9xia__1__2.A_4.A__4.B_4.D,9xia__A_A__B_D_c20731,train,c256,c126,False,False,1,23,6,178.084124,False,False
409722,9xim__1__1.A_1.B__1.E_1.F_1.G,9xim__A_B__E_F_G_c240203,train,c256,c126,False,False,1,21,6,150.052823,False,False
409723,9xim__1__1.A_1.B__1.H_1.I_1.J,9xim__A_B__H_I_J_c313183,train,c256,c126,False,False,1,19,5,150.052823,False,False
409724,9xim__1__1.C_1.D__1.K_1.L_1.M,9xim__C_D__K_L_M_c215891,train,c256,c126,False,False,1,20,3,150.052823,False,False


In [17]:
train_uniprots = set()
test_uniprots = set()

for _, row in tqdm(split_data.iterrows(), total=len(split_data)):
    if row['split'] == 'train':
        train_uniprots.add(system_to_uniprot[row['system_id']])
    elif row['split'] in ['val', 'test']:
        test_uniprots.add(system_to_uniprot[row['system_id']])

100%|██████████| 409726/409726 [00:15<00:00, 25648.76it/s]


In [18]:
data_uniprots = data_no_asterisk['uniprot_id'].to_list()
split = []

for uniprot in data_uniprots:
    if uniprot in train_uniprots:
        split.append('train')
    elif uniprot in test_uniprots:
        split.append('test')
    else:
        split.append(None)

data_no_asterisk['split'] = split
data_no_asterisk

Unnamed: 0,smiles,ki,ic50,kd,ec50,sequence,ki_sign,ic50_sign,kd_sign,ec50_sign,uniprot_id,source,split
0,BP(=O)(COCCn1cnc2c(N)ncnc21)OP(=O)(O)OP(=O)(O)O,,17000.0,,,PISPITVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTEMEKEGKI...,,=,,,Q9WKE8,bdb,
1,BP(=O)(COCCn1cnc2c(N)ncnc21)OP(=O)(O)OP(=O)(O)O,,,6500.0,,PISPITVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTEMEKEGKI...,,,=,,Q9WKE8,bdb,
2,BP(=O)(CO[C@H](C)Cn1cnc2c(N)ncnc21)OP(=O)(O)OP...,,14500.0,,,PISPITVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTEMEKEGKI...,,=,,,Q9WKE8,bdb,
3,BP(=O)(CO[C@H](C)Cn1cnc2c(N)ncnc21)OP(=O)(O)OP...,,,33000.0,,PISPITVPVKLKPGMDGPKVKQWPLTEEKIKALTEICTEMEKEGKI...,,,=,,Q9WKE8,bdb,
4,BP(=O)(OCC1C=CC(n2cc(C)c(=O)[nH]c2=O)O1)OP(=O)...,45.0,,,,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,=,,,,Q72547,bdb,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2888711,c1nnnn1C12CC3CC(CC(C3)C1)C2,,100000.0,,,MPCIQAQYGTPAPSPGPRDHLASDPLTPEFIKPTMDLASPEAAPAA...,,>,,,P22736,bdb,train
2888712,c1nsnc1OC1CN2CCC1CC2,,833.0,,,MNTSVPPAVSPNITVLAPGKGPWQVAFIGITTGLLSLATVTGNLLV...,,=,,,P08482,bdb,
2888713,c1nsnc1OC1CN2CCC1CC2,,537.0,,,MNTSVPPAVSPNITVLAPGKGPWQVAFIGITTGLLSLATVTGNLLV...,,=,,,P08482,bdb,
2888714,c1scc2c1CCCC2CCCN1CCCCC1,1430.0,,,,MDVFSFGQGNNTTASQEPFGTGGNVTSISDVTFSYQVITSLLLGTL...,>,,,,P19327,bdb,


In [21]:
data_no_asterisk = data_no_asterisk.dropna(subset=['split']).reset_index(drop=True)
data_no_asterisk

Unnamed: 0,smiles,ki,ic50,kd,ec50,sequence,ki_sign,ic50_sign,kd_sign,ec50_sign,uniprot_id,source,split
0,BP(=O)(OCC1C=CC(n2cc(C)c(=O)[nH]c2=O)O1)OP(=O)...,45.0,,,,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,=,,,,Q72547,bdb,train
1,BP(=O)(OCC1C=CC(n2cnc3c(=O)[nH]c(N)nc32)C1)OP(...,34.0,,,,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,=,,,,Q72547,bdb,train
2,BP(=O)(OCC1CCC(n2cc(C)c(=O)[nH]c2=O)O1)OP(=O)(...,236.0,,,,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,=,,,,Q72547,bdb,train
3,BP(=O)(OCC1CCC(n2ccc(=O)[nH]c2=O)O1)OP(=O)(O)C...,438.0,,,,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,=,,,,Q72547,bdb,train
4,BP(=O)(OCC1CCC(n2ccc(N)nc2=O)O1)OP(=O)(O)C(F)(...,23.0,,,,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,=,,,,Q72547,bdb,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2060173,c1nnc(NCc2ccc3c(c2)OCO3)[nH]1,,17940.0,,,MYFSSLCKFLPISEKEKIYLNIVKKRFCKSNIYYNNNNNNIINYNK...,,=,,,Q8IL11,bdb,train
2060174,c1nnc(SSc2nncs2)s1,,560.0,,,MADQAPFDTDVNTLTRFVMEEGRKARGTGELTQLLNSLCTAVKAIS...,,=,,,P09467,bdb,train
2060175,c1nnc2n1CCCC2,,244000.0,,,MARTTSQLYDAVPIQSSVVLCSCPSPSMVRTQTESSTPPGIPGGSR...,,=,,,O15530,bdb,train
2060176,c1nnc2nnc(N3CCOCC3)nn12,,11770.0,,,MKAPAVLAPGILVLLFTLVQRSNGECKEALAKSEMNVNMKYQLPNF...,,=,,,P08581,bdb,train


In [22]:
data_no_asterisk.to_csv('raw_data.csv', index=False)

# Binarization and deduplication

In [20]:
data = pd.read_csv('raw_no_pains_data.csv')
data

Unnamed: 0,smiles,ki,ic50,kd,ec50,sequence,ki_sign,ic50_sign,kd_sign,ec50_sign,uniprot_id,source,split
0,BP(=O)(OCC1C=CC(n2cc(C)c(=O)[nH]c2=O)O1)OP(=O)...,45.0,,,,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,=,,,,Q72547,bdb,train
1,BP(=O)(OCC1C=CC(n2cnc3c(=O)[nH]c(N)nc32)C1)OP(...,34.0,,,,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,=,,,,Q72547,bdb,train
2,BP(=O)(OCC1CCC(n2cc(C)c(=O)[nH]c2=O)O1)OP(=O)(...,236.0,,,,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,=,,,,Q72547,bdb,train
3,BP(=O)(OCC1CCC(n2ccc(=O)[nH]c2=O)O1)OP(=O)(O)C...,438.0,,,,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,=,,,,Q72547,bdb,train
4,BP(=O)(OCC1CCC(n2ccc(N)nc2=O)O1)OP(=O)(O)C(F)(...,23.0,,,,PISPIETVPVKLKPGMDGPKVKQWPLTEEKIKALVEICTEMEKEGK...,=,,,,Q72547,bdb,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1982636,c1nnc(NCc2ccc3c(c2)OCO3)[nH]1,,17940.0,,,MYFSSLCKFLPISEKEKIYLNIVKKRFCKSNIYYNNNNNNIINYNK...,,=,,,Q8IL11,bdb,train
1982637,c1nnc(SSc2nncs2)s1,,560.0,,,MADQAPFDTDVNTLTRFVMEEGRKARGTGELTQLLNSLCTAVKAIS...,,=,,,P09467,bdb,train
1982638,c1nnc2n1CCCC2,,244000.0,,,MARTTSQLYDAVPIQSSVVLCSCPSPSMVRTQTESSTPPGIPGGSR...,,=,,,O15530,bdb,train
1982639,c1nnc2nnc(N3CCOCC3)nn12,,11770.0,,,MKAPAVLAPGILVLLFTLVQRSNGECKEALAKSEMNVNMKYQLPNF...,,=,,,P08581,bdb,train


In [21]:
train = data[data['split'] == 'train'].reset_index(drop=True)
test = data[data['split'] == 'test'].reset_index(drop=True)

is_ki_nan = test['ki'].isna()
is_kd_nan = test['kd'].isna()
test = test[~is_ki_nan | ~is_kd_nan]
test

Unnamed: 0,smiles,ki,ic50,kd,ec50,sequence,ki_sign,ic50_sign,kd_sign,ec50_sign,uniprot_id,source,split
8,B[P@@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[...,,,0.195,,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,,,=,,Q86WV6,bdb,test
11,B[P@@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[...,,,42.000,,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,,,=,,Q86WV6,bdb,test
12,B[P@@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[...,,,510.000,,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,,,=,,Q86WV6,bdb,test
24,B[P@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[P...,,,49.000,,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,,,=,,Q86WV6,bdb,test
25,B[P@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[P...,,,1270.000,,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,,,=,,Q86WV6,bdb,test
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120884,c1nc(NCC2CCOC2)c2cc(-c3ccc4c(c3)OCO4)ccc2n1,,,310.000,,MASSSVPPATVSAATAGPGPGFGFASKTKKKHFVQQKVKVFRAADP...,,,=,,Q8TBX8,bdb,test
120891,c1nc2cc(NCc3ccc4c(c3)OCCO4)cnc2[nH]1,543.0,,,,MAGGRHRRVVGTLHLLLLVAALPWASRGVSPSASAWPEEKNYHQPA...,=,,,,Q16769,bdb,test
120892,c1nc2cc(NCc3ccsc3)cnc2[nH]1,1870.0,,,,MAGGRHRRVVGTLHLLLLVAALPWASRGVSPSASAWPEEKNYHQPA...,=,,,,Q16769,bdb,test
120893,c1nc2ccc(-c3nnc(CCc4ccc5c(c4)OCO5)o3)cc2[nH]1,415.0,,,,MAGGRHRRVVGTLHLLLLVAALPWASRGVSPSASAWPEEKNYHQPA...,=,,,,Q16769,bdb,test


In [22]:
def binarize_row(row, threshold=1000):
    activity_cols = ['ki', 'kd']
    sign_cols = ['ki_sign', 'kd_sign']
    for activity_col, sign_col in zip(activity_cols, sign_cols):
        value = row[activity_col]
        sign = row[sign_col]
        if pd.notnull(value):
            if sign == '=':
                return 1 if value < threshold else 0
            elif sign == '>':
                return 0 if value > threshold else np.nan
            elif sign == '<':
                return 1 if value < threshold else np.nan
            else:
                return np.nan
    return np.nan  # No activity value found

test['is_active'] = test.apply(lambda row: binarize_row(row, threshold=1000), axis=1)
test

Unnamed: 0,smiles,ki,ic50,kd,ec50,sequence,ki_sign,ic50_sign,kd_sign,ec50_sign,uniprot_id,source,split,is_active
8,B[P@@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[...,,,0.195,,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,,,=,,Q86WV6,bdb,test,1.0
11,B[P@@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[...,,,42.000,,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,,,=,,Q86WV6,bdb,test,1.0
12,B[P@@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[...,,,510.000,,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,,,=,,Q86WV6,bdb,test,1.0
24,B[P@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[P...,,,49.000,,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,,,=,,Q86WV6,bdb,test,1.0
25,B[P@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[P...,,,1270.000,,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,,,=,,Q86WV6,bdb,test,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120884,c1nc(NCC2CCOC2)c2cc(-c3ccc4c(c3)OCO4)ccc2n1,,,310.000,,MASSSVPPATVSAATAGPGPGFGFASKTKKKHFVQQKVKVFRAADP...,,,=,,Q8TBX8,bdb,test,1.0
120891,c1nc2cc(NCc3ccc4c(c3)OCCO4)cnc2[nH]1,543.0,,,,MAGGRHRRVVGTLHLLLLVAALPWASRGVSPSASAWPEEKNYHQPA...,=,,,,Q16769,bdb,test,1.0
120892,c1nc2cc(NCc3ccsc3)cnc2[nH]1,1870.0,,,,MAGGRHRRVVGTLHLLLLVAALPWASRGVSPSASAWPEEKNYHQPA...,=,,,,Q16769,bdb,test,0.0
120893,c1nc2ccc(-c3nnc(CCc4ccc5c(c4)OCO5)o3)cc2[nH]1,415.0,,,,MAGGRHRRVVGTLHLLLLVAALPWASRGVSPSASAWPEEKNYHQPA...,=,,,,Q16769,bdb,test,1.0


In [23]:
def is_consistent(group):
    labels = group['is_active'].dropna().unique()
    return len(labels) == 1

# Filter groups with consistent labels
consistent_groups = test.groupby(['smiles', 'sequence']).filter(is_consistent)

# Deduplicate by keeping one row per group
test = consistent_groups.groupby(['smiles', 'sequence'], as_index=False).first()
test

Unnamed: 0,smiles,sequence,ki,ic50,kd,ec50,ki_sign,ic50_sign,kd_sign,ec50_sign,uniprot_id,source,split,is_active
0,B[P@@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[...,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,,,0.195,,,,=,,Q86WV6,bdb,test,1.0
1,B[P@@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[...,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,,,42.000,,,,=,,Q86WV6,bdb,test,1.0
2,B[P@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[P...,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,,,1.580,,,,=,,Q86WV6,bdb,test,1.0
3,Brc1cccc(COc2ccc(C3C[C@@H]3NCCC3CCCCC3)cc2)c1,MSNKCDVVVVGGGISGMAAAKLLHDSGLNVVVLEARDRVGGRTYTL...,550.0,,,,=,,,,P27338,bdb,test,1.0
4,Brc1cccc(COc2ccc3c(c2)OCCO3)c1,MSNKCDVVVVGGGISGMAAAKLLHDSGLNVVVLEARDRVGGRTYTL...,41.0,,,,=,,,,P27338,bdb,test,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14366,c1nc(NCC2CCOC2)c2cc(-c3ccc4c(c3)OCO4)ccc2n1,MASSSVPPATVSAATAGPGPGFGFASKTKKKHFVQQKVKVFRAADP...,,,310.000,,,,=,,Q8TBX8,bdb,test,1.0
14367,c1nc2cc(NCc3ccc4c(c3)OCCO4)cnc2[nH]1,MAGGRHRRVVGTLHLLLLVAALPWASRGVSPSASAWPEEKNYHQPA...,543.0,,,,=,,,,Q16769,bdb,test,1.0
14368,c1nc2cc(NCc3ccsc3)cnc2[nH]1,MAGGRHRRVVGTLHLLLLVAALPWASRGVSPSASAWPEEKNYHQPA...,1870.0,,,,=,,,,Q16769,bdb,test,0.0
14369,c1nc2ccc(-c3nnc(CCc4ccc5c(c4)OCO5)o3)cc2[nH]1,MAGGRHRRVVGTLHLLLLVAALPWASRGVSPSASAWPEEKNYHQPA...,415.0,,,,=,,,,Q16769,bdb,test,1.0


In [24]:
test['is_active'].value_counts()

is_active
1.0    8524
0.0    5847
Name: count, dtype: int64

In [25]:
test = test.drop(columns=['ki', 'kd', 'ki_sign', 'kd_sign', 'ic50', 'ec50', 'ic50_sign', 'ec50_sign'])
test

Unnamed: 0,smiles,sequence,uniprot_id,source,split,is_active
0,B[P@@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[...,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,Q86WV6,bdb,test,1.0
1,B[P@@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[...,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,Q86WV6,bdb,test,1.0
2,B[P@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[P...,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,Q86WV6,bdb,test,1.0
3,Brc1cccc(COc2ccc(C3C[C@@H]3NCCC3CCCCC3)cc2)c1,MSNKCDVVVVGGGISGMAAAKLLHDSGLNVVVLEARDRVGGRTYTL...,P27338,bdb,test,1.0
4,Brc1cccc(COc2ccc3c(c2)OCCO3)c1,MSNKCDVVVVGGGISGMAAAKLLHDSGLNVVVLEARDRVGGRTYTL...,P27338,bdb,test,1.0
...,...,...,...,...,...,...
14366,c1nc(NCC2CCOC2)c2cc(-c3ccc4c(c3)OCO4)ccc2n1,MASSSVPPATVSAATAGPGPGFGFASKTKKKHFVQQKVKVFRAADP...,Q8TBX8,bdb,test,1.0
14367,c1nc2cc(NCc3ccc4c(c3)OCCO4)cnc2[nH]1,MAGGRHRRVVGTLHLLLLVAALPWASRGVSPSASAWPEEKNYHQPA...,Q16769,bdb,test,1.0
14368,c1nc2cc(NCc3ccsc3)cnc2[nH]1,MAGGRHRRVVGTLHLLLLVAALPWASRGVSPSASAWPEEKNYHQPA...,Q16769,bdb,test,0.0
14369,c1nc2ccc(-c3nnc(CCc4ccc5c(c4)OCO5)o3)cc2[nH]1,MAGGRHRRVVGTLHLLLLVAALPWASRGVSPSASAWPEEKNYHQPA...,Q16769,bdb,test,1.0


In [26]:
test.to_csv('test.csv', index=False)

In [28]:
train.to_csv('train.csv', index=False)