# 04. Merge the datasets

In [1]:
import multiprocessing as mp
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from rdkit.rdBase import BlockLogs
from tqdm import tqdm

from standardize import standardize

[20:37:37] Initializing Normalizer


In [2]:
# Directory that holds the per-source CSV inputs and every checkpoint/output file written by this notebook.
DATA_DIR = Path('../data/')

In [3]:
# Create an RDKit BlockLogs object and keep a reference to it for the rest of the session.
# NOTE(review): presumably this silences RDKit log output while the object is alive — confirm against the RDKit docs.
block = BlockLogs()

In [4]:
# Load the three per-source activity tables (BindingDB, BioLip, ChEMBL) from DATA_DIR.
bdb, biolip, chembl = (
    pd.read_csv(DATA_DIR / filename)
    for filename in ('bdb.csv', 'biolip.csv', 'chembl.csv')
)

In [5]:
# Stack the three sources into one table, then keep only rows that
#   * carry at least one activity measurement (Ki, Kd, IC50 or EC50), and
#   * have a ligand SMILES, a protein sequence and a UniProt ID.
data = (
    pd.concat([bdb, biolip, chembl], ignore_index=True)
    .dropna(subset=['ki', 'kd', 'ic50', 'ec50'], how='all')
    .dropna(subset=['smiles', 'sequence', 'uniprot_id'], how='any')
)
data

Unnamed: 0,smiles,ki,ic50,kd,ec50,sequence,ki_sign,ic50_sign,kd_sign,ec50_sign,uniprot_id,source
0,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CCCCCC(O)=...,0.24,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
1,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,0.25,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
2,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,0.41,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
3,OCCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@...,0.80,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
4,OCCCCCN1[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)[C@@H...,0.99,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
...,...,...,...,...,...,...,...,...,...,...,...,...
3703516,COc1ccc(C2=N[C@@H](c3ccc(Cl)cc3)[C@@H](c3ccc(C...,,,45100.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl
3703517,O=c1nc2n(-c3ccc(Cl)cc3)c3ccc([N+](=O)[O-])cc3c...,,,55800.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl
3703518,O=C1Nc2ccccc2C12/C(=N\CC1CC1)NC(=S)N2CC1CC1,,62800.0,,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,=,,,Q00987,chembl
3703519,C[C@H](c1ccc(Cl)cc1N)N1C(=O)c2cc(I)ccc2N(CCCCC...,,77100.0,,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,=,,,Q00987,chembl


In [6]:
# Standardize every SMILES string in parallel, one worker process per CPU core.
# `imap` yields results in input order, so the resulting list lines up with the rows of `data`.
# An explicit chunksize sends the SMILES to the workers in batches; with the default of 1 each
# string is dispatched as its own task, which is needlessly slow for millions of rows.
with mp.Pool(mp.cpu_count()) as pool:
    data['smiles'] = list(
        tqdm(
            pool.imap(standardize, data['smiles'], chunksize=1000),
            total=len(data),
            desc="Standardizing SMILES",
        )
    )

Standardizing SMILES:   0%|          | 0/3696409 [00:00<?, ?it/s]

Standardizing SMILES: 100%|██████████| 3696409/3696409 [11:48<00:00, 5218.30it/s]


In [7]:
# Remove rows with invalid SMILES after standardization
# NOTE(review): assumes `standardize` yields a missing value for SMILES it cannot process — confirm.
data = data[data['smiles'].notna()].reset_index(drop=True)
# Filter out SMILES with asterisks.
# The asterisk is matched as a literal character (regex=False) instead of through the pattern '\*',
# which is an invalid escape sequence inside a regular (non-raw) Python string literal.
has_asterisk = data['smiles'].str.contains('*', regex=False, na=False)
data = data[~has_asterisk].reset_index(drop=True)
data

Unnamed: 0,smiles,ki,ic50,kd,ec50,sequence,ki_sign,ic50_sign,kd_sign,ec50_sign,uniprot_id,source
0,O=C(O)CCCCCN1C(=O)N(CCCCCC(=O)O)[C@H](Cc2ccccc...,0.24,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
1,O=C1N(C/C=C/c2cn[nH]c2)[C@H](Cc2ccccc2)[C@H](O...,0.25,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
2,O=C1N(C/C=C/c2cn[nH]c2)[C@H](Cc2ccccc2)[C@H](O...,0.41,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
3,O=C1N(CCCCCCO)[C@H](Cc2ccccc2)[C@H](O)[C@@H](O...,0.80,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
4,O=C1N(CCCCCO)[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)...,0.99,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
...,...,...,...,...,...,...,...,...,...,...,...,...
3628040,COc1ccc(C2=N[C@@H](c3ccc(Cl)cc3)[C@@H](c3ccc(C...,,,45100.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl
3628041,O=c1nc2n(-c3ccc(Cl)cc3)c3ccc([N+](=O)[O-])cc3c...,,,55800.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl
3628042,O=C1Nc2ccccc2C12/C(=N\CC1CC1)NC(=S)N2CC1CC1,,62800.0,,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,=,,,Q00987,chembl
3628043,C[C@H](c1ccc(Cl)cc1N)N1C(=O)c2cc(I)ccc2N(CCCCC...,,77100.0,,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,=,,,Q00987,chembl


In [8]:
# Prefer BindingDB data over BioLip and ChEMBL:
# if a (SMILES, sequence) pair has at least one BindingDB row, keep only its BindingDB rows;
# pairs without any BindingDB row keep all of their rows.
is_bdb = data['source'].eq('bdb')
# Built-in 'any' aggregation instead of a Python lambda per group: same result, always boolean,
# and far cheaper when there are millions of groups.
group_has_bdb = is_bdb.groupby([data['smiles'], data['sequence']]).transform('any')
mask = (~group_has_bdb) | is_bdb
data = data[mask].reset_index(drop=True)

In [9]:
# Preview the table after the source-preference filtering.
data

Unnamed: 0,smiles,ki,ic50,kd,ec50,sequence,ki_sign,ic50_sign,kd_sign,ec50_sign,uniprot_id,source
0,O=C(O)CCCCCN1C(=O)N(CCCCCC(=O)O)[C@H](Cc2ccccc...,0.24,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
1,O=C1N(C/C=C/c2cn[nH]c2)[C@H](Cc2ccccc2)[C@H](O...,0.25,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
2,O=C1N(C/C=C/c2cn[nH]c2)[C@H](Cc2ccccc2)[C@H](O...,0.41,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
3,O=C1N(CCCCCCO)[C@H](Cc2ccccc2)[C@H](O)[C@@H](O...,0.80,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
4,O=C1N(CCCCCO)[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)...,0.99,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb
...,...,...,...,...,...,...,...,...,...,...,...,...
2901730,Clc1ccc2c(C(NCC3CC3)c3cccc4ccccc34)c[nH]c2c1,,,18100.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl
2901731,O=C(O)[C@H]1c2ccccc2C(=O)N(Cc2ccc(Cl)cc2)[C@@H...,,18300.0,,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,=,,,Q00987,chembl
2901732,COCCNC(=O)[C@H]1c2ccccc2C(=O)N(Cc2ccc(Cl)cc2)[...,,,22700.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl
2901733,COC(=O)C(c1ccc(Cl)cc1)N1C(=O)c2cc(I)ccc2N(CCCC...,,,32100.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl


In [10]:
# Checkpoint the merged table (without the index) so the following section can start from this file.
data.to_csv(DATA_DIR / 'merged_activities.csv', index=False)

# Get PLINDER split

In [None]:
# Reload the merged-activities checkpoint written at the end of the merge section.
data = pd.read_csv(DATA_DIR / 'merged_activities.csv', low_memory=False)

In [4]:
# Remote (gs://) locations of the PLINDER 2024-06 / v2 parquet files used below:
# the per-system annotation table and the train/val/test split assignment.
PATH_TO_PLINDER_ANNOTATION = "gs://plinder/2024-06/v2/index/annotation_table.parquet"
PATH_TO_PLINDER_SPLIT = "gs://plinder/2024-06/v2/splits/split.parquet"

In [5]:
# Load the PLINDER annotation table directly from its remote parquet file and preview it.
# NOTE(review): reading a gs:// URL with pandas presumably needs a GCS filesystem backend (e.g. gcsfs) — confirm.
annotation_data = pd.read_parquet(PATH_TO_PLINDER_ANNOTATION)
annotation_data

Unnamed: 0,entry_pdb_id,entry_release_date,entry_oligomeric_state,entry_determination_method,entry_keywords,entry_pH,entry_resolution,entry_validation_resolution,entry_validation_rfree,entry_validation_r,...,system_ligand_has_fragment,system_ligand_has_oligo,system_ligand_has_artifact,system_ligand_has_other,system_ligand_has_covalent,system_ligand_has_invalid,system_ligand_has_ion,system_protein_chains_total_length,system_unique_ccd_codes,system_proper_unique_ccd_codes
0,3grt,1997-02-12,dimeric,X-RAY DIFFRACTION,OXIDOREDUCTASE,8.0,2.50,2.50,,0.1700,...,False,False,False,False,False,False,False,922,FAD,FAD
1,3grt,1997-02-12,dimeric,X-RAY DIFFRACTION,OXIDOREDUCTASE,8.0,2.50,2.50,,0.1700,...,False,True,False,False,False,False,False,922,TS2,TS2
2,3grt,1997-02-12,dimeric,X-RAY DIFFRACTION,OXIDOREDUCTASE,8.0,2.50,2.50,,0.1700,...,False,False,False,False,False,False,False,922,FAD,FAD
3,3grt,1997-02-12,dimeric,X-RAY DIFFRACTION,OXIDOREDUCTASE,8.0,2.50,2.50,,0.1700,...,False,True,False,False,False,False,False,922,TS2,TS2
4,1grx,1993-10-01,monomeric,SOLUTION NMR,ELECTRON TRANSPORT,,,,,,...,False,True,False,False,True,False,False,85,GSH,GSH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1357901,4lpn,2013-07-16,24-meric,X-RAY DIFFRACTION,OXIDOREDUCTASE,9,1.66,1.66,0.1768,0.1586,...,False,False,False,True,False,False,True,528,MG,
1357902,2lp3,2012-01-31,dimeric,SOLUTION NMR,METAL BINDING PROTEIN,,,,,,...,False,False,False,True,False,False,True,93,CA,
1357903,2lp3,2012-01-31,dimeric,SOLUTION NMR,METAL BINDING PROTEIN,,,,,,...,False,False,False,True,False,False,True,93,CA,
1357904,2lp3,2012-01-31,dimeric,SOLUTION NMR,METAL BINDING PROTEIN,,,,,,...,False,False,False,True,False,False,True,93,CA,


In [6]:
# Load the PLINDER split table (one row per system with its assigned split) and preview it.
# NOTE(review): reading a gs:// URL with pandas presumably needs a GCS filesystem backend (e.g. gcsfs) — confirm.
split_data = pd.read_parquet(PATH_TO_PLINDER_SPLIT)
split_data

Unnamed: 0,system_id,uniqueness,split,cluster,cluster_for_val_split,system_pass_validation_criteria,system_pass_statistics_criteria,system_proper_num_ligand_chains,system_proper_pocket_num_residues,system_proper_num_interactions,system_proper_ligand_max_molecular_weight,system_has_binding_affinity,system_has_apo_or_pred
0,101m__1__1.A__1.C_1.D,101m__A__C_D_c188899,train,c14,c0,True,True,1,27,20,616.177293,False,False
1,102m__1__1.A__1.C,102m__A__C_c237197,train,c14,c0,True,True,1,26,20,616.177293,False,True
2,103m__1__1.A__1.C_1.D,103m__A__C_D_c252759,train,c14,c0,False,True,1,26,16,616.177293,False,False
3,104m__1__1.A__1.C_1.D,104m__A__C_D_c274687,train,c14,c0,False,True,1,27,21,616.177293,False,False
4,105m__1__1.A__1.C_1.D,105m__A__C_D_c221688,train,c14,c0,False,True,1,28,20,616.177293,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
409721,9xia__1__2.A_4.A__4.B_4.D,9xia__A_A__B_D_c20731,train,c256,c126,False,False,1,23,6,178.084124,False,False
409722,9xim__1__1.A_1.B__1.E_1.F_1.G,9xim__A_B__E_F_G_c240203,train,c256,c126,False,False,1,21,6,150.052823,False,False
409723,9xim__1__1.A_1.B__1.H_1.I_1.J,9xim__A_B__H_I_J_c313183,train,c256,c126,False,False,1,19,5,150.052823,False,False
409724,9xim__1__1.C_1.D__1.K_1.L_1.M,9xim__C_D__K_L_M_c215891,train,c256,c126,False,False,1,20,3,150.052823,False,False


We split activity data into train/val/test based on proteins that are assigned to systems that are split by PLINDER:

In [7]:
# Look up the UniProt ID of every PLINDER system from the annotation table.
system2uniprot_id = annotation_data.set_index('system_id')['system_pocket_UniProt'].to_dict()
# Since some UniProt IDs appear in several splits, each ID is assigned to exactly one of them
# by priority: validation first, then training, then test. This gives a larger validation set
# and keeps such shared proteins out of the test set.
split_to_idx = {'val': 0, 'train': 1, 'test': 2}
split_data = (
    split_data
    .assign(
        uniprot_id=split_data['system_id'].map(system2uniprot_id),
        split_idx=split_data['split'].map(split_to_idx),
    )
    # After sorting by priority, the first row seen for a UniProt ID carries its winning split.
    .sort_values('split_idx')
    .drop_duplicates('uniprot_id', keep='first')
)
# Transfer the per-protein split onto the activity table.
uniprot_id2split = split_data.set_index('uniprot_id')['split'].to_dict()
data['split'] = data['uniprot_id'].map(uniprot_id2split)

In [8]:
# Number of activity rows per assigned split (rows without a split are missing values and are not counted).
data['split'].value_counts()

split
train      1887882
removed     189500
test        125352
val          56741
Name: count, dtype: int64

In [9]:
# Drop datapoints that either belong to the 'removed' split or have no split at all,
# i.e. whose protein is not in the intersection of the merged dataset and the PLINDER dataset.
to_remove = data['split'].isna() | data['split'].eq('removed')
data = data.loc[~to_remove].reset_index(drop=True)
data

Unnamed: 0,smiles,ki,ic50,kd,ec50,sequence,ki_sign,ic50_sign,kd_sign,ec50_sign,uniprot_id,source,split
0,O=C(O)CCCCCN1C(=O)N(CCCCCC(=O)O)[C@H](Cc2ccccc...,0.24,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb,train
1,O=C1N(C/C=C/c2cn[nH]c2)[C@H](Cc2ccccc2)[C@H](O...,0.25,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb,train
2,O=C1N(C/C=C/c2cn[nH]c2)[C@H](Cc2ccccc2)[C@H](O...,0.41,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb,train
3,O=C1N(CCCCCCO)[C@H](Cc2ccccc2)[C@H](O)[C@@H](O...,0.80,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb,train
4,O=C1N(CCCCCO)[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)...,0.99,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2069970,Clc1ccc2c(C(NCC3CC3)c3cccc4ccccc34)c[nH]c2c1,,,18100.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl,train
2069971,O=C(O)[C@H]1c2ccccc2C(=O)N(Cc2ccc(Cl)cc2)[C@@H...,,18300.0,,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,=,,,Q00987,chembl,train
2069972,COCCNC(=O)[C@H]1c2ccccc2C(=O)N(Cc2ccc(Cl)cc2)[...,,,22700.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl,train
2069973,COC(=O)C(c1ccc(Cl)cc1)N1C(=O)c2cc(I)ccc2N(CCCC...,,,32100.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl,train


In [10]:
# Checkpoint the split-annotated table (without the index) for the PAINS filtering section.
data.to_csv(DATA_DIR / 'merged_activities_split.csv', index=False)

# PAINS filtering

In [3]:
# Reload the split-annotated checkpoint written at the end of the PLINDER split section.
data = pd.read_csv(DATA_DIR / 'merged_activities_split.csv', low_memory=False)

In [4]:
from pains_filter import filter_pains

In [None]:
# Run the project's PAINS filter on the 'smiles' column and keep its result.
# NOTE(review): the helper's exact contract is not visible here; presumably it returns the input
# frame without the rows whose molecule is flagged as PAINS — confirm in pains_filter.py.
data = filter_pains(data, 'smiles')

[32m2024-11-14 20:37:57.412[0m | [1mINFO    [0m | [36mpains_filter[0m:[36mfilter_pains[0m:[36m27[0m - [1mMolecules before the PAINS elimination: 2069975[0m
[32m2024-11-14 20:37:57.413[0m | [1mINFO    [0m | [36mpains_filter[0m:[36mfilter_pains[0m:[36m28[0m - [1mConverting molecules into datamol format...[0m
100%|██████████| 2069975/2069975 [04:36<00:00, 7476.40it/s]
[32m2024-11-14 20:42:34.524[0m | [1mINFO    [0m | [36mpains_filter[0m:[36mfilter_pains[0m:[36m31[0m - [1mDone.[0m
[32m2024-11-14 20:42:34.525[0m | [1mINFO    [0m | [36mpains_filter[0m:[36mfilter_pains[0m:[36m32[0m - [1mFiltering PAINS...[0m
  from .autonotebook import tqdm as notebook_tqdm
Processing batches: 100%|██████████| 2070/2070 [21:35<00:00,  1.60it/s]
[32m2024-11-14 21:04:16.293[0m | [1mINFO    [0m | [36mpains_filter[0m:[36mfilter_pains[0m:[36m54[0m - [1mMolecules after the PAINS elimination: 1992096[0m


In [8]:
# Checkpoint the PAINS-filtered table (without the index) for the binarization section.
data.to_csv(DATA_DIR / 'merged_activities_split_nopains.csv', index=False)

# Binarization and deduplication

In [9]:
# Reload the PAINS-filtered checkpoint and preview it.
data = pd.read_csv(DATA_DIR / 'merged_activities_split_nopains.csv', low_memory=False)
data

Unnamed: 0,smiles,ki,ic50,kd,ec50,sequence,ki_sign,ic50_sign,kd_sign,ec50_sign,uniprot_id,source,split
0,O=C(O)CCCCCN1C(=O)N(CCCCCC(=O)O)[C@H](Cc2ccccc...,0.24,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb,train
1,O=C1N(C/C=C/c2cn[nH]c2)[C@H](Cc2ccccc2)[C@H](O...,0.25,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb,train
2,O=C1N(C/C=C/c2cn[nH]c2)[C@H](Cc2ccccc2)[C@H](O...,0.41,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb,train
3,O=C1N(CCCCCCO)[C@H](Cc2ccccc2)[C@H](O)[C@@H](O...,0.80,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb,train
4,O=C1N(CCCCCO)[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)...,0.99,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1992091,Clc1ccc2c(C(NCC3CC3)c3cccc4ccccc34)c[nH]c2c1,,,18100.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl,train
1992092,O=C(O)[C@H]1c2ccccc2C(=O)N(Cc2ccc(Cl)cc2)[C@@H...,,18300.0,,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,=,,,Q00987,chembl,train
1992093,COCCNC(=O)[C@H]1c2ccccc2C(=O)N(Cc2ccc(Cl)cc2)[...,,,22700.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl,train
1992094,COC(=O)C(c1ccc(Cl)cc1)N1C(=O)c2cc(I)ccc2N(CCCC...,,,32100.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl,train


For the validation and test sets we keep only Ki and Kd values, as these are the most accurate, whereas IC50 has to be ambiguously scaled (see [paper](https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0061007&type=printable)).

In [10]:
# Partition the table into one frame per split.
train = data[data['split'] == 'train'].reset_index(drop=True)
val = data[data['split'] == 'val'].reset_index(drop=True)
test = data[data['split'] == 'test'].reset_index(drop=True)

# Validation and test rows are kept only if they carry a Ki or a Kd measurement.
# `.copy()` makes the filtered frames independent objects, so that adding columns to them later
# does not assign into a slice of another frame (which triggers SettingWithCopyWarning).
val = val[val['ki'].notna() | val['kd'].notna()].copy()
test = test[test['ki'].notna() | test['kd'].notna()].copy()
print(f"Val set size after taking Ki and Kd values only: {len(val)}")
print(f"Test set size after taking Ki and Kd values only: {len(test)}")

Val set size after taking Ki and Kd values only: 12131
Test set size after taking Ki and Kd values only: 17180


In [11]:
def binarize_row(row, threshold=1000):
    """Turn the Ki/Kd measurement of one row into a binary activity label.

    The columns are inspected in the order Ki, then Kd, and the first one holding a
    non-null value decides the result; Kd is only consulted when Ki is missing.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'ki', 'ki_sign', 'kd' and 'kd_sign'.
        threshold: Activity cutoff, in the same unit as the stored values.

    Returns:
        1 (active) or 0 (inactive) when the measurement settles the label, otherwise NaN:
          * sign '=': active iff value < threshold (strict comparison);
          * sign '>': inactive iff value >= threshold, else undetermined (NaN);
          * sign '<': active iff value <= threshold, else undetermined (NaN);
          * any other sign, or no Ki/Kd value at all: NaN.
    """
    column_pairs = (('ki', 'ki_sign'), ('kd', 'kd_sign'))
    for value_col, sign_col in column_pairs:
        value, sign = row[value_col], row[sign_col]
        if pd.isnull(value):
            continue  # nothing measured in this column, try the next one

        if sign == '=':
            # An exact value is compared strictly against the threshold.
            return int(value < threshold)
        if sign == '>':
            # A lower bound proves inactivity only if the bound itself reaches the threshold.
            return 0 if value >= threshold else np.nan
        if sign == '<':
            # An upper bound proves activity only if the bound itself stays within the threshold.
            return 1 if value <= threshold else np.nan
        return np.nan  # unrecognised sign

    return np.nan  # No activity value found

In [12]:
# Label every validation/test row as active (1), inactive (0) or undetermined (NaN);
# `threshold` is forwarded by `apply` to `binarize_row` as a keyword argument.
val['is_active'] = val.apply(binarize_row, axis=1, threshold=1000)
test['is_active'] = test.apply(binarize_row, axis=1, threshold=1000)

The binary activity label must be consistent for a given pair of (SMILES, sequence). Here is where inconsistency deduplication comes into play:

In [None]:
def is_consistent(group):
    """Tell whether all non-missing `is_active` labels of a group agree.

    Args:
        group: DataFrame with an 'is_active' column.

    Returns:
        True when the group holds exactly one distinct non-missing label; False when
        the labels conflict or when every label is missing.
    """
    return group['is_active'].nunique(dropna=True) == 1

# A datapoint is identified by its (ligand SMILES, protein sequence) pair.
pair_key = ['smiles', 'sequence']

# Keep only the pairs whose rows agree on a single activity label
val_consistent_groups = val.groupby(pair_key).filter(is_consistent)
test_consistent_groups = test.groupby(pair_key).filter(is_consistent)

# Collapse every remaining pair to a single row
val = val_consistent_groups.groupby(pair_key, as_index=False).first()
test = test_consistent_groups.groupby(pair_key, as_index=False).first()

print(f"Val set size after inconsistency deduplication: {len(val)}")
print(f"Test set size after inconsistency deduplication: {len(test)}")

Val set size after inconsistency deduplication: 10338
Test set size after inconsistency deduplication: 14364


In [16]:
# Class balance of the validation set after deduplication.
val['is_active'].value_counts()

is_active
1.0    5792
0.0    4546
Name: count, dtype: int64

In [17]:
# Class balance of the test set after deduplication.
test['is_active'].value_counts()

is_active
1.0    8485
0.0    5879
Name: count, dtype: int64

In [18]:
# The raw measurements and their signs are replaced by the binary label in val/test,
# so those columns are dropped from both frames.
measurement_columns = ['ki', 'kd', 'ki_sign', 'kd_sign', 'ic50', 'ec50', 'ic50_sign', 'ec50_sign']
val = val.drop(columns=measurement_columns)
test = test.drop(columns=measurement_columns)

In [19]:
# Preview the final validation set.
val

Unnamed: 0,smiles,sequence,uniprot_id,source,split,is_active
0,Br[Se]c1ccccc1,MACTIQKAEALDGAHLMQILWYDEEESLYPAVWLRDNCPCSDCYLD...,O75936,bdb,val,0.0
1,Brc1ccc2c(c1)c(OCc1ccc3ccccc3c1)nn2CCN1CCCCC1,MHSKVTIICIRFLFWFLLLCMLIGKSHTEDDIIIATKNGKVRGMNL...,P06276,chembl,val,1.0
2,C#CCN(Cc1ccc(OC)cc1)c1cc(OC[C@H]2C[C@@H]2c2ccc...,MEDGPSNNASCFRRLTECFLSPSLTDEKVKAYLSLHPQVLDEFVSE...,Q9Y233,bdb,val,1.0
3,C#CCN(Cc1ccc2cccc(O)c2n1)C(C#N)CCC1CCN(Cc2cccc...,MHSKVTIICIRFLFWFLLLCMLIGKSHTEDDIIIATKNGKVRGMNL...,P06276,bdb,val,1.0
4,C#CCNC(=N)NCCC[C@H](N)C(=O)O,MAGLGHPAAFGRATHAVVRALPESLGQHALRSAKGEEVDVARAERQ...,O94760,bdb,val,0.0
...,...,...,...,...,...,...
10333,c1ccc2nc(N3CC(Oc4nccnc4-c4ccncc4)C3)ccc2c1,MASLEEPLAPRPQGPLPAAGDEPGCGPGKLRPEPRLSAAGGGSAAG...,Q9Y233,chembl,val,0.0
10334,c1ccc2oc(CNCCCCCCCNc3c4c(nc5ccccc35)CCCC4)cc2c1,MNLLVTSSLGVLLHLVVLCQADDHSELLVNTKSGKVMGTRVPVLSS...,P04058,bdb,val,1.0
10335,c1cnc(N2CCOCC2)c(-c2ccc(OCc3ccc4ccccc4n3)cc2)c1,MRIEERKSQHLTGLTDEKVKAYLSLHPQVLDEFVSESVSAETVEKW...,Q9Y233,bdb,val,1.0
10336,c1cnc2c(c1)CN(c1ncccc1-c1ccc(OCc3ccc4ccccc4n3)...,MRIEERKSQHLTGLTDEKVKAYLSLHPQVLDEFVSESVSAETVEKW...,Q9Y233,bdb,val,1.0


In [20]:
# Preview the final test set.
test

Unnamed: 0,smiles,sequence,uniprot_id,source,split,is_active
0,B[P@@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[...,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,Q86WV6,bdb,test,1.0
1,B[P@@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[...,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,Q86WV6,bdb,test,1.0
2,B[P@]1(=O)OCC2OC(n3cnc4c(=O)[nH]c(N)nc43)C(O[P...,MPHSSLHPSIPCPRGHGAQKAALVLLSACLVTLWGLGEPPEHTLRY...,Q86WV6,bdb,test,1.0
3,Brc1cccc(COc2ccc(C3C[C@@H]3NCCC3CCCCC3)cc2)c1,MSNKCDVVVVGGGISGMAAAKLLHDSGLNVVVLEARDRVGGRTYTL...,P27338,bdb,test,1.0
4,Brc1cccc(COc2ccc3c(c2)OCCO3)c1,MSNKCDVVVVGGGISGMAAAKLLHDSGLNVVVLEARDRVGGRTYTL...,P27338,bdb,test,1.0
...,...,...,...,...,...,...
14359,c1nc(NCC2CCOC2)c2cc(-c3ccc4c(c3)OCO4)ccc2n1,MASSSVPPATVSAATAGPGPGFGFASKTKKKHFVQQKVKVFRAADP...,Q8TBX8,bdb,test,1.0
14360,c1nc2cc(NCc3ccc4c(c3)OCCO4)cnc2[nH]1,MAGGRHRRVVGTLHLLLLVAALPWASRGVSPSASAWPEEKNYHQPA...,Q16769,bdb,test,1.0
14361,c1nc2cc(NCc3ccsc3)cnc2[nH]1,MAGGRHRRVVGTLHLLLLVAALPWASRGVSPSASAWPEEKNYHQPA...,Q16769,bdb,test,0.0
14362,c1nc2ccc(-c3nnc(CCc4ccc5c(c4)OCO5)o3)cc2[nH]1,MAGGRHRRVVGTLHLLLLVAALPWASRGVSPSASAWPEEKNYHQPA...,Q16769,bdb,test,1.0


We leave the train set as is, so that people can work with all activity values (Ki, Kd, IC50 and EC50) along with all inequality signs and inconsistencies, thus providing a larger training set if this helps their models:

In [None]:
# Preview the training set, which keeps all raw activity columns.
train

Unnamed: 0,smiles,ki,ic50,kd,ec50,sequence,ki_sign,ic50_sign,kd_sign,ec50_sign,uniprot_id,source,split
0,O=C(O)CCCCCN1C(=O)N(CCCCCC(=O)O)[C@H](Cc2ccccc...,0.24,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb,train
1,O=C1N(C/C=C/c2cn[nH]c2)[C@H](Cc2ccccc2)[C@H](O...,0.25,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb,train
2,O=C1N(C/C=C/c2cn[nH]c2)[C@H](Cc2ccccc2)[C@H](O...,0.41,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb,train
3,O=C1N(CCCCCCO)[C@H](Cc2ccccc2)[C@H](O)[C@@H](O...,0.80,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb,train
4,O=C1N(CCCCCO)[C@H](Cc2ccccc2)[C@H](O)[C@@H](O)...,0.99,,,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,=,,,,P03367,bdb,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1818039,Clc1ccc2c(C(NCC3CC3)c3cccc4ccccc34)c[nH]c2c1,,,18100.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl,train
1818040,O=C(O)[C@H]1c2ccccc2C(=O)N(Cc2ccc(Cl)cc2)[C@@H...,,18300.0,,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,=,,,Q00987,chembl,train
1818041,COCCNC(=O)[C@H]1c2ccccc2C(=O)N(Cc2ccc(Cl)cc2)[...,,,22700.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl,train
1818042,COC(=O)C(c1ccc(Cl)cc1)N1C(=O)c2cc(I)ccc2N(CCCC...,,,32100.0,,MCNTNMSVPTDGAVTTSQIPASEQETLVRPKPLLLKLLKSVGAQKD...,,,=,,Q00987,chembl,train


In [21]:
# Output directory for the final splits; it is created (with any missing parents) if it does not exist yet.
PLUMBER_DIR = DATA_DIR / 'plumber'
PLUMBER_DIR.mkdir(exist_ok=True, parents=True)

In [23]:
# Write the three final splits (without the index) into PLUMBER_DIR, in the order val, test, train.
for filename, frame in {'val.csv': val, 'test.csv': test, 'train.csv': train}.items():
    frame.to_csv(PLUMBER_DIR / filename, index=False)