In [1]:
import os
import pandas as pd
import getpass as gp
import time
import QueryBuilding

import sys
from io import StringIO

import rdkit
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem, DataStructs, Descriptors, Descriptors3D, PandasTools, Draw
from rdkit.Chem.rdchem import Atom
from rdkit.Chem.MolStandardize import rdMolStandardize

import molvs
from molvs.fragment import LargestFragmentChooser

In [2]:
# Atomic numbers of the elements a compound is allowed to contain.
# Caesium (55) was removed from the list; silicon (14) is kept.
ATOMIC_NO = [
    1, 5, 6, 7, 8, 9, 14, 15, 16, 17, 35, 53,  # H, B, C, N, O, F, Si, P, S, Cl, Br, I
    3, 11, 19,                                 # Li, Na, K
    12, 20,                                    # Mg, Ca
    30, 29, 50,                                # Zn, Cu, Sn
]
ALLOWED_ATOMS = list(map(Atom, ATOMIC_NO))
ALLOWED_ATOMS_VALIDATOR = rdMolStandardize.AllowedAtomsValidation(ALLOWED_ATOMS)

def check_allowed_atoms(mol: Chem.Mol) -> bool:
    """
    Check whether the allowed-atoms validator accepts a mol.

    :param mol: mol under test
    :return: True if ALLOWED_ATOMS_VALIDATOR reports nothing for mol, False otherwise
    """
    findings = ALLOWED_ATOMS_VALIDATOR.validate(mol)
    if len(findings) > 0:
        return False
    return True

def check_only_one_fragment(mol: Chem.rdchem.Mol) -> bool:
    """
    Check whether a mol consists of exactly one fragment.

    :param mol: mol under test
    :return: True if mol has exactly one fragment, False otherwise
    """
    # Only the number of fragments matters, so the default atom-index tuples
    # are enough; asMols=True would build a new Mol per fragment just to be
    # counted and discarded.
    return len(Chem.GetMolFrags(mol)) == 1

def check_mol(mol) -> bool:
    """
    Run all acceptance checks on a mol.

    :param mol: mol under test (may be None)
    :return: True only if mol is not None, consists of a single fragment and
             passes the allowed-atoms check; False otherwise
    """
    # The checks short-circuit in the same order as a chain of guard clauses:
    # None test first, then fragment count, then allowed atoms.
    return (
        mol is not None
        and check_only_one_fragment(mol)
        and check_allowed_atoms(mol)
    )

In [3]:
def SmilesToMol(smiles):
    """
    Parse a SMILES string into a mol.

    :param smiles: SMILES string to parse
    :return: the object returned by Chem.MolFromSmiles, or None if the call
             raised an exception (e.g. a non-string input such as NaN)
    """
    try:
        return Chem.MolFromSmiles(smiles)
    except Exception:
        # Best-effort parsing: bad rows become None instead of aborting an
        # .apply() over the whole table. Only Exception is caught so that
        # KeyboardInterrupt / SystemExit still stop a long-running cell.
        return None

In [4]:
def MolBlockToOriginalSMILES(molblock):
    """
    Convert a mol block into a canonical SMILES string.

    :param molblock: object with a .read() method returning the mol block text
    :return: canonical SMILES, or '' if reading, parsing or SMILES generation
             fails
    """
    try:
        mol = Chem.MolFromMolBlock(molblock.read())
        if mol is None:
            # Parsing failed without raising; report it the same way as an error.
            return ''
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Only Exception is caught so that KeyboardInterrupt / SystemExit
        # still propagate.
        return ''

In [5]:
def standardize_mol_rdkit(smiles):
    """
    Standardize a SMILES string with RDKit's MolStandardize tools.

    The parsed mol is sanitized (all operations except CLEANUP and
    PROPERTIES), then passed through Normalize, Uncharger.uncharge, Reionize
    and FragmentParent, and written back as canonical SMILES.

    :param smiles: input SMILES string
    :return: standardized canonical SMILES; the string 'remove' if the input
             could not be parsed into a mol; None if any step raised an
             exception
    """
    # Source: https://github.com/rdkit/rdkit/blob/master/Code/GraphMol/MolStandardize/TransformCatalog/normalizations.in

    RDLogger.DisableLog('rdApp.info')

    # An explicit `except Exception` replaces a `return` inside `finally`:
    # the latter silently discards *every* in-flight exception, including
    # KeyboardInterrupt, which makes a long .apply() run impossible to stop.
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return 'remove'

        Chem.SanitizeMol(mol, sanitizeOps=(Chem.SANITIZE_ALL ^ Chem.SANITIZE_CLEANUP ^ Chem.SANITIZE_PROPERTIES))
        normalized = rdMolStandardize.Normalize(mol)
        uncharged = rdMolStandardize.Uncharger().uncharge(normalized)
        reionized = rdMolStandardize.Reionize(uncharged)
        parent = rdMolStandardize.FragmentParent(reionized)

        return Chem.MolToSmiles(parent, canonical=True)
    except Exception:
        # Best-effort: a structure that fails any step yields None.
        return None

In [6]:
# Load the tab-separated result table; low_memory=False makes pandas read the
# file in one pass instead of inferring dtypes chunk by chunk.
dfSubs = pd.read_csv(
    'Supp file S2 Result Table 20250904.txt',
    sep='\t',
    low_memory=False,
)


In [7]:
# Preview the first rows of the loaded table.
dfSubs.head()

Unnamed: 0,flat_smiles,final class,Activity in primary HTS,Activity_0_24,Activity_1_2,Activity_6,Activity_30,MurckoCore,pIC50,Frequent Hitter in HTS,Thiol reactivity,Similarity to most similar counterpart in public domain,Source of most similar counterpart,SMILES of most similar counterpart,ID
0,CC1=C(C#N)C(c2ccccc2C(F)(F)F)C=C(C(F)(F)F)N1,inhibitor,1.5,7.0,3.7,-8.0,-9.1,C1=CC=CC=C1C1C=CNC=C1,>6.6,,no SH-reactivity dtcd,0.24,Pubchem as curated by Lunghini,Oc1ccccc1c2ccc(C#N)c(c2)C(F)(F)F,Mol_1
1,CCN1CCCCN1c1ccc(C#N)c(C(F)(F)F)c1,inhibitor,0.6,13.1,2.5,-1.5,-6.0,C1CCCN(N1)C1C=CC=CC=1,>6.6,,no SH-reactivity dtcd,0.39,Pubchem as curated by Lunghini,OC1CC2CCC(C1)N2c3ccc(C#N)c(c3)C(F)(F)F,Mol_2
2,S=c1scc(C2(Cl)CC2)n1Nc1ccc(Cl)cc1,inhibitor,,27.4,1.9,-6.7,3.7,S=C1SC=C(C2CC2)N1NC1C=CC=CC=1,>6.6,,,0.19,Compara,Clc1ccc(NC(=N)NC(=N)NCCCCCCNC(=N)NC(=N)Nc2ccc(...,Mol_3
3,CC(C)N1CCSC1=Nc1ccc(C#N)c(C(F)(F)F)c1,inhibitor,2.3,16.7,8.2,0.0,-6.6,C1CS/C(/N1)=N/C1C=CC=CC=1,>6.6,,no SH-reactivity dtcd,0.34,Pubchem as curated by Lunghini,CC(C)Sc1ccc(C#N)c(c1)C(F)(F)F,Mol_4
4,O=[N+]([O-])c1ccc(Oc2ccc(O)cc2Cl)cc1,inhibitor,-0.3,33.7,6.4,-4.0,-3.7,C1=CC=CC=C1OC1C=CC=CC=1,>6.6,,no SH-reactivity dtcd,0.65,Compara,[O-][N+](=O)c1ccc(Oc2ccc(Cl)cc2Cl)cc1,Mol_5


In [10]:
# Standardize every input structure and report how long the RDKit pass took.
# NOTE(review): the saved preview of dfSubs.head() in this notebook lists no
# 'Structure' column, and the execution counts skip cells -- confirm that the
# loaded file really provides 'Structure' on a fresh Restart & Run All.
start_time = time.time()
dfSubs['SMILES_STD_RDKIT'] = dfSubs['Structure'].apply(standardize_mol_rdkit)
elapsed_s = time.time() - start_time
print("--- Chemical standardization with RDkit: %.2f s ---" % elapsed_s)

# Re-parse the standardized SMILES, then flag which rows are usable:
# parsable, a single fragment, and built only from the accepted atoms.
dfSubs['MOL'] = dfSubs['SMILES_STD_RDKIT'].apply(SmilesToMol)
dfSubs['check'] = dfSubs['MOL'].apply(check_mol)

dfSubs.to_csv(
    'Supp file S2 Result Table 20250904_standardized_SMILES_RDkit.txt',
    sep='\t',
    header=True,
    index=False,
)

--- Chemical standardization with RDkit: 135.07 s ---


In [11]:
# discard empty SMILES
# (rows whose standardized SMILES is the string 'remove' are not caught here;
# they fail to re-parse and are dropped by the "check" filter below)
n = dfSubs.shape[0]
data = dfSubs.dropna(subset=['SMILES_STD_RDKIT'], axis=0)
data = data[(data['SMILES_STD_RDKIT'] != '')]
n2 = data.shape[0]
print("Number of discarded compounds without SMILES: "+str(n-n2))
# discard molecules that don't pass check_mol
# .copy() makes `data` an independent frame, so that adding columns to it in
# later cells does not trigger pandas' SettingWithCopyWarning.
data = data[(data["check"] == True)].copy()
n3 = data.shape[0]
print("Number of compounds that didn't pass mol check: "+str(n2-n3))
print('Number of remaining compounds: '+str(n3))


Number of discarded compounds without SMILES: 0
Number of compounds that didn't pass mol check: 0
Number of remaining compounds: 72686


In [12]:
def RemoveStereochemistry(mol):
    """
    Strip stereochemistry from a mol and return its canonical SMILES.

    Note: Chem.RemoveStereochemistry is applied to the passed mol itself, so
    the caller's object is modified as well.

    :param mol: mol to flatten (None is tolerated)
    :return: canonical SMILES without stereo information, or None if any step
             raised an exception
    """
    try:
        Chem.RemoveStereochemistry(mol)
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Only Exception is caught so that KeyboardInterrupt / SystemExit
        # still propagate out of a long .apply() run.
        return None

In [16]:
# Only 2D descriptors are used downstream, so stereo information is dropped
# and a flat canonical SMILES is stored per compound.
smiles_flat_series = data['MOL'].apply(RemoveStereochemistry)
data['SMILES_STD_FLAT'] = smiles_flat_series
print(len(data))
data.to_csv(
    'Supp file S2 Result Table 20250904_standardized_SMILES_RDkit_no_stereo.txt',
    sep='\t',
    header=True,
    index=False,
)


72686


In [17]:
# Preview the filtered table including the new SMILES_STD_FLAT column.
data.head()

Unnamed: 0,Compound No,Structure,final class,Activity in HTS,mean_0_24,mean_1_2,mean_6,mean_30,AUC,weighted_mean,...,publishable,4-conc,origin,manually_remove,std_smiles,flat_smiles,SMILES_STD_RDKIT,MOL,check,SMILES_STD_FLAT
0,BCS-AB25803,O=C(CSC1NN=CN=1)C1C=CC(F)=CC=1F,inactive,8558558559,1168,1196,1194,1082,3418272,1104153846,...,YES,1,add,no,O=C(CSc1ncn[nH]1)c1ccc(F)cc1F,O=C(CSc1ncn[nH]1)c1ccc(F)cc1F,O=C(CSc1ncn[nH]1)c1ccc(F)cc1F,<rdkit.Chem.rdchem.Mol object at 0x000001AADD5...,True,O=C(CSc1ncn[nH]1)c1ccc(F)cc1F
1,BCS-AB45447,N#CC1=NC2C=C(C=C(C=2S1)[N+]([O-])=O)C(F)(F)F,inactive,900990099,1037,1136,963,976,2934864,9794358974,...,YES,1,add,no,N#Cc1nc2cc(C(F)(F)F)cc([N+](=O)[O-])c2s1,N#Cc1nc2cc(C(F)(F)F)cc([N+](=O)[O-])c2s1,N#Cc1nc2cc(C(F)(F)F)cc([N+](=O)[O-])c2s1,<rdkit.Chem.rdchem.Mol object at 0x000001AADC9...,True,N#Cc1nc2cc(C(F)(F)F)cc([N+](=O)[O-])c2s1
2,BCS-AC17367,CC(=O)NSC1SC(Cl)=C(Cl)C=1Cl,inactive,8738738739,1024,1104,947,816,2709984,8475576923,...,YES,1,add,no,CC(=O)NSc1sc(Cl)c(Cl)c1Cl,CC(=O)NSc1sc(Cl)c(Cl)c1Cl,CC(=O)NSc1sc(Cl)c(Cl)c1Cl,<rdkit.Chem.rdchem.Mol object at 0x000001AADC9...,True,CC(=O)NSc1sc(Cl)c(Cl)c1Cl
3,BCS-AC17565,N#C/C(/Cl)=C(\Cl)/S/C(/Cl)=C(\Cl)/C#N,inactive,8258928571,112,1086,1039,997,3059088,1007371795,...,YES,1,add,no,N#C/C(Cl)=C(\Cl)S/C(Cl)=C(\Cl)C#N,N#CC(Cl)=C(Cl)SC(Cl)=C(Cl)C#N,N#C/C(Cl)=C(\Cl)S/C(Cl)=C(\Cl)C#N,<rdkit.Chem.rdchem.Mol object at 0x000001AADD6...,True,N#CC(Cl)=C(Cl)SC(Cl)=C(Cl)C#N
4,BCS-AC41114,O=C1C(=CNN1C1=CC=CC=N1)C1=CC=CC=C1F,inactive,1013392857,1002,1004,1099,1054,3184608,1059275641,...,YES,1,add,no,O=c1c(-c2ccccc2F)c[nH]n1-c1ccccn1,O=c1c(-c2ccccc2F)c[nH]n1-c1ccccn1,O=c1c(-c2ccccc2F)c[nH]n1-c1ccccn1,<rdkit.Chem.rdchem.Mol object at 0x000001AADDD...,True,O=c1c(-c2ccccc2F)c[nH]n1-c1ccccn1


In [18]:
# Flag AndroR measurements that share the same flat SMILES with another row.
# rename() returns a new frame, so `data` itself keeps its "final class" column.
data_comb = data.rename(columns={"final class": "class"})
is_duplicated = data_comb['SMILES_STD_FLAT'].duplicated(keep=False)
data_comb['Duplicated'] = is_duplicated
print(data_comb.shape)

# Split into rows whose SMILES occurs more than once and rows that are unique.
data_comb_dupl = data_comb[data_comb['Duplicated']]
data_comb_no_dupl = data_comb[~data_comb['Duplicated']]

print(data_comb_dupl.shape)
print(data_comb_no_dupl.shape)
# Sanity check: the two parts together must account for every row.
rows_in_parts = data_comb_dupl.shape[0] + data_comb_no_dupl.shape[0]
print(data_comb.shape[0] == rows_in_parts)

(72686, 22)
(0, 22)
(72686, 22)
True


In [19]:
# Preview the table with the renamed "class" column and the Duplicated flag.
data_comb.head()

Unnamed: 0,Compound No,Structure,class,Activity in HTS,mean_0_24,mean_1_2,mean_6,mean_30,AUC,weighted_mean,...,4-conc,origin,manually_remove,std_smiles,flat_smiles,SMILES_STD_RDKIT,MOL,check,SMILES_STD_FLAT,Duplicated
0,BCS-AB25803,O=C(CSC1NN=CN=1)C1C=CC(F)=CC=1F,inactive,8558558559,1168,1196,1194,1082,3418272,1104153846,...,1,add,no,O=C(CSc1ncn[nH]1)c1ccc(F)cc1F,O=C(CSc1ncn[nH]1)c1ccc(F)cc1F,O=C(CSc1ncn[nH]1)c1ccc(F)cc1F,<rdkit.Chem.rdchem.Mol object at 0x000001AADD5...,True,O=C(CSc1ncn[nH]1)c1ccc(F)cc1F,False
1,BCS-AB45447,N#CC1=NC2C=C(C=C(C=2S1)[N+]([O-])=O)C(F)(F)F,inactive,900990099,1037,1136,963,976,2934864,9794358974,...,1,add,no,N#Cc1nc2cc(C(F)(F)F)cc([N+](=O)[O-])c2s1,N#Cc1nc2cc(C(F)(F)F)cc([N+](=O)[O-])c2s1,N#Cc1nc2cc(C(F)(F)F)cc([N+](=O)[O-])c2s1,<rdkit.Chem.rdchem.Mol object at 0x000001AADC9...,True,N#Cc1nc2cc(C(F)(F)F)cc([N+](=O)[O-])c2s1,False
2,BCS-AC17367,CC(=O)NSC1SC(Cl)=C(Cl)C=1Cl,inactive,8738738739,1024,1104,947,816,2709984,8475576923,...,1,add,no,CC(=O)NSc1sc(Cl)c(Cl)c1Cl,CC(=O)NSc1sc(Cl)c(Cl)c1Cl,CC(=O)NSc1sc(Cl)c(Cl)c1Cl,<rdkit.Chem.rdchem.Mol object at 0x000001AADC9...,True,CC(=O)NSc1sc(Cl)c(Cl)c1Cl,False
3,BCS-AC17565,N#C/C(/Cl)=C(\Cl)/S/C(/Cl)=C(\Cl)/C#N,inactive,8258928571,112,1086,1039,997,3059088,1007371795,...,1,add,no,N#C/C(Cl)=C(\Cl)S/C(Cl)=C(\Cl)C#N,N#CC(Cl)=C(Cl)SC(Cl)=C(Cl)C#N,N#C/C(Cl)=C(\Cl)S/C(Cl)=C(\Cl)C#N,<rdkit.Chem.rdchem.Mol object at 0x000001AADD6...,True,N#CC(Cl)=C(Cl)SC(Cl)=C(Cl)C#N,False
4,BCS-AC41114,O=C1C(=CNN1C1=CC=CC=N1)C1=CC=CC=C1F,inactive,1013392857,1002,1004,1099,1054,3184608,1059275641,...,1,add,no,O=c1c(-c2ccccc2F)c[nH]n1-c1ccccn1,O=c1c(-c2ccccc2F)c[nH]n1-c1ccccn1,O=c1c(-c2ccccc2F)c[nH]n1-c1ccccn1,<rdkit.Chem.rdchem.Mol object at 0x000001AADDD...,True,O=c1c(-c2ccccc2F)c[nH]n1-c1ccccn1,False


In [20]:
# condition 1: check whether all values of "class" are identical for a SMILES (i.e. nunique == 1) or not
data_comb = data_comb.sort_values(by = 'SMILES_STD_FLAT')
# Counting unique classes per group directly (instead of groupby.apply with a
# Python lambda) is vectorized, does not pass the grouping column to user
# code, and always yields a Series -- even when there are no rows.
bools = data_comb.groupby(by = 'SMILES_STD_FLAT')['class'].nunique().eq(1)

# `bools` is indexed by SMILES_STD_FLAT; merge matches the index level by name.
data_comb_cond1 = data_comb_dupl.merge(bools.to_frame(name='is_unique'), how='left', on='SMILES_STD_FLAT')

# Explicit comparisons are kept on purpose: after a left merge 'is_unique'
# may contain NaN, which must fall into neither subset.
data_comb_contra = data_comb_cond1[data_comb_cond1['is_unique']==False]
data_comb_equal = data_comb_cond1[data_comb_cond1['is_unique']==True]

print(data_comb_contra.shape)
print(data_comb_equal.shape)
print(data_comb_dupl.shape[0] == data_comb_contra.shape[0] + data_comb_equal.shape[0])

# for duplicate SMILES with matching results keep only first.
data_comb_equal_drop = data_comb_equal.drop_duplicates(subset='SMILES_STD_FLAT')

print(data_comb_equal_drop.shape)

(0, 23)
(0, 23)
True
(0, 23)


In [21]:
# Final table: the compounds without duplicates plus one representative per
# duplicate group with matching results. data_comb_contra is deliberately
# NOT added, as it holds the contradictory results.

print(data_comb_no_dupl.shape[0])
print(data_comb_equal_drop.shape[0])

frames_to_keep = [data_comb_no_dupl, data_comb_equal_drop]
final_data = pd.concat(frames_to_keep, ignore_index=True)

print(final_data.shape)
# Sanity check: concatenation must neither lose nor add rows.
expected_rows = data_comb_no_dupl.shape[0] + data_comb_equal_drop.shape[0]
print(final_data.shape[0] == expected_rows)

final_data.to_csv(
    'Supp file S2 Result Table 20250904_standardized_no_stereo_unique_SMILES.txt',
    sep='\t',
    header=True,
    index=False,
)

72686
0
(72686, 23)
True


In [22]:
# Preview the final, de-duplicated table that was written to disk.
final_data.head()

Unnamed: 0,Compound No,Structure,class,Activity in HTS,mean_0_24,mean_1_2,mean_6,mean_30,AUC,weighted_mean,...,origin,manually_remove,std_smiles,flat_smiles,SMILES_STD_RDKIT,MOL,check,SMILES_STD_FLAT,Duplicated,is_unique
0,BCS-AB25803,O=C(CSC1NN=CN=1)C1C=CC(F)=CC=1F,inactive,8558558559,1168,1196,1194,1082,3418272,1104153846,...,add,no,O=C(CSc1ncn[nH]1)c1ccc(F)cc1F,O=C(CSc1ncn[nH]1)c1ccc(F)cc1F,O=C(CSc1ncn[nH]1)c1ccc(F)cc1F,<rdkit.Chem.rdchem.Mol object at 0x000001AADD5...,True,O=C(CSc1ncn[nH]1)c1ccc(F)cc1F,False,
1,BCS-AB45447,N#CC1=NC2C=C(C=C(C=2S1)[N+]([O-])=O)C(F)(F)F,inactive,900990099,1037,1136,963,976,2934864,9794358974,...,add,no,N#Cc1nc2cc(C(F)(F)F)cc([N+](=O)[O-])c2s1,N#Cc1nc2cc(C(F)(F)F)cc([N+](=O)[O-])c2s1,N#Cc1nc2cc(C(F)(F)F)cc([N+](=O)[O-])c2s1,<rdkit.Chem.rdchem.Mol object at 0x000001AADC9...,True,N#Cc1nc2cc(C(F)(F)F)cc([N+](=O)[O-])c2s1,False,
2,BCS-AC17367,CC(=O)NSC1SC(Cl)=C(Cl)C=1Cl,inactive,8738738739,1024,1104,947,816,2709984,8475576923,...,add,no,CC(=O)NSc1sc(Cl)c(Cl)c1Cl,CC(=O)NSc1sc(Cl)c(Cl)c1Cl,CC(=O)NSc1sc(Cl)c(Cl)c1Cl,<rdkit.Chem.rdchem.Mol object at 0x000001AADC9...,True,CC(=O)NSc1sc(Cl)c(Cl)c1Cl,False,
3,BCS-AC17565,N#C/C(/Cl)=C(\Cl)/S/C(/Cl)=C(\Cl)/C#N,inactive,8258928571,112,1086,1039,997,3059088,1007371795,...,add,no,N#C/C(Cl)=C(\Cl)S/C(Cl)=C(\Cl)C#N,N#CC(Cl)=C(Cl)SC(Cl)=C(Cl)C#N,N#C/C(Cl)=C(\Cl)S/C(Cl)=C(\Cl)C#N,<rdkit.Chem.rdchem.Mol object at 0x000001AADD6...,True,N#CC(Cl)=C(Cl)SC(Cl)=C(Cl)C#N,False,
4,BCS-AC41114,O=C1C(=CNN1C1=CC=CC=N1)C1=CC=CC=C1F,inactive,1013392857,1002,1004,1099,1054,3184608,1059275641,...,add,no,O=c1c(-c2ccccc2F)c[nH]n1-c1ccccn1,O=c1c(-c2ccccc2F)c[nH]n1-c1ccccn1,O=c1c(-c2ccccc2F)c[nH]n1-c1ccccn1,<rdkit.Chem.rdchem.Mol object at 0x000001AADDD...,True,O=c1c(-c2ccccc2F)c[nH]n1-c1ccccn1,False,
