In [3]:
import cx_Oracle as co
co.init_oracle_client(lib_dir=r"C:\oracle\instantclient_19_21")

import os
import pandas as pd
import getpass as gp
#import MolStructuresToSMILES
import time
import QueryBuilding

import sys
from io import StringIO

import rdkit
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem, DataStructs, Descriptors, Descriptors3D, PandasTools, Draw
from rdkit.Chem.rdchem import Atom
from rdkit.Chem.MolStandardize import rdMolStandardize

import molvs
from molvs.fragment import LargestFragmentChooser

In [4]:
# Whitelist of atomic numbers accepted by the allowed-atoms check.
# Caesium (55) was removed; silicon (14) is kept (advice from Nina).
# TODO: maybe also discard Na, K, ... because this check happens after the
# largest fragment has been kept!
ATOMIC_NO = [
    1, 5, 6, 7, 8, 9,        # H, B, C, N, O, F
    14, 15, 16, 17, 35, 53,  # Si, P, S, Cl, Br, I
    3, 11, 19,               # Li, Na, K
    12, 20, 30, 29, 50,      # Mg, Ca, Zn, Cu, Sn
]
ALLOWED_ATOMS = list(map(Atom, ATOMIC_NO))
ALLOWED_ATOMS_VALIDATOR = rdMolStandardize.AllowedAtomsValidation(ALLOWED_ATOMS)

def check_allowed_atoms(mol: Chem.Mol) -> bool:
    """
    Check if a mol contains only atoms from the ALLOWED_ATOMS whitelist
    :param mol: mol under test
    :return: True if ALLOWED_ATOMS_VALIDATOR reports nothing for mol, False otherwise
    """
    return len(ALLOWED_ATOMS_VALIDATOR.validate(mol)) == 0

def check_only_one_fragment(mol: Chem.rdchem.Mol) -> bool:
    """
    Check if a mol consists only of a single fragment
    :param mol: mol under test
    :return: True if mol consists of exactly one fragment, False otherwise
    """
    return len(Chem.GetMolFrags(mol, asMols=True)) == 1

def check_mol(mol) -> bool:
    """
    Run all acceptance checks on a mol
    :param mol: mol under test (may be None)
    :return: False if mol is None, has more than one fragment or fails the
             allowed-atoms check; True otherwise
    """
    if mol is None:
        return False
    # all() short-circuits, so the atom check only runs for single-fragment mols
    return all(check(mol) for check in (check_only_one_fragment, check_allowed_atoms))

In [5]:
def SmilesToMol(smiles):
    """
    Parse a SMILES string into an RDKit mol
    :param smiles: SMILES string; any non-string value (e.g. None or NaN from
                   a DataFrame column) is treated as missing
    :return: result of Chem.MolFromSmiles(smiles), or None if the input is not
             a string or the parser raised
    """
    if not isinstance(smiles, str):
        return None
    try:
        return Chem.MolFromSmiles(smiles)
    except Exception:
        # Exception instead of a bare except: KeyboardInterrupt / SystemExit
        # must still be able to stop a long DataFrame.apply run.
        return None

In [6]:
def MolBlockToOriginalSMILES(molblock):
    """
    Convert a mol block into a canonical SMILES string
    :param molblock: object whose read() returns the mol block text
                     (presumably a database LOB - TODO confirm against caller)
    :return: canonical SMILES, or '' if the mol block could not be read,
             parsed or converted
    """
    try:
        mol = Chem.MolFromMolBlock(molblock.read())
        if mol is None:
            # unparsable mol block: report "no SMILES" explicitly instead of
            # relying on MolToSmiles(None) raising
            return ''
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Exception instead of a bare except so KeyboardInterrupt / SystemExit
        # are not swallowed
        return ''

In [7]:
def standardize_mol_rdkit(smiles):
    """
    Standardize a SMILES with the RDKit MolStandardize tools
    Steps, in order: sanitize (all operations except CLEANUP and PROPERTIES),
    Normalize, Uncharger.uncharge, Reionize, FragmentParent, then write a
    canonical SMILES. Tautomer canonicalization is intentionally not applied.
    :param smiles: input SMILES string
    :return: canonical SMILES of the standardized mol; the string 'remove' if
             the SMILES could not be parsed; None if any step raised
    """
    # Source: https://github.com/rdkit/rdkit/blob/master/Code/GraphMol/MolStandardize/TransformCatalog/normalizations.in

    RDLogger.DisableLog('rdApp.info')

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # sentinel for unparsable input; the caller is expected to filter it out
            return 'remove'

        Chem.SanitizeMol(mol, sanitizeOps=(Chem.SANITIZE_ALL ^ Chem.SANITIZE_CLEANUP ^ Chem.SANITIZE_PROPERTIES))
        normalized = rdMolStandardize.Normalize(mol)

        uncharger = rdMolStandardize.Uncharger()
        uncharged = uncharger.uncharge(normalized)

        reionized = rdMolStandardize.Reionize(uncharged)

        parent = rdMolStandardize.FragmentParent(reionized)

        #te = rdMolStandardize.TautomerEnumerator()  # Kai strongly recommends to NOT use the default canonical_smiles generator from RDkit
        #std_mol = te.Canonicalize(parent)

        # get smiles for standardized molecule
        return Chem.MolToSmiles(parent, canonical=True)
    except Exception:
        # The original used `finally: return ...`, which discards every
        # exception, including KeyboardInterrupt. Only ordinary errors are
        # mapped to None here, so a long run can still be interrupted.
        return None

In [9]:
# Load the tab-separated result table; the commented-out calls below are
# alternative input files.
dfSubs = pd.read_csv(
    'Supp file S2 Result Table 20250904.txt',
    sep='\t',
    low_memory=False,
)
#dfSubs=pd.read_csv('andror_df_all_clusters.csv',sep=',', low_memory=False)
#dfSubs=pd.read_csv('all_AndroR_bind_data_Sept_2024_with_SMILES_from_BioDok.txt',sep='\t', low_memory=False)

In [10]:
# Preview the first rows of the loaded table
dfSubs.head()

Unnamed: 0,flat_smiles,final class,Activity in primary HTS,Activity_0_24,Activity_1_2,Activity_6,Activity_30,MurckoCore,pIC50,Frequent Hitter in HTS,Thiol reactivity,Similarity to most similar counterpart in public domain,Source of most similar counterpart,SMILES of most similar counterpart,ID
0,CC1=C(C#N)C(c2ccccc2C(F)(F)F)C=C(C(F)(F)F)N1,inhibitor,1.5,7.0,3.7,-8.0,-9.1,C1=CC=CC=C1C1C=CNC=C1,>6.6,,no SH-reactivity dtcd,0.24,Pubchem as curated by Lunghini,Oc1ccccc1c2ccc(C#N)c(c2)C(F)(F)F,Mol_1
1,CCN1CCCCN1c1ccc(C#N)c(C(F)(F)F)c1,inhibitor,0.6,13.1,2.5,-1.5,-6.0,C1CCCN(N1)C1C=CC=CC=1,>6.6,,no SH-reactivity dtcd,0.39,Pubchem as curated by Lunghini,OC1CC2CCC(C1)N2c3ccc(C#N)c(c3)C(F)(F)F,Mol_2
2,S=c1scc(C2(Cl)CC2)n1Nc1ccc(Cl)cc1,inhibitor,,27.4,1.9,-6.7,3.7,S=C1SC=C(C2CC2)N1NC1C=CC=CC=1,>6.6,,,0.19,Compara,Clc1ccc(NC(=N)NC(=N)NCCCCCCNC(=N)NC(=N)Nc2ccc(...,Mol_3
3,CC(C)N1CCSC1=Nc1ccc(C#N)c(C(F)(F)F)c1,inhibitor,2.3,16.7,8.2,0.0,-6.6,C1CS/C(/N1)=N/C1C=CC=CC=1,>6.6,,no SH-reactivity dtcd,0.34,Pubchem as curated by Lunghini,CC(C)Sc1ccc(C#N)c(c1)C(F)(F)F,Mol_4
4,O=[N+]([O-])c1ccc(Oc2ccc(O)cc2Cl)cc1,inhibitor,-0.3,33.7,6.4,-4.0,-3.7,C1=CC=CC=C1OC1C=CC=CC=1,>6.6,,no SH-reactivity dtcd,0.65,Compara,[O-][N+](=O)c1ccc(Oc2ccc(Cl)cc2Cl)cc1,Mol_5


In [11]:
# Standardize every input SMILES and report how long it took
start_time = time.time()
dfSubs['SMILES_STD_RDKIT'] = dfSubs['flat_smiles'].apply(standardize_mol_rdkit)
print("--- Chemical standardization with RDkit: %.2f s ---" % (time.time() - start_time))

# Re-parse the standardized SMILES; unparsable entries become None
dfSubs['MOL'] = dfSubs['SMILES_STD_RDKIT'].apply(SmilesToMol)

# check for accepted atoms and whether there is only one fragment left
dfSubs['check'] = dfSubs['MOL'].apply(check_mol)

# MOL holds live RDKit objects. Written to text they become
# "<rdkit.Chem.rdchem.Mol object at 0x...>" strings that carry no information
# and differ on every run, so the column is left out of the export.
dfSubs.drop(columns=['MOL']).to_csv(
    'Supp file S2 Result Table 20250904_standardized_SMILES_RDkit.txt',
    sep='\t', header=True, index=False,
)

--- Chemical standardization with RDkit: 48.69 s ---


In [12]:
# discard empty SMILES
n = dfSubs.shape[0]
data = dfSubs.dropna(subset=['SMILES_STD_RDKIT'], axis=0)
data = data[data['SMILES_STD_RDKIT'] != '']
n2 = data.shape[0]
print(f"Number of discarded compounds without SMILES: {n - n2}")

# discard molecules that don't pass check_mol
# .copy() makes `data` an independent frame, so columns added to it later do
# not raise SettingWithCopyWarning when rows were actually filtered out here.
data = data[data["check"] == True].copy()
n3 = data.shape[0]
print(f"Number of compounds that didn't pass mol check: {n2 - n3}")
print(f"Number of remaining compounds: {n3}")

# Next steps:
# flatten stereochemistry and recompute SMILES

# discard duplicate SMILES with conflicting activity class

Number of discarded compounds without SMILES: 0
Number of compounds that didn't pass mol check: 0
Number of remaining compounds: 24953


In [13]:
def RemoveStereochemistry(mol):
    """
    Return the canonical SMILES of a mol with all stereochemistry removed
    The input mol is left untouched: Chem.RemoveStereochemistry modifies its
    argument in place, so it is applied to a copy.
    :param mol: mol to flatten (may be None)
    :return: canonical SMILES without stereo information, or None if mol is
             None or RDKit raised
    """
    if mol is None:
        return None
    try:
        flat_mol = Chem.Mol(mol)  # copy, so the caller's mol keeps its stereo
        Chem.RemoveStereochemistry(flat_mol)
        return Chem.MolToSmiles(flat_mol, canonical=True)
    except Exception:
        # Exception instead of a bare except so KeyboardInterrupt / SystemExit
        # are not swallowed
        return None

In [14]:
# flatten stereochemistry because we will only use 2D descriptors
# assign() builds a new frame rather than writing a column into `data`, which
# may be a filtered slice of dfSubs (avoids SettingWithCopyWarning).
data = data.assign(SMILES_STD_FLAT=data['MOL'].apply(RemoveStereochemistry))
print(data.shape[0])
# MOL holds live RDKit objects whose text form is only a memory-address repr;
# leave it out of the export.
data.drop(columns=['MOL']).to_csv(
    'Supp file S2 Result Table 20250904_standardized_SMILES_RDkit_no_stereo.txt',
    sep='\t', header=True, index=False,
)


24953


In [15]:
# Preview the filtered table, now including the SMILES_STD_FLAT column
data.head()

Unnamed: 0,flat_smiles,final class,Activity in primary HTS,Activity_0_24,Activity_1_2,Activity_6,Activity_30,MurckoCore,pIC50,Frequent Hitter in HTS,Thiol reactivity,Similarity to most similar counterpart in public domain,Source of most similar counterpart,SMILES of most similar counterpart,ID,SMILES_STD_RDKIT,MOL,check,SMILES_STD_FLAT
0,CC1=C(C#N)C(c2ccccc2C(F)(F)F)C=C(C(F)(F)F)N1,inhibitor,1.5,7.0,3.7,-8.0,-9.1,C1=CC=CC=C1C1C=CNC=C1,>6.6,,no SH-reactivity dtcd,0.24,Pubchem as curated by Lunghini,Oc1ccccc1c2ccc(C#N)c(c2)C(F)(F)F,Mol_1,CC1=C(C#N)C(c2ccccc2C(F)(F)F)C=C(C(F)(F)F)N1,<rdkit.Chem.rdchem.Mol object at 0x00000216820...,True,CC1=C(C#N)C(c2ccccc2C(F)(F)F)C=C(C(F)(F)F)N1
1,CCN1CCCCN1c1ccc(C#N)c(C(F)(F)F)c1,inhibitor,0.6,13.1,2.5,-1.5,-6.0,C1CCCN(N1)C1C=CC=CC=1,>6.6,,no SH-reactivity dtcd,0.39,Pubchem as curated by Lunghini,OC1CC2CCC(C1)N2c3ccc(C#N)c(c3)C(F)(F)F,Mol_2,CCN1CCCCN1c1ccc(C#N)c(C(F)(F)F)c1,<rdkit.Chem.rdchem.Mol object at 0x00000216820...,True,CCN1CCCCN1c1ccc(C#N)c(C(F)(F)F)c1
2,S=c1scc(C2(Cl)CC2)n1Nc1ccc(Cl)cc1,inhibitor,,27.4,1.9,-6.7,3.7,S=C1SC=C(C2CC2)N1NC1C=CC=CC=1,>6.6,,,0.19,Compara,Clc1ccc(NC(=N)NC(=N)NCCCCCCNC(=N)NC(=N)Nc2ccc(...,Mol_3,S=c1scc(C2(Cl)CC2)n1Nc1ccc(Cl)cc1,<rdkit.Chem.rdchem.Mol object at 0x00000216820...,True,S=c1scc(C2(Cl)CC2)n1Nc1ccc(Cl)cc1
3,CC(C)N1CCSC1=Nc1ccc(C#N)c(C(F)(F)F)c1,inhibitor,2.3,16.7,8.2,0.0,-6.6,C1CS/C(/N1)=N/C1C=CC=CC=1,>6.6,,no SH-reactivity dtcd,0.34,Pubchem as curated by Lunghini,CC(C)Sc1ccc(C#N)c(c1)C(F)(F)F,Mol_4,CC(C)N1CCSC1=Nc1ccc(C#N)c(C(F)(F)F)c1,<rdkit.Chem.rdchem.Mol object at 0x00000216820...,True,CC(C)N1CCSC1=Nc1ccc(C#N)c(C(F)(F)F)c1
4,O=[N+]([O-])c1ccc(Oc2ccc(O)cc2Cl)cc1,inhibitor,-0.3,33.7,6.4,-4.0,-3.7,C1=CC=CC=C1OC1C=CC=CC=1,>6.6,,no SH-reactivity dtcd,0.65,Compara,[O-][N+](=O)c1ccc(Oc2ccc(Cl)cc2Cl)cc1,Mol_5,O=[N+]([O-])c1ccc(Oc2ccc(O)cc2Cl)cc1,<rdkit.Chem.rdchem.Mol object at 0x00000216820...,True,O=[N+]([O-])c1ccc(Oc2ccc(O)cc2Cl)cc1


In [16]:
# Identify duplicate AndroR measurements per SMILES:
# rename "final class" to "class" and flag every row whose flattened SMILES
# occurs more than once (keep=False marks all members of a duplicate group).
data_comb = (
    data
    .rename(columns={"final class": "class"})
    .assign(Duplicated=lambda frame: frame['SMILES_STD_FLAT'].duplicated(keep=False))
)
print(data_comb.shape)

is_duplicate = data_comb['Duplicated']
data_comb_dupl = data_comb[is_duplicate]
data_comb_no_dupl = data_comb[~is_duplicate]

print(data_comb_dupl.shape)
print(data_comb_no_dupl.shape)
# sanity check: the two subsets partition data_comb
print(data_comb_dupl.shape[0] + data_comb_no_dupl.shape[0] == data_comb.shape[0])

(24953, 20)
(2, 20)
(24951, 20)
True


In [17]:
# Preview data_comb with the renamed "class" column and the Duplicated flag
data_comb.head()

Unnamed: 0,flat_smiles,class,Activity in primary HTS,Activity_0_24,Activity_1_2,Activity_6,Activity_30,MurckoCore,pIC50,Frequent Hitter in HTS,Thiol reactivity,Similarity to most similar counterpart in public domain,Source of most similar counterpart,SMILES of most similar counterpart,ID,SMILES_STD_RDKIT,MOL,check,SMILES_STD_FLAT,Duplicated
0,CC1=C(C#N)C(c2ccccc2C(F)(F)F)C=C(C(F)(F)F)N1,inhibitor,1.5,7.0,3.7,-8.0,-9.1,C1=CC=CC=C1C1C=CNC=C1,>6.6,,no SH-reactivity dtcd,0.24,Pubchem as curated by Lunghini,Oc1ccccc1c2ccc(C#N)c(c2)C(F)(F)F,Mol_1,CC1=C(C#N)C(c2ccccc2C(F)(F)F)C=C(C(F)(F)F)N1,<rdkit.Chem.rdchem.Mol object at 0x00000216820...,True,CC1=C(C#N)C(c2ccccc2C(F)(F)F)C=C(C(F)(F)F)N1,False
1,CCN1CCCCN1c1ccc(C#N)c(C(F)(F)F)c1,inhibitor,0.6,13.1,2.5,-1.5,-6.0,C1CCCN(N1)C1C=CC=CC=1,>6.6,,no SH-reactivity dtcd,0.39,Pubchem as curated by Lunghini,OC1CC2CCC(C1)N2c3ccc(C#N)c(c3)C(F)(F)F,Mol_2,CCN1CCCCN1c1ccc(C#N)c(C(F)(F)F)c1,<rdkit.Chem.rdchem.Mol object at 0x00000216820...,True,CCN1CCCCN1c1ccc(C#N)c(C(F)(F)F)c1,False
2,S=c1scc(C2(Cl)CC2)n1Nc1ccc(Cl)cc1,inhibitor,,27.4,1.9,-6.7,3.7,S=C1SC=C(C2CC2)N1NC1C=CC=CC=1,>6.6,,,0.19,Compara,Clc1ccc(NC(=N)NC(=N)NCCCCCCNC(=N)NC(=N)Nc2ccc(...,Mol_3,S=c1scc(C2(Cl)CC2)n1Nc1ccc(Cl)cc1,<rdkit.Chem.rdchem.Mol object at 0x00000216820...,True,S=c1scc(C2(Cl)CC2)n1Nc1ccc(Cl)cc1,False
3,CC(C)N1CCSC1=Nc1ccc(C#N)c(C(F)(F)F)c1,inhibitor,2.3,16.7,8.2,0.0,-6.6,C1CS/C(/N1)=N/C1C=CC=CC=1,>6.6,,no SH-reactivity dtcd,0.34,Pubchem as curated by Lunghini,CC(C)Sc1ccc(C#N)c(c1)C(F)(F)F,Mol_4,CC(C)N1CCSC1=Nc1ccc(C#N)c(C(F)(F)F)c1,<rdkit.Chem.rdchem.Mol object at 0x00000216820...,True,CC(C)N1CCSC1=Nc1ccc(C#N)c(C(F)(F)F)c1,False
4,O=[N+]([O-])c1ccc(Oc2ccc(O)cc2Cl)cc1,inhibitor,-0.3,33.7,6.4,-4.0,-3.7,C1=CC=CC=C1OC1C=CC=CC=1,>6.6,,no SH-reactivity dtcd,0.65,Compara,[O-][N+](=O)c1ccc(Oc2ccc(Cl)cc2Cl)cc1,Mol_5,O=[N+]([O-])c1ccc(Oc2ccc(O)cc2Cl)cc1,<rdkit.Chem.rdchem.Mol object at 0x00000216820...,True,O=[N+]([O-])c1ccc(Oc2ccc(O)cc2Cl)cc1,False


In [18]:
# Condition 1: check whether all values of "class" are identical for a SMILES
# (i.e. nunique == 1) or not: are there contradictory entries among the duplicates?
data_comb = data_comb.sort_values(by='SMILES_STD_FLAT')
# Built-in nunique on the selected column instead of groupby.apply with a
# lambda: same result, faster, and no pandas deprecation warning about apply
# operating on the grouping columns.
bools = data_comb.groupby(by='SMILES_STD_FLAT')['class'].nunique().eq(1)

# for duplicate SMILES none is of class = "conflict" (manually checked in Spotfire!) -> we don't have to worry about this special case

data_comb_cond1 = data_comb_dupl.merge(bools.to_frame(name='is_unique'), how='left', on='SMILES_STD_FLAT')

# contradictory entries:
data_comb_contra = data_comb_cond1[data_comb_cond1['is_unique']==False]
# duplicates with identical result
data_comb_equal = data_comb_cond1[data_comb_cond1['is_unique']==True]

print(data_comb_contra.shape)
print(data_comb_equal.shape)
print(data_comb_dupl.shape[0] == data_comb_contra.shape[0] + data_comb_equal.shape[0])

# for duplicate SMILES with matching results keep only the first row
# (the earlier "57" in this comment did not match the shapes printed above)
data_comb_equal_drop = data_comb_equal.drop_duplicates(subset='SMILES_STD_FLAT')

print(data_comb_equal_drop.shape)

(0, 21)
(2, 21)
True
(1, 21)


In [19]:
# Final table: combine the rows without duplicates with the cleaned duplicates.
# Do NOT add data_comb_contra (these rows contain contradictory results).

print(data_comb_no_dupl.shape[0])
print(data_comb_equal_drop.shape[0])

# pd.concat replaces the former DataFrame.append call
final_data = pd.concat([data_comb_no_dupl, data_comb_equal_drop], ignore_index=True)

print(final_data.shape)
print(final_data.shape[0] == data_comb_no_dupl.shape[0] + data_comb_equal_drop.shape[0])

# MOL holds live RDKit objects whose text form is only a memory-address repr;
# leave it out of the export.
final_data.drop(columns=['MOL']).to_csv(
    'Supp file S2 Result Table 20250904_standardized_no_stereo_unique_SMILES.txt',
    sep='\t', header=True, index=False,
)

24951
1
(24952, 21)
True


In [20]:
# Preview the final table written to disk above
final_data.head()

Unnamed: 0,flat_smiles,class,Activity in primary HTS,Activity_0_24,Activity_1_2,Activity_6,Activity_30,MurckoCore,pIC50,Frequent Hitter in HTS,...,Similarity to most similar counterpart in public domain,Source of most similar counterpart,SMILES of most similar counterpart,ID,SMILES_STD_RDKIT,MOL,check,SMILES_STD_FLAT,Duplicated,is_unique
0,CC1=C(C#N)C(c2ccccc2C(F)(F)F)C=C(C(F)(F)F)N1,inhibitor,1.5,7.0,3.7,-8.0,-9.1,C1=CC=CC=C1C1C=CNC=C1,>6.6,,...,0.24,Pubchem as curated by Lunghini,Oc1ccccc1c2ccc(C#N)c(c2)C(F)(F)F,Mol_1,CC1=C(C#N)C(c2ccccc2C(F)(F)F)C=C(C(F)(F)F)N1,<rdkit.Chem.rdchem.Mol object at 0x00000216820...,True,CC1=C(C#N)C(c2ccccc2C(F)(F)F)C=C(C(F)(F)F)N1,False,
1,CCN1CCCCN1c1ccc(C#N)c(C(F)(F)F)c1,inhibitor,0.6,13.1,2.5,-1.5,-6.0,C1CCCN(N1)C1C=CC=CC=1,>6.6,,...,0.39,Pubchem as curated by Lunghini,OC1CC2CCC(C1)N2c3ccc(C#N)c(c3)C(F)(F)F,Mol_2,CCN1CCCCN1c1ccc(C#N)c(C(F)(F)F)c1,<rdkit.Chem.rdchem.Mol object at 0x00000216820...,True,CCN1CCCCN1c1ccc(C#N)c(C(F)(F)F)c1,False,
2,S=c1scc(C2(Cl)CC2)n1Nc1ccc(Cl)cc1,inhibitor,,27.4,1.9,-6.7,3.7,S=C1SC=C(C2CC2)N1NC1C=CC=CC=1,>6.6,,...,0.19,Compara,Clc1ccc(NC(=N)NC(=N)NCCCCCCNC(=N)NC(=N)Nc2ccc(...,Mol_3,S=c1scc(C2(Cl)CC2)n1Nc1ccc(Cl)cc1,<rdkit.Chem.rdchem.Mol object at 0x00000216820...,True,S=c1scc(C2(Cl)CC2)n1Nc1ccc(Cl)cc1,False,
3,CC(C)N1CCSC1=Nc1ccc(C#N)c(C(F)(F)F)c1,inhibitor,2.3,16.7,8.2,0.0,-6.6,C1CS/C(/N1)=N/C1C=CC=CC=1,>6.6,,...,0.34,Pubchem as curated by Lunghini,CC(C)Sc1ccc(C#N)c(c1)C(F)(F)F,Mol_4,CC(C)N1CCSC1=Nc1ccc(C#N)c(C(F)(F)F)c1,<rdkit.Chem.rdchem.Mol object at 0x00000216820...,True,CC(C)N1CCSC1=Nc1ccc(C#N)c(C(F)(F)F)c1,False,
4,O=[N+]([O-])c1ccc(Oc2ccc(O)cc2Cl)cc1,inhibitor,-0.3,33.7,6.4,-4.0,-3.7,C1=CC=CC=C1OC1C=CC=CC=1,>6.6,,...,0.65,Compara,[O-][N+](=O)c1ccc(Oc2ccc(Cl)cc2Cl)cc1,Mol_5,O=[N+]([O-])c1ccc(Oc2ccc(O)cc2Cl)cc1,<rdkit.Chem.rdchem.Mol object at 0x00000216820...,True,O=[N+]([O-])c1ccc(Oc2ccc(O)cc2Cl)cc1,False,
