In [29]:
import os
import pandas as pd
import getpass as gp
#import MolStructuresToSMILES
import time
import QueryBuilding

import sys
from io import StringIO

import rdkit
from rdkit import Chem, RDLogger
from rdkit.Chem import AllChem, DataStructs, Descriptors, Descriptors3D, PandasTools, Draw
from rdkit.Chem.rdchem import Atom
from rdkit.Chem.MolStandardize import rdMolStandardize


In [30]:
# Atomic numbers of the elements a compound is allowed to contain.
ATOMIC_NO = [
    1, 5, 6, 7, 8, 9, 14, 15, 16, 17, 35, 53,  # H B C N O F Si P S Cl Br I
    3, 11, 19,                                  # Li Na K
    12, 20, 30, 29, 50,                         # Mg Ca Zn Cu Sn
]
# RDKit Atom objects built from the whitelist, in the same order.
ALLOWED_ATOMS = list(map(Atom, ATOMIC_NO))
# Validator instance shared by every call to check_allowed_atoms.
ALLOWED_ATOMS_VALIDATOR = rdMolStandardize.AllowedAtomsValidation(ALLOWED_ATOMS)

def check_allowed_atoms(mol: Chem.Mol) -> bool:
    """
    Check whether the module-level allowed-atoms validator accepts a mol
    :param mol: mol under test
    :return: True if ALLOWED_ATOMS_VALIDATOR reports no entries for mol, False otherwise
    """
    validation_errors = ALLOWED_ATOMS_VALIDATOR.validate(mol)
    if len(validation_errors) == 0:
        return True
    return False

def check_only_one_fragment(mol: Chem.rdchem.Mol) -> bool:
    """
    Check if a mol consists only of a single fragment
    :param mol: mol under test
    :return: True if mol has exactly one fragment, False otherwise
             (including a mol with no fragments at all)
    """
    # Only the fragment count is needed, so request the atom-index tuples
    # rather than asMols=True, which would build a new Mol per fragment.
    fragments = Chem.GetMolFrags(mol)
    return len(fragments) == 1

def check_mol(mol) -> bool:
    """
    Run all acceptance checks on a mol
    :param mol: mol under test (may be None)
    :return: True only if mol is not None, consists of a single fragment
             and passes the allowed-atoms check; False otherwise
    """
    # Short-circuit order matters: None must be rejected before the
    # RDKit-based checks are called.
    return (
        mol is not None
        and check_only_one_fragment(mol)
        and check_allowed_atoms(mol)
    )

In [31]:
def standardize_mol_rdkit(smiles):
    """
    Standardize a SMILES string with RDKit's rdMolStandardize tools
    (Normalize -> Uncharger.uncharge -> Reionize -> FragmentParent, in that order)
    :param smiles: input SMILES
    :return: canonical SMILES of the standardized mol;
             the sentinel string 'remove' if RDKit cannot parse the input;
             None if any step raises an exception
    """
    # Source: https://github.com/rdkit/rdkit/blob/master/Code/GraphMol/MolStandardize/TransformCatalog/normalizations.in

    # Silence the informational messages RDKit logs on the rdApp.info channel.
    RDLogger.DisableLog('rdApp.info')

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparsable input: keep the sentinel value callers already see.
            return 'remove'

        # Re-run sanitization with the CLEANUP and PROPERTIES steps switched off.
        Chem.SanitizeMol(
            mol,
            sanitizeOps=(Chem.SANITIZE_ALL ^ Chem.SANITIZE_CLEANUP ^ Chem.SANITIZE_PROPERTIES),
        )
        cm = rdMolStandardize.Normalize(mol)

        uncharger = rdMolStandardize.Uncharger()
        um = uncharger.uncharge(cm)
        im = rdMolStandardize.Reionize(um)
        lm = rdMolStandardize.FragmentParent(im)
        return Chem.MolToSmiles(lm, canonical=True)
    except Exception:
        # Best effort: a failing compound yields None instead of aborting the
        # whole column. Unlike `return` inside `finally`, this does not swallow
        # KeyboardInterrupt or SystemExit.
        return None

In [32]:
# Load the raw table; header=None means no row is used as a header, so the
# columns are addressed by integer position labels in the cells below.
dfSubs = pd.read_csv(
    'AR_binding_CoMPARA_raw.csv',
    sep=',',
    header=None,
    low_memory=False,
)

In [33]:
def _smiles_to_mol(smiles):
    """
    Parse a SMILES string into an RDKit mol
    :param smiles: SMILES string (non-string values such as None/NaN are tolerated)
    :return: RDKit mol, or None if the value is not a string or cannot be parsed
    """
    # `SmilesToMol` was never defined or imported in this notebook, so the cell
    # failed with NameError on a fresh kernel; this local parser replaces it.
    # NOTE(review): confirm the original helper did nothing beyond parsing.
    if not isinstance(smiles, str):
        return None
    return Chem.MolFromSmiles(smiles)


# Column 5 of the raw table is passed to the standardizer as the input SMILES.
start_time = time.time()
dfSubs['SMILES_STD_RDKIT'] = dfSubs[5].apply(standardize_mol_rdkit)
print("--- Chemical standardization with RDkit: %.2f s ---" % (time.time() - start_time))

dfSubs['MOL'] = dfSubs['SMILES_STD_RDKIT'].apply(_smiles_to_mol)

# check for accepted atoms and whether there is only one fragment left
dfSubs['check'] = dfSubs['MOL'].apply(check_mol)

# NOTE(review): the MOL column holds Python objects and is presumably written
# to the file as their string representation - confirm this column is wanted.
dfSubs.to_csv('AR_binding_CoMPARA_raw_standardized_SMILES_RDkit.txt', sep='\t', header=True, index=False)

--- Chemical standardization with RDkit: 2.24 s ---


In [34]:
# Keep only rows that have a non-missing, non-empty standardized SMILES.
n = len(dfSubs)
data = dfSubs[dfSubs['SMILES_STD_RDKIT'].notna() & (dfSubs['SMILES_STD_RDKIT'] != '')]
n2 = len(data)
print("Number of discarded compounds without SMILES: " + str(n - n2))

# Discard molecules that don't pass check_mol.
data = data[data["check"].eq(True)]
n3 = len(data)
print("Number of compounds that didn't pass mol check: " + str(n2 - n3))
print('Number of remaining compounds: ' + str(n3))


Number of discarded compounds without SMILES: 0
Number of compounds that didn't pass mol check: 0
Number of remaining compounds: 1689


In [35]:
def RemoveStereochemistry(mol):
    """
    Strip stereochemistry from a mol and return its canonical SMILES
    :param mol: RDKit mol; it is passed directly to Chem.RemoveStereochemistry,
                so the caller's object is modified, not a copy
    :return: canonical SMILES of the flattened mol, or None if mol is None
             or RDKit raises while processing it
    """
    if mol is None:
        return None
    try:
        Chem.RemoveStereochemistry(mol)
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Best effort per compound, but no longer a bare `except:` that would
        # also swallow KeyboardInterrupt / SystemExit.
        return None

In [36]:
# flatten stereochemistry because we will only use 2D descriptors
# `data` is a filtered slice of dfSubs; assign() returns a new frame, so the
# new column is not written into a slice (avoids SettingWithCopyWarning) and
# the cell can be re-run safely.
data = data.assign(SMILES_STD_FLAT=data['MOL'].apply(RemoveStereochemistry))
print(data.shape[0])

1689


In [37]:
# Identify duplicate AndroR measurements per SMILES.
# Column 10 of the raw table is renamed to "class"; rename() returns a new frame.
data_comb = data.rename(columns={10: "class"})
# keep=False marks every occurrence of a repeated flattened SMILES, not just the later ones.
data_comb['Duplicated'] = data_comb['SMILES_STD_FLAT'].duplicated(keep=False)
print(data_comb.shape)

# Split into rows with a repeated SMILES and rows with a unique SMILES.
data_comb_dupl = data_comb[data_comb['Duplicated']]
data_comb_no_dupl = data_comb[~data_comb['Duplicated']]

print(data_comb_dupl.shape)
print(data_comb_no_dupl.shape)
# Sanity check: the two parts must add up to the full table.
print(len(data_comb) == len(data_comb_dupl) + len(data_comb_no_dupl))

(1689, 16)
(2, 16)
(1687, 16)
True


In [38]:
# condition 1: Check if all values for "class" are identical for a SMILES (i.e. nunique = 1) or not
data_comb = data_comb.sort_values(by='SMILES_STD_FLAT')
# Built-in per-group nunique instead of groupby().apply(lambda ...): same
# result (a boolean Series indexed by SMILES_STD_FLAT) without calling a
# Python lambda once per group.
bools = data_comb.groupby(by='SMILES_STD_FLAT')['class'].nunique().eq(1)

# Attach the per-SMILES flag to the duplicated rows; `on` matches the column
# on the left and the index named SMILES_STD_FLAT on the right.
data_comb_cond1 = data_comb_dupl.merge(bools.to_frame(name='is_unique'), how='left', on='SMILES_STD_FLAT')

# contradictory instances
data_comb_contra = data_comb_cond1[data_comb_cond1['is_unique'].eq(False)]
# duplicates with same label
data_comb_equal = data_comb_cond1[data_comb_cond1['is_unique'].eq(True)]

print(data_comb_contra.shape)
print(data_comb_equal.shape)
# Sanity check: every duplicated row is either contradictory or consistent.
print(data_comb_dupl.shape[0] == data_comb_contra.shape[0] + data_comb_equal.shape[0])

# Keep a single row per SMILES among the consistent duplicates.
data_comb_equal_drop = data_comb_equal.drop_duplicates(subset='SMILES_STD_FLAT')

print(data_comb_equal_drop.shape)

(2, 17)
(0, 17)
True
(0, 17)


In [39]:
# Final table: rows with a unique SMILES plus the de-duplicated consistent
# duplicates. data_comb_contra is deliberately left out because it holds the
# contradictory results.

print(len(data_comb_no_dupl))
print(len(data_comb_equal_drop))

final_data = pd.concat((data_comb_no_dupl, data_comb_equal_drop), ignore_index=True)

print(final_data.shape)
# Sanity check: no rows were lost or added by the concatenation.
print(len(final_data) == len(data_comb_no_dupl) + len(data_comb_equal_drop))

final_data.to_csv(
    'AR_binding_CoMPARA_raw_no_stereo_unique_SMILES.txt',
    sep='\t',
    header=True,
    index=False,
)

1687
0
(1687, 17)
True
