In [1]:
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import PandasTools

from UpdateDB import Connect_CII as connector
from UpdateDB import Dataset_selection as datasel


*** CompoundDB module found. Will check the synonyms table to resolve CAS. ***



In [2]:
conn = connector.Connector(host = '', dbname='', user='', password='')
conn.open_connection()

<connection object at 0x7f85103bf048; dsn: 'user=postgres password=xxx dbname=cii_pharos_updated host=localhost', closed: 0>

In [3]:
sub_df = pd.read_sql_query("""SELECT class_name_curated, preferred_name_curated, mol_formula_curated, 
                            str."structure_curated", st.type, ep.cmr, ep.pbt, ep.vpvb, 
                            ep.sensitiser, ep.endocrine_disruptor
                            FROM substance sub
                            left join substance_structure str on str.subs_id = sub.id
                            left join endpoint_annotation ep on ep.subs_id = sub.id
                            left join substance_type st on st.id = str.substance_type_id
                            where str."structure_curated" is not null  
                            order by sub.id ASC""", conn.conn)
sub_df.loc[sub_df['preferred_name_curated'].isna(),'preferred_name_curated'] = sub_df.loc[sub_df['preferred_name_curated'].isna(),'class_name_curated']
sub_df.drop('class_name_curated', axis=1, inplace=True)
sub_df.rename(columns={'preferred_name_curated':'name'},inplace = True)

In [4]:
sub_df = sub_df.loc[~sub_df['type'].isin(['organometallic','no_sanitizable', 'inorganic_salt', 'inorganic','inorganic_metal'])]

In [5]:
ed = sub_df[['name','mol_formula_curated','structure_curated','endocrine_disruptor']]

In [6]:
ed.loc[ed['endocrine_disruptor'].isin(['YES','Pending']), 'activity'] = 1
ed.loc[ed['endocrine_disruptor'].isin(['NO','No information']), 'activity'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [7]:
ed['activity'] = ed['activity'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [8]:
def canon_smi(x):
    try:
        cs = Chem.CanonSmiles(x)
    except:
        cs = None
    return cs

In [9]:
ed.loc[:, 'canon_smiles'] = ed.loc[:,'structure_curated'].apply(lambda x: canon_smi(x))

RDKit ERROR: [16:32:25] Explicit valence for atom # 12 O, 3, is greater than permitted
RDKit ERROR: [16:32:25] Explicit valence for atom # 11 O, 3, is greater than permitted


In [10]:
ed.loc[~ed['canon_smiles'].isna(),'mols_rdkit'] = ed.loc[~ed['canon_smiles'].isna(),'canon_smiles'].apply(lambda x: Chem.MolFromSmiles(x))



In [11]:
ed.loc[~ed['canon_smiles'].isna(),'fps'] = ed.loc[~ed['canon_smiles'].isna(),'mols_rdkit'].apply(lambda x: FingerprintMols.FingerprintMol(x))

In [12]:
clean_ed = ed[~ed['fps'].isna()]

In [13]:
clean_ed.head()

Unnamed: 0,name,mol_formula_curated,structure_curated,endocrine_disruptor,activity,canon_smiles,mols_rdkit,fps
0,Formaldehyde,CH2O,OCO,No information,0,OCO,"<img data-content=""rdkit/molecule"" src=""data:i...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Formaldehyde,CH2O,C=O,No information,0,C=O,"<img data-content=""rdkit/molecule"" src=""data:i...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"N,N-Dimethylformamide",C3H7NO,CN(C)C=O,YES,1,CN(C)C=O,"<img data-content=""rdkit/molecule"" src=""data:i...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
3,Quinoline,C9H7N,c1ccc2ncccc2c1,No information,0,c1ccc2ncccc2c1,"<img data-content=""rdkit/molecule"" src=""data:i...","[1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, ..."
4,N-Nitrosodimethylamine,C2H6N2O,CN(C)N=O,No information,0,CN(C)N=O,"<img data-content=""rdkit/molecule"" src=""data:i...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ..."


In [14]:
len(clean_ed[clean_ed['activity'] == 1])

187

In [15]:
%%time
comparison_dict = {'name':[],'name_structure':[], 'target_name':[],'target_structure':[],'activity':[],'target_activity':[],'similarity':[]}
index_to_avoid = []
for i, row in clean_ed.iterrows():
    name = row['name']
    struc = row['canon_smiles']
    activity = row['activity']
    fps = row['fps']
    index_to_avoid.append(i)
    fps_to_compare = clean_ed.loc[~clean_ed.index.isin(index_to_avoid),'fps'].values
    index_to_consider = clean_ed.loc[~clean_ed.index.isin(index_to_avoid),:].index
    try:
        s = DataStructs.BulkTanimotoSimilarity(fps, fps_to_compare)
    except TypeError:
        print(clean_ed[clean_ed.index.isin([i])])
        raise
    for sim,idx in zip(s,index_to_consider):
        comparison_dict['name'].append(name)
        comparison_dict['name_structure'].append(struc)
        comparison_dict['target_name'].append(clean_ed.loc[clean_ed.index.isin([idx]),'name'].values[0])
        comparison_dict['target_structure'].append(clean_ed.loc[clean_ed.index.isin([idx]),'canon_smiles'].values[0])
        comparison_dict['activity'].append(activity)
        comparison_dict['target_activity'].append(clean_ed.loc[clean_ed.index.isin([idx]),'activity'].values[0])
        comparison_dict['similarity'].append(sim)

<rdkit.DataStructs.cDataStructs.ExplicitBitVect object at 0x7f850fa43120>
CPU times: user 5.87 ms, sys: 1.04 ms, total: 6.9 ms
Wall time: 5.57 ms


In [16]:
comp_df = pd.DataFrame(data=comparison_dict)
comp_df = comp_df.sort_values(by=['name','similarity'], ascending=False)

In [22]:
comp_df.to_pickle('ED_similarity.pkl')

In [23]:
comp_df = pd.read_pickle('ED_similarity.pkl')

In [24]:
comp_df.head()

Unnamed: 0,name,name_structure,target_name,target_structure,activity,target_activity,similarity
649446,γ-HCH or γ-BHC,ClC1C(Cl)C(Cl)C(Cl)C(Cl)C1Cl,"(1α,2β,3α,4β,5α,6β)-1,2,3,4,5,6-Hexachlorocycl...",ClC1C(Cl)C(Cl)C(Cl)C(Cl)C1Cl,1,1,1.0
649447,γ-HCH or γ-BHC,ClC1C(Cl)C(Cl)C(Cl)C(Cl)C1Cl,"(1α,2α,3α,4β,5α,6β)-1,2,3,4,5,6-Hexachlorocycl...",ClC1C(Cl)C(Cl)C(Cl)C(Cl)C1Cl,1,1,1.0
652070,γ-HCH or γ-BHC,ClC1C(Cl)C(Cl)C(Cl)C(Cl)C1Cl,Chlorinated paraffins,CCCC(Cl)CCCC(Cl)CCC(Cl)CCC(Cl)CCC(Cl)CCCC(Cl)CCC,1,0,0.589041
652363,γ-HCH or γ-BHC,ClC1C(Cl)C(Cl)C(Cl)C(Cl)C1Cl,chlorocyclohexane,ClC1CCCCC1,1,0,0.588235
651334,γ-HCH or γ-BHC,ClC1C(Cl)C(Cl)C(Cl)C(Cl)C1Cl,"2-Hexanone, 6-chloro-",CC(=O)CCCCCl,1,0,0.566038


In [25]:
len(comp_df.loc[comp_df['activity'] == 1, 'name'].drop_duplicates())

173

In [26]:
len(comp_df.loc[comp_df['target_activity'] == 0, 'target_name'].drop_duplicates())

3203

In [27]:
len(comp_df['name'].drop_duplicates())

3374

In [28]:
df_optimal = comp_df.loc[(comp_df['similarity'] > 0.8) &
                         (comp_df['similarity'] < 1.0) &
                         (comp_df['activity'] == 1) &
                         (comp_df['target_activity'] == 0)]

In [29]:
df_pos_sel = comp_df.loc[(comp_df['similarity'] > 0.8) &
                         (comp_df['similarity'] < 1.0) &
                         (comp_df['activity'] == 0) &
                         (comp_df['target_activity'] == 1)]

In [30]:
df_not_similar = comp_df.loc[(comp_df['similarity'] < 0.4) &
                         (comp_df['activity'] == 1) &
                         (comp_df['target_activity'] == 0)]

In [31]:
ed[ed['endocrine_disruptor'] == 'NO']

Unnamed: 0,name,mol_formula_curated,structure_curated,endocrine_disruptor,activity,canon_smiles,mols_rdkit,fps
2652,Dimethylcarbamoyl chloride,C3H6ClNO,CN(C)C(=O)Cl,NO,0,CN(C)C(=O)Cl,"<img data-content=""rdkit/molecule"" src=""data:i...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
3687,Terpineol,C10H18O,CC1=CCC(C(C)(C)O)CC1,NO,0,CC1=CCC(C(C)(C)O)CC1,"<img data-content=""rdkit/molecule"" src=""data:i...","[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, ..."


In [32]:
comp_df[(comp_df['name'].isin(ed.loc[ed['endocrine_disruptor'] == 'NO','name'])) &
        (comp_df['target_activity'] == 0)].sort_values(by=['similarity'], ascending=False)

In [33]:
df_optimal.to_excel('ED_RAX_results_similairty_ht_0.8.xlsx')

In [34]:
len(df_optimal.loc[df_optimal['activity'] == 1, 'name'].drop_duplicates())

52

In [35]:
len(df_optimal.loc[df_optimal['target_activity'] == 0, 'target_name'].drop_duplicates())

76

In [36]:
not_sim_sample = df_not_similar.loc[df_not_similar['target_activity'] == 0, 'target_name'].drop_duplicates().sample(16).values

In [37]:
not_sim_selection = df_not_similar.loc[df_not_similar['target_name'].isin(not_sim_sample),:].sort_values(by=['target_name'])

In [38]:
neg_sample = df_optimal.loc[df_optimal['target_activity'] == 0, 'target_name'].drop_duplicates().sample(16).values

In [39]:
neg_sel = df_optimal.loc[df_optimal['target_name'].isin(neg_sample),:].sort_values(by=['target_name'])

In [40]:
pos_sel = df_optimal.loc[df_optimal['activity'] == 1, :].drop_duplicates().sample(4)

In [41]:
sel_dict = {'Selected substance':[],'Selected structure':[],'Compared substance':[],'Compared substance structure':[], 'Similarity':[]}
for i, row in not_sim_selection.iterrows():
    sub_sel = row['target_name']
    struc_sel = row['target_structure']
    similarity = row['similarity']
    sub_sim = row['name']
    sub_struc = row['name_structure']
    if sub_sel not in sel_dict['Selected substance']:
        sel_dict['Selected substance'].append(sub_sel)
        sel_dict['Selected structure'].append(struc_sel)
        sel_dict['Compared substance'].append(sub_sim)
        sel_dict['Compared substance structure'].append(sub_struc)
        sel_dict['Similarity'].append(similarity)
    else:
        continue
sel_df = pd.DataFrame(sel_dict)

In [42]:
#sel_df.to_excel('ED_not_similar_selection.xlsx')

In [43]:
sel_df.iloc[15].values

array(['n-Octyl-polyoxyethylene', 'CCCCCCCCOCCO', 'Endosulfan',
       'O=S1OCC2C(CO1)C1(Cl)C(Cl)=C(Cl)C2(Cl)C1(Cl)Cl', 0.359375],
      dtype=object)

In [39]:
pos_sel.to_excel('ED_positive_selection.xlsx')

In [40]:
sel_df.to_excel('ED_negative_selection.xlsx')

In [41]:
selected_names_pos = df_optimal.loc[df_optimal['activity'] == 1, 'name'].drop_duplicates().values

In [44]:
noinfo_similar_set = ed[ed['name'].isin(df_optimal.loc[df_optimal['target_activity'] == 0, 'target_name'].drop_duplicates().values)]

In [45]:
noinfo_similar_set.drop(columns=['canon_smiles','mols_rdkit','fps'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [47]:
noinfo_similar_set.drop_duplicates()

Unnamed: 0,name,mol_formula_curated,structure_curated,endocrine_disruptor,activity
24,Dimethyl phthalate (DMP),C10H10O4,COC(=O)c1ccccc1C(=O)OC,No information,0
35,Undecafluorohexanoic acid,C6HF11O2,O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,No information,0
36,"1,1,2,2,3,3,4,4,5,5,6,6,6-tridecafluorohexane-...",C6HF13O3S,O=S(=O)(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C...,No information,0
42,Nonafluorovaleric Acid,C5HF9O2,O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F,No information,0
43,Tridecafluoroheptanoic Acid,C7HF13O2,N.O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F...,No information,0
50,Henicosafluorodecanesulphonic acid,C10HF21O3S,O=S(=O)(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C...,No information,0
61,"2,2,3,3,4,4,5,5,6,6,7,7-Dodecafluoroheptanoic ...",C7H2F12O2,O=C(O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)F,No information,0
65,"2,3,4,6-Tetrachlorophenol",C6H2Cl4O,Oc1c(Cl)cc(Cl)c(Cl)c1Cl,No information,0
66,"2,3,5,6-Tetrachlorophenol (2,3,5,6-TeCP)",C6H2Cl4O,Oc1c(Cl)c(Cl)cc(Cl)c1Cl,No information,0
67,"2,3,4,5-Tetrachlorophenol (2,3,4,5-TeCP)",C6H2Cl4O,Oc1cc(Cl)c(Cl)c(Cl)c1Cl,No information,0


In [44]:
# PandasTools.AddMoleculeColumnToFrame(noinfo_similar_set,'structure_curated')
# no_mol = noinfo_similar_set[noinfo_similar_set['ROMol'].isna()]
# noinfo_similar_set.drop(no_mol.index, axis=0, inplace=True)
# noinfo_similar_set['ROMol'] = [Chem.AddHs(x) for x in noinfo_similar_set['ROMol'].values.tolist()]

In [45]:
# df_bad = comp_df.loc[(comp_df['similarity'] < 0.5) &
#                          (comp_df['similarity'] > 0.3) &
#                          (comp_df['activity'] == 1) &
#                          (comp_df['target_activity'] == 0)]

In [46]:
# noinfo_bad = ed[ed['name'].isin(df_optimal.loc[df_optimal['target_activity'] == 0, 'target_name'].drop_duplicates().values)].sample(80)

In [47]:
# noinfo_bad.drop(columns=['canon_smiles','mols_rdkit','fps'],inplace=True)

In [48]:
# PandasTools.AddMoleculeColumnToFrame(noinfo_bad,'structure_curated')
# no_mol = noinfo_bad[noinfo_bad['ROMol'].isna()]
# noinfo_bad.drop(no_mol.index, axis=0, inplace=True)
# noinfo_bad['ROMol'] = [Chem.AddHs(x) for x in noinfo_bad['ROMol'].values.tolist()]

In [49]:
# pos_pred = ed[ed['name'].isin(selected_names_pos)].drop_duplicates()

In [50]:
# pos_pred.drop(columns=['canon_smiles','mols_rdkit','fps'],inplace=True)

In [51]:
# PandasTools.AddMoleculeColumnToFrame(pos_pred,'structure_curated')
# no_mol = pos_pred[pos_pred['ROMol'].isna()]
# pos_pred.drop(no_mol.index, axis=0, inplace=True)
# pos_pred['ROMol'] = [Chem.AddHs(x) for x in pos_pred['ROMol'].values.tolist()]

In [52]:
# predict_sdf = pd.concat([noinfo_similar_set,noinfo_bad,pos_pred])

In [53]:
# PandasTools.WriteSDF(predict_sdf, 'ed_validation_RAX.sdf', molColName='ROMol', properties=list(predict_sdf.columns), idName='name')