#### Reprocessing raw output files from the OECD Toolbox and Biotransformer in light of reviewer comments

In [1]:
import numpy as np
import os as os
import pandas as pd
import seaborn as sns
import sygma

from rdkit import DataStructs
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.Draw import MolsToGridImage

from IPython.core.display import HTML


In [2]:
TOP = os.getcwd().replace('notebooks', '')

In [3]:
raw_dat_dir = TOP + 'data/raw/'
processed_dat_dir = TOP + 'data/processed/'
interim_dat_dir = TOP + 'data/interim/'
figures_dir = TOP + 'reports/figures/'
external_dir = TOP + 'data/external/'


In [4]:
import sys

sys.path.append(TOP + 'src/data/')

from model_comp import *

In [5]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [7]:
DSSToxList = pd.read_csv(raw_dat_dir+"CompToxList.csv", header = 0)
DSSToxList = DSSToxList.rename(columns={'INCHIKEY':'Parent_INCHIKEY'})
DSSToxList['QSAR_READY_INCHIKEY'] = SMILES_to_InchiKey(DSSToxList['QSAR_READY_SMILES'],stereoisomer = False)
#DSSToxList.to_csv(raw_dat_dir + 'Processed_CompToxList_v1.csv')
Norm_DTXSID_dict = dict(zip(DSSToxList['Parent_INCHIKEY'],DSSToxList['DTXSID']))
QSAR_DTXSID_dict = dict(zip(DSSToxList['QSAR_READY_INCHIKEY'],DSSToxList['DTXSID']))

#### ToolBox in vitro predictions

In [9]:
pd.read_excel(raw_dat_dir+'TB_out_ivt.xlsx', sheet_name = None).keys()

dict_keys(['TB_out_ivt'])

In [10]:
TB_ivt = pd.read_excel(raw_dat_dir+'TB_out_ivt.xlsx', sheet_name = 'TB_out_ivt')

In [12]:
import re
p = re.compile(r'DTXSID\d{1,}')


In [13]:
dtxsid = [m.group(0) for l in TB_ivt['Chemical name(s)'] for m in [p.search(l)] if m]

In [18]:
len(dtxsid)

231

In [19]:
TB_ivt['DTXSID'] = dtxsid

In [21]:
TB_ivt[['DTXSID', ]]

Index(['#', 'CAS Number', 'Chemical name(s)', 'SMILES', 'Simulator name',
       'Metabolite', 'DTXSID'],
      dtype='object')

In [23]:
TB_ivt = TB_ivt[['DTXSID', 'SMILES', 'Metabolite']]

In [26]:
def TBivt_cleanup(df, DTXSIDdict):
    """Cleans and returns  a dataframe for the exported OCED Toolbox metabolite data.""" 
    """"Input file requires that SMILES by exported as part of the .csv file. The DTXSDIdict argument should be a dictionary with the QSAR Ready InChI keys as the key and the DSTXID as teh value."""
    """If issues occur reading the file, try coding = 'UTF-16' and delimiter = '\t' """                                                                          #Removes empty bottom row
    df = df[df.Metabolite.notnull()]                                                        #Establishes boolean list indicating indecies with metabolite
    df['Metabolite_INCHIKEY'] = SMILES_to_InchiKey(df['Metabolite'])                        #Converts metabolite SMILES to InChI keys
    df['Parent_INCHIKEY'] = SMILES_to_InchiKey(df['SMILES'],stereoisomer=False)             #Converts parent SMILES to QSAR Ready InChI keys (removes stereoisomer features during conversion)
    df['DTXSID_'] = [DTXSIDdict.get(e) for e in df['Parent_INCHIKEY']]                       #Uses dictionary of parent molecules to extract 
    df['TB_ivt'] = 1                                                                       #Generate column indicating the model source of the metabolite
    df = df.drop_duplicates()    
    df['Clean_SMILES'] = clean_SMILES(df['Metabolite'])                                    
    return df[['DTXSID', 'DTXSID_','Metabolite_INCHIKEY','Clean_SMILES','TB_ivt']];

In [28]:
ToolBox_ivt = TBivt_cleanup(TB_ivt, QSAR_DTXSID_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Metabolite_INCHIKEY'] = SMILES_to_InchiKey(df['Metabolite'])                        #Converts metabolite SMILES to InChI keys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Parent_INCHIKEY'] = SMILES_to_InchiKey(df['SMILES'],stereoisomer=False)             #Converts parent SMILES to QSAR Ready InChI keys (removes stereoisomer features during conversion)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docume

In [41]:
ToolBox_ivt.drop('DTXSID', axis = 1, inplace = True)

In [42]:
ToolBox_ivt

Unnamed: 0,DTXSID_,Metabolite_INCHIKEY,Clean_SMILES,TB_ivt
1,DTXSID0044151,RPHKINMPYFJSCF-UHFFFAOYSA-N,Nc1cc(N)cc(N)c1,1
2,DTXSID0044151,DFWXYHZQNLIBLY-UHFFFAOYSA-N,Nc1cc(N)cc([N+](=O)[O-])c1,1
3,DTXSID0044151,VUCKZBYGRGKCCQ-UHFFFAOYSA-N,Nc1cc(NO)cc([N+](=O)[O-])c1,1
4,DTXSID0044151,WSIDAANKKVDFNY-UHFFFAOYSA-N,O=[N+]([O-])c1cc(NO)cc([N+](=O)[O-])c1,1
6,DTXSID6025725,VYBFQVODGUQVGO-UHFFFAOYSA-N,Nc1cc(O)cc([N+](=O)[O-])c1,1
...,...,...,...,...
224,DTXSID5021889,NZBRXFKHZBOFBW-UHFFFAOYSA-N,CC(=O)CC(C)CO,1
225,DTXSID5021889,SVTBMSDMJJWYQN-UHFFFAOYSA-N,CC(O)CC(C)(C)O,1
226,DTXSID5021889,PNJNLCNHYSWUPT-UHFFFAOYSA-N,CC(O)CC(C)CO,1
228,DTXSID7020762,CSCPPACGZOOCGX-UHFFFAOYSA-N,CC(C)=O,1


In [61]:
ToolBox_ivt.to_csv(processed_dat_dir+'Toolbox_ivt.csv')

#### Toolbox in vivo predictions

In [45]:
TB_iv = pd.read_excel(raw_dat_dir+'TB_out_iv.xlsx', sheet_name = 'TB_out_iv')

In [46]:
TB_iv

Unnamed: 0,#,Chemical name(s),SMILES,Simulator name,Metabolite
0,1,"3,5-Dinitro-phenylamine;3,5-Dinitroaniline;3,5...",Nc1cc(cc(c1)[N+]([O-])=O)[N+]([O-])=O,,
1,1,"3,5-Dinitro-phenylamine;3,5-Dinitroaniline;3,5...",Nc1cc(cc(c1)[N+]([O-])=O)[N+]([O-])=O,in vivo Rat metabolism simulator,Nc1cc(N)cc(c1)[N+]([O-])=O
2,2,3-Nitro-benzeneamine;3-Nitro-phenylamine;3-Nit...,Nc1cccc(c1)[N+]([O-])=O,,
3,2,3-Nitro-benzeneamine;3-Nitro-phenylamine;3-Nit...,Nc1cccc(c1)[N+]([O-])=O,in vivo Rat metabolism simulator,NC1=CC(=N)C=CC1=O
4,2,3-Nitro-benzeneamine;3-Nitro-phenylamine;3-Nit...,Nc1cccc(c1)[N+]([O-])=O,in vivo Rat metabolism simulator,Nc1ccc(O)c(N)c1
...,...,...,...,...,...
268,37,2-Propanone;Acetone;acetone 67-64-1 58.1 232 1...,CC(C)=O,in vivo Rat metabolism simulator,CC(O)C(O)=O
269,37,2-Propanone;Acetone;acetone 67-64-1 58.1 232 1...,CC(C)=O,in vivo Rat metabolism simulator,CC(O)C=O
270,37,2-Propanone;Acetone;acetone 67-64-1 58.1 232 1...,CC(C)=O,in vivo Rat metabolism simulator,CC(O)CO
271,37,2-Propanone;Acetone;acetone 67-64-1 58.1 232 1...,CC(C)=O,in vivo Rat metabolism simulator,CC=O


In [47]:
dtxsids = [m.group(0) for l in TB_iv['Chemical name(s)'] for m in [p.search(l)] if m]


In [49]:
TB_iv['DTXSID'] = dtxsids

In [52]:
TB_iv = TB_iv[['DTXSID', 'Metabolite','SMILES']]

In [53]:
def TBiv_cleanup(df, DTXSIDdict):
    """Cleans and returns  a dataframe for the exported OCED Toolbox metabolite data.""" 
    """"Input file requires that SMILES by exported as part of the .csv file. The DTXSDIdict argument should be a dictionary with the QSAR Ready InChI keys as the key and the DSTXID as teh value."""
    """If issues occur reading the file, try coding = 'UTF-16' and delimiter = '\t' """                                                                          #Removes empty bottom row
    df = df[df.Metabolite.notnull()]                                                        #Establishes boolean list indicating indecies with metabolite
    df['Metabolite_INCHIKEY'] = SMILES_to_InchiKey(df['Metabolite'])                        #Converts metabolite SMILES to InChI keys
    df['Parent_INCHIKEY'] = SMILES_to_InchiKey(df['SMILES'],stereoisomer=False)             #Converts parent SMILES to QSAR Ready InChI keys (removes stereoisomer features during conversion)
    df['DTXSID_'] = [DTXSIDdict.get(e) for e in df['Parent_INCHIKEY']]                       #Uses dictionary of parent molecules to extract 
    df['TB_iv'] = 1                                                                       #Generate column indicating the model source of the metabolite
    df = df.drop_duplicates()    
    df['Clean_SMILES'] = clean_SMILES(df['Metabolite'])                                    
    return df[['DTXSID', 'DTXSID_','Metabolite_INCHIKEY','Clean_SMILES','TB_iv']];

In [54]:
ToolBox_iv = TBiv_cleanup(TB_iv, QSAR_DTXSID_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Metabolite_INCHIKEY'] = SMILES_to_InchiKey(df['Metabolite'])                        #Converts metabolite SMILES to InChI keys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Parent_INCHIKEY'] = SMILES_to_InchiKey(df['SMILES'],stereoisomer=False)             #Converts parent SMILES to QSAR Ready InChI keys (removes stereoisomer features during conversion)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docume

In [56]:
assert ToolBox_iv['DTXSID'].sum() == ToolBox_iv['DTXSID_'].sum()

In [58]:
ToolBox_iv.drop(['DTXSID_'], axis = 1, inplace = True)

In [59]:
ToolBox_iv

Unnamed: 0,DTXSID,Metabolite_INCHIKEY,Clean_SMILES,TB_iv
1,DTXSID0044151,DFWXYHZQNLIBLY-UHFFFAOYSA-N,Nc1cc(N)cc([N+](=O)[O-])c1,1
3,DTXSID6025725,BFMDYVWVKQUBGP-UHFFFAOYSA-N,N=C1C=CC(=O)C(N)=C1,1
4,DTXSID6025725,XIWMTQIUUWJNRP-UHFFFAOYSA-N,Nc1ccc(O)c(N)c1,1
5,DTXSID6025725,WZCQRUWWHSTZEM-UHFFFAOYSA-N,Nc1cccc(N)c1,1
7,DTXSID8031077,WSFSSNUMVMOOMR-UHFFFAOYSA-N,C=O,1
...,...,...,...,...
268,DTXSID8021482,JVTAAEKCZFNVCJ-UHFFFAOYSA-N,CC(O)C(=O)O,1
269,DTXSID8021482,BSABBBMNWQWLLU-UHFFFAOYSA-N,CC(O)C=O,1
270,DTXSID8021482,DNIAPMSPPWPWGF-UHFFFAOYSA-N,CC(O)CO,1
271,DTXSID8021482,IKHGUXGNUITLKF-UHFFFAOYSA-N,CC=O,1


In [62]:
ToolBox_iv.to_csv(processed_dat_dir+'Toolbox_iv.csv')

#### BioTransformer v3

In [65]:
BioTransformer_rev = pd.read_csv(raw_dat_dir+'metsim_0312021.csv')

In [66]:
bioTransformerDF = BioTransformer_cleanup(raw_dat_dir+'metsim_0312021.csv', Norm_DTXSID_dict)

In [70]:
bioTransformerDF.to_csv(processed_dat_dir+'BioTransformer3.csv')