#### Using the EPA Categories developed by G Helman which augment the implementation in the OECD Toolbox Version 4.5

<h1>Extract SMARTS</h1>

In [1]:
import pandas as pd
import numpy as np
import pymongo
import sys
import os
from datetime import datetime

from rdkit import Chem

import pymongo


In [2]:
TOP = os.getcwd().replace('notebooks', '')
raw_dir = TOP + 'data/raw/'
interim_dir = TOP + 'data/interim/'
external_dir = TOP + 'data/external/'
figures_dir = TOP + 'reports/figures/'

In [3]:
with open(raw_dir + 'epa_categories.xml','r') as f:
    xml=f.read()

In [4]:
xml=xml.replace('\n','')

In [5]:
#xml

In [6]:
import xml.etree.ElementTree as ET
e=ET.parse(raw_dir+'epa_categories.xml').getroot()

In [7]:
e

<Element '{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Scheme' at 0x74514150e810>

In [8]:
parent_map = {c:p for p in e.iter() for c in p}

In [9]:
len(parent_map)

4545

In [10]:
import operator as op
op_dict={
    'GreaterThan': op.gt,
    'GreaterThanOrEqualTo': op.ge,
    'LessThan': op.lt,
    'LessThanOrEqualTo': op.le
}
prop_dict={
    'log Kow':'logp',
    'Molecular Weight':'mol_weight',
    'Molecular weight':'mol_weight',
    'Water Solubility': 'ws'
}

In [11]:
def define_smart_match(smart):
    pattern=Chem.MolFromSmarts(smart)
    if not pattern:
        return None
    def smart_match(x):
        mol=x['mol']
        ret=True if mol.GetSubstructMatches(pattern) else False
        return ret
    return smart_match
def define_compare(prop,operand,value):
    def compare(x):
        ret = op_dict[operand](x[prop_dict[prop]],value)
        return ret
    return compare

In [12]:
class Query:
    
    def __init__(self,xml,qid=None):
        self.xml=xml
        self.id=qid
        self.logic=None
        self.subqueries=[]
        self.category=None

        
    def write_query(self,qtype,tree):
        self.type=qtype
        if qtype=='b:StructureQuery':
            qstring=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}ComplexSearch').text
            qstring=re.sub('false','False',qstring)
            qstring=re.sub('true','True',qstring)
            qdict=ast.literal_eval(qstring)
            smart=qdict['queries'][0]['smart']
            self.smart=smart
            if '[Ch3,#1]' in self.smart:
                split=re.search(r'(.*)\[([^\(\)]*),([^\(\)].*)\]$',self.smart)
                split1=split.group(1)
                split2=split.group(1)+'['+split.group(2)+']'
                smart_match1=define_smart_match(split1)
                smart_match2=define_smart_match(split2)
                def smart_match(x):
                    return any([smart_match1(x),smart_match2(x)])
            else:
                smart_match=define_smart_match(smart)
            self.query=smart_match
        elif qtype=='b:ParameterQuery':
            self.operand=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}Operand').text
            self.prop=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}ParameterName').text
            self.value=float(self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}Value').text)
            compare=define_compare(self.prop,self.operand,self.value)
            self.query=compare
        elif qtype=='LogicalQuery':
            self.logic=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Logic').text
            elements=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Elements')
            node_ids=[elem.attrib['{http://schemas.microsoft.com/2003/10/Serialization/}Ref']\
                      for elem in elements.findall('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Query')\
                      if '{http://schemas.microsoft.com/2003/10/Serialization/}Ref' in elem.attrib]
            if self.logic=='Not':
                node_id=node_ids[0] #Should only be one
                sq=tree[node_id]
                self.subqueries=[sq]
                def func(x):
                    return not(sq.query(x))
                self.query=func
            elif self.logic=='And':
                sqs=[tree[node_id] for node_id in node_ids]
                self.subqueries=sqs
                def func(x):
                    return all([sq.query(x) for sq in self.subqueries])
                self.query=func
            else:
                sqs=[tree[node_id] for node_id in node_ids]
                self.subqueries=sqs
                for orquery in elements.findall('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Query'):
                    if '{http://www.w3.org/2001/XMLSchema-instance}type' in orquery.attrib:
                        extra_sq=Query(orquery)
                        extra_sq.write_query('b:StructureQuery',tree)
                        sqs.append(extra_sq)      
                def func(x):
                    return any([sq.query(x) for sq in self.subqueries])
                self.query=func
    
    def print_tree(self,x,tabs=0):
        
        smart_patterns = []
        logic_parameters = []
        
        qinfo=(self.id,self.type)
        if self.type=='b:StructureQuery':
            qinfo=qinfo+(self.smart,)
            smart_patterns.append(self.smart) 
        elif self.type=='b:ParameterQuery':
            qinfo=qinfo+(self.prop,self.value,self.operand)
            logic_parameters.append((self.prop, self.value, self.operand))
        elif self.type=='LogicalQuery':
            qinfo=qinfo+(self.logic,)
            logic_parameters.append(self.logic) 
        try:
            qinfo=qinfo+(self.query(x),)
        except:
            qinfo=qinfo+('does not process',)
        print('\t'*tabs+str(qinfo))
        for sq in self.subqueries:
                sub_smart_patterns, sub_logic_parameters = sq.print_tree(x,tabs+1)
                smart_patterns.extend(sub_smart_patterns)
                logic_parameters.extend(sub_logic_parameters)
        return smart_patterns, logic_parameters

In [13]:
q = Query(xml)

In [14]:
all_tests={}
print_tests = {}
bad_smarts=set()
bad_cats=set()
import re
import ast
for elem in e.iter('{http://schemas.microsoft.com/2003/10/Serialization/Arrays}anyType'):
    category=elem.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Caption').text
    queries=elem.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Expression')\
        .find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Queries')\
        .findall('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Query')
    contents=[query.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Content') for query in queries]
    #print(query)
    query_tree={}
    for query in contents:
        attributes=query.attrib
        if '{http://schemas.microsoft.com/2003/10/Serialization/}Id' not in attributes:
            continue
        query_id=attributes['{http://schemas.microsoft.com/2003/10/Serialization/}Id']
        #print(query_id)
        query_type=attributes['{http://www.w3.org/2001/XMLSchema-instance}type']
        #print(query_type)
        q=Query(query,query_id)
        q.category=category
        q.write_query(query_type,query_tree)
        q.print_tree(q.query)
        if not q.query or not all([sq.query for sq in q.subqueries]): #Smarts did not compile, sqs needed bc of hidden sqs in or queries
            bad_cats.add(category)
            if q.type=='b:StructureQuery':
                bad_smarts.add(q.smart)
        query_tree[query_id]=q
    all_tests[category]=query_tree[query_id]
    print_tests[category] = [query_id, query_type]#Final one should always be the top level query hopefully

('24', 'b:StructureQuery', '[#6,#16](=[#8])(Cl)[#6X4,c]', 'does not process')
('26', 'b:ParameterQuery', 'log Kow', 8.0, 'LessThan', 'does not process')
('29', 'b:ParameterQuery', 'Molecular weight', 1000.0, 'LessThan', 'does not process')
('32', 'LogicalQuery', 'And', 'does not process')
	('29', 'b:ParameterQuery', 'Molecular weight', 1000.0, 'LessThan', 'does not process')
	('26', 'b:ParameterQuery', 'log Kow', 8.0, 'LessThan', 'does not process')
	('24', 'b:StructureQuery', '[#6,#16](=[#8])(Cl)[#6X4,c]', 'does not process')
('43', 'b:StructureQuery', '[#6](=[#8])([#7h])[#6](=[#6h2])[Ch3,#1]', 'does not process')
('45', 'b:ParameterQuery', 'log Kow', 8.0, 'LessThan', 'does not process')
('48', 'b:ParameterQuery', 'Molecular weight', 1000.0, 'LessThan', 'does not process')
('51', 'LogicalQuery', 'And', 'does not process')
	('48', 'b:ParameterQuery', 'Molecular weight', 1000.0, 'LessThan', 'does not process')
	('45', 'b:ParameterQuery', 'log Kow', 8.0, 'LessThan', 'does not process')
	

RDKit ERROR: [18:15:44] SMARTS Parse Error: syntax error while parsing: [#7](=[#8])$[[#1],[#6X4]]{1..;xm}
RDKit ERROR: [18:15:44] SMARTS Parse Error: Failed parsing SMARTS '[#7](=[#8])$[[#1],[#6X4]]{1..;xm}' for input: '[#7](=[#8])$[[#1],[#6X4]]{1..;xm}'
RDKit ERROR: [18:15:44] SMARTS Parse Error: syntax error while parsing: $[[#1],[#6X4]]{1..;xm}[#7+][#8-]
RDKit ERROR: [18:15:44] SMARTS Parse Error: Failed parsing SMARTS '$[[#1],[#6X4]]{1..;xm}[#7+][#8-]' for input: '$[[#1],[#6X4]]{1..;xm}[#7+][#8-]'
RDKit ERROR: [18:15:44] SMARTS Parse Error: syntax error while parsing: c1ccccc1.$[[#1],$[[#6X4]]{1..}]{1..;x}
RDKit ERROR: [18:15:44] SMARTS Parse Error: Failed parsing SMARTS 'c1ccccc1.$[[#1],$[[#6X4]]{1..}]{1..;x}' for input: 'c1ccccc1.$[[#1],$[[#6X4]]{1..}]{1..;x}'
RDKit ERROR: [18:15:44] SMARTS Parse Error: syntax error while parsing: $[$[[#6X4]]{1..}]{1..;x}[Cl,Br,#8h]
RDKit ERROR: [18:15:44] SMARTS Parse Error: Failed parsing SMARTS '$[$[[#6X4]]{1..}]{1..;x}[Cl,Br,#8h]' for input: 

In [15]:
#print_tests

In [16]:
all_tests['Acid Chlorides']

<__main__.Query at 0x745141389880>

In [17]:
tests=all_tests.copy()
for category in bad_cats:
    del tests[category]

In [18]:
tests['Acid Chlorides']

<__main__.Query at 0x745141389880>

In [19]:
tests['Acid Chlorides']

<__main__.Query at 0x745141389880>

In [20]:
len(tests)

47

In [21]:
import json

# Create a dictionary to hold the JSON data
json_data = {}

# Iterate through each category in the tests dictionary
for category, query_instance in tests.items():
    # Call the updated print_tree method to get SMARTS and logic parameters
    smart_patterns, logic_parameters = query_instance.print_tree(query_instance.query)

    # Store the results in the json_data dictionary
    json_data[category] = {
        "SMARTS Patterns": smart_patterns,
        "Logic Parameters": logic_parameters
    }

# Convert the dictionary to a JSON string
json_output = json.dumps(json_data, indent=4)

# Print or save the JSON output
print(json_output)

# Optionally, save it to a file
with open(external_dir+'output_tests47.json', 'w') as json_file:
    json_file.write(json_output)

('32', 'LogicalQuery', 'And', 'does not process')
	('29', 'b:ParameterQuery', 'Molecular weight', 1000.0, 'LessThan', 'does not process')
	('26', 'b:ParameterQuery', 'log Kow', 8.0, 'LessThan', 'does not process')
	('24', 'b:StructureQuery', '[#6,#16](=[#8])(Cl)[#6X4,c]', 'does not process')
('51', 'LogicalQuery', 'And', 'does not process')
	('48', 'b:ParameterQuery', 'Molecular weight', 1000.0, 'LessThan', 'does not process')
	('45', 'b:ParameterQuery', 'log Kow', 8.0, 'LessThan', 'does not process')
	('43', 'b:StructureQuery', '[#6](=[#8])([#7h])[#6](=[#6h2])[Ch3,#1]', 'does not process')
('70', 'LogicalQuery', 'And', 'does not process')
	('67', 'b:ParameterQuery', 'Molecular weight', 1000.0, 'LessThan', 'does not process')
	('64', 'b:ParameterQuery', 'log Kow', 5.0, 'LessThan', 'does not process')
	('62', 'b:StructureQuery', '[#6h2]=[#6]([#6](=[#8])[#8])[Ch3,#1]', 'does not process')
('89', 'LogicalQuery', 'And', 'does not process')
	('86', 'b:ParameterQuery', 'Molecular weight', 10

<h1>Fix bad SMARTS</h1>

In [22]:
len(bad_cats)

19

In [65]:
class ChemicalTest:
    def __init__(self, name, smart_patterns, conditions, test_func):
        self.name = name
        self.smart_patterns = smart_patterns  # List of SMARTS patterns
        self.conditions = conditions  # Description of the logic
        self.test_func = test_func  # The actual test function

    def create_test_function(self):
        # Define the test function based on SMARTS patterns and conditions
        def test(x):
            mol = x['mol']
            mw = x['mol_weight']
            smiles = x['smiles']
            
            
            # Add any other required parameters from `x`
            # Evaluate conditions based on the SMARTS patterns
            match_found = any(mol.HasSubstructMatch(sp) for sp in self.smart_patterns)
            # Example condition evaluation (adjust as needed)
            return match_found and eval(self.conditions)
        return test

    def add_to_new_tests(self, new_tests):
        # Create the test function and add it to the new_tests dictionary
        self.test_func = self.create_test_function()
        new_tests[self.name] = self
    
    def get_details(self):
        """Returns the SMARTS patterns, conditions, and test function for this chemical test."""
        return {
            'name': self.name,
            'smart_patterns': [sp for sp in self.smart_patterns],
            'conditions': self.conditions,
            'test_func': self.test_func
        }




In [66]:
new_tests = {}

In [67]:
#Epoxides

epoxide=Chem.MolFromSmarts('c1oc1')
aziridine=Chem.MolFromSmarts('c1cn1([CH3,$(CH2CH3)])')
epoxides_test = ChemicalTest(
    name='Epoxides',
    smart_patterns=[epoxide, aziridine],
    conditions='mw < 1000',
    test_func=None
)

RDKit ERROR: [18:21:20] SMARTS Parse Error: syntax error while parsing: c1cn1([CH3,$(CH2CH3)_100])
RDKit ERROR: [18:21:20] SMARTS Parse Error: Failed parsing SMARTS 'c1cn1([CH3,$(CH2CH3)_100])' for input: 'c1cn1([CH3,$(CH2CH3)])'


In [68]:
epoxides_test.add_to_new_tests(new_tests)

In [69]:
#Aliphatic amines
primamine = Chem.MolFromSmarts('[NX3;H2;!$(NC=[O,N,S]);!$(NCN)][CX3]')
secamine = Chem.MolFromSmarts('[NX3;H1;!$(NC=[O,N,S]);!$(NCN)](C)[CX3]')
tertamine = Chem.MolFromSmarts('[N;!$(NC=[O,N,S]);!$(NCN)](C)(C)[CX3]')

# Define the test for Aliphatic Amines
conditions = "'c' not in smiles and mw < 1000 and '1' not in smiles"

# Create an instance of ChemicalTest for Aliphatic Amines
aliphatic_amines_test = ChemicalTest(
    name='Aliphatic Amines',
    smart_patterns=[primamine, secamine, tertamine],
    conditions=conditions,
    test_func=None  # Placeholder for the test function
)

# Add the test to new_tests
aliphatic_amines_test.add_to_new_tests(new_tests)

In [70]:
#Alkoxysilanes
alkoxy=Chem.MolFromSmarts('[CX4]O[SiX4]')

alkoxysilanes_test = ChemicalTest(
    name='Alkoxysilanes',
    smart_patterns=[alkoxy],
    conditions='mw < 1000',
    test_func=None
)

alkoxysilanes_test.add_to_new_tests(new_tests)

In [71]:
#Aminobenzothiazole Azo Dyes
azodye=Chem.MolFromSmiles('N=NC1=NC2=C(S1)C=CC=C2')
aminobenzothiazole_azodye_test = ChemicalTest(
    name='Aminobenzothiazole Azo Dyes',
    smart_patterns=[azodye],
    conditions=None,
    test_func=None
)
aminobenzothiazole_azodye_test.add_to_new_tests(new_tests)

In [72]:
#Benzotriazoles
benzotriazole=Chem.MolFromSmarts('n1c2ccccc2nn1')
benzotriazole2=Chem.MolFromSmarts('N1N=NC2=C1C=CC=C2')
benzotriazole_test = ChemicalTest(
    name='Benzotriazoles',
    smart_patterns=[benzotriazole, benzotriazole2],
    conditions=None,
    test_func=None
)
benzotriazole_test.add_to_new_tests(new_tests)

In [73]:
#Dianilines
dianiline = Chem.MolFromSmarts('c1cc([NH2])ccc1[CH2,O,N,S]c1ccccc1')
not_dianiline1 = Chem.MolFromSmarts('c1ccccc1[A]~[A]')
not_dianiline2 = Chem.MolFromSmarts('c1ccccc1[A](c)c')

# Conditions for the Dianilines test
conditions = "len(mol.GetSubstructMatches(dianiline)) == 2"

# Create an instance of ChemicalTest for Dianilines
dianilines_test = ChemicalTest(
    name='Dianilines',
    smart_patterns=[dianiline, not_dianiline1, not_dianiline2],
    conditions=f"not mol.HasSubstructMatch(not_dianiline1) and not mol.HasSubstructMatch(not_dianiline2) and {conditions}",
    test_func=None
)

# Add the test to new_tests
dianilines_test.add_to_new_tests(new_tests)


In [74]:
#Organotins (Acute toxicity)
organotin=Chem.MolFromSmarts('C[Sn]') 

organotins_acute_test = ChemicalTest(
    name='Organotins (Acute toxicity)',
    smart_patterns=[organotin],
    conditions='(mw <1000 and  logp <=13.7)',
    test_func=None
)
organotins_acute_test.add_to_new_tests(new_tests)

In [75]:
#Organotins (Chronic toxicity)
organotin=Chem.MolFromSmarts('C[Sn]') 

organotins_chronic_test = ChemicalTest(
    name='Organotins (Acute toxicity)',
    smart_patterns=[organotin],
    conditions='(mw <1000 and  logp >=13.7)',
    test_func=None
)
organotins_chronic_test.add_to_new_tests(new_tests)

In [26]:
#Anionic Surfactants
def create_test():
    sulfate=Chem.MolFromSmarts('COS(=O)(=O)[OH,O-]')
    sulfonate=Chem.MolFromSmarts('CS(=O)(=O)[OH,O-]')
    phosphate=Chem.MolFromSmarts('COP([OH1])([OH1])=O')
    carboxylic=Chem.MolFromSmarts('[CX3;!$(Cc)](=O)[OX2H1]')
    silicic=Chem.MolFromSmarts('[Si][OX2H]')
    def test(x):
        mol=x['mol']
        smiles=x['smiles']
        if set(smiles)-set(['C','c','O','P','S','i','[',']','(',')','=']):
            return False
        m=re.compile('\(.?C.?\)')
        if m.findall(smiles):
            return False
        rgroup_indexes=[i for i,atom in enumerate(smiles) if atom=='C']
        return (mol.HasSubstructMatch(sulfate) or mol.HasSubstructMatch(sulfonate)\
        or mol.HasSubstructMatch(phosphate) or mol.HasSubstructMatch(carboxylic)\
        or mol.HasSubstructMatch(silicic))\
        and sorted(rgroup_indexes)==range(min(rgroup_indexes),max(rgroup_indexes)+1) #Tests for straight alkyl chains
    return test
new_tests['Anionic Surfactants']=create_test()

In [30]:
def create_test():
    ethylenebisdithiocarbamate=Chem.MolFromSmiles('SC(=S)NCCNC(=S)S')
    dithiocarbamates=[]
    for i in range(1,5):
        for j in range(1,5):
            mol=Chem.MolFromSmiles('C'*i + 'NC(=S)S' + 'C'*j)
            dithiocarbamates.append(mol)
    dithiocarbamates.append(ethylenebisdithiocarbamate)
    def test(x):
        mol=x['mol']
        return x['mol_weight']<1000 and x['logp']<5 and any([mol.HasSubstructMatch(dithiocarbamate) for dithiocarbamate in dithiocarbamates])
    return test
new_tests['Dithiocarbamates (Acute toxicity)']=create_test()


def create_test():
    ethylenebisdithiocarbamate=Chem.MolFromSmiles('SC(=S)NCCNC(=S)S')
    dithiocarbamates=[]
    for i in range(1,5):
        for j in range(1,5):
            mol=Chem.MolFromSmiles('C'*i + 'NC(=S)S' + 'C'*j)
            dithiocarbamates.append(mol)
    dithiocarbamates.append(ethylenebisdithiocarbamate)
    def test(x):
        mol=x['mol']
        return x['mol_weight']<1000 and x['logp']>=5 and x['logp']<19 and any([mol.HasSubstructMatch(dithiocarbamate) for dithiocarbamate in dithiocarbamates])
    return test
new_tests['Dithiocarbamates (Chronic toxicity)']=create_test()

In [31]:
#Ethylene Glycol Ethers
#Have to enumerate       
def create_test():
    match_mols=[]
    for i in range(1,8):
        for j in range(0,8):
            for k in range(1,4):
                smart='C'*i+'OCC'*k+'O'+'C'*j
                match_mols.append(Chem.MolFromSmiles(smart))
    phenyl_mols=[]
    for i in range(0,7):
        for k in range(1,3):
            for l in range(0,3): #Technically could be any number but this is difficult to implement
                phenyl_smart='c1ccccc1'+'C'*l+'OCC'*k+'O'+'C'*i
                phenyl_mols.append(Chem.MolFromSmiles(phenyl_smart))
    
    def test(x):
        smiles=x['smiles']
        if set(smiles)-set(['C','c','O','1','(',')']):
            return False
        if smiles.count('O')<2:
            return False
        os=[i for i,o in enumerate(smiles) if o=='O']
        between_os=[smiles[(start+1):end] for start,end in zip(os,os[1:])]
        if any([between!='CC' for between in between_os]):
            return False
        m=re.compile('1.*O.*1')
        if m.findall(smiles):
            return False
        carbon1=smiles[0:min(os)]
        carbon2=smiles[(max(os)+1):]
        if carbon1.count('C')>7 or carbon2.count('C')>7:
            return False
        if carbon1.count('c')>6 or carbon2.count('c')>6:
            return False
        if not carbon1 and not carbon2:
            return False
        else:
            return True
    return test
new_tests['Ethylene Glycol Ethers']=create_test()

In [32]:
#Neutral Organics
#Verhaar scheme, see paper called Classifying Environmental Pollutants
def create_test():
    def test(x):
        mol=x['mol']
        if mol.HasSubstructMatch(Chem.MolFromSmarts('[!C;!c;!N;!O;!F;!Cl;!Br,I]')): #Rule 0.1 and 1.1
            return False
        logp=x['logp']
        if logp>8:
            return False
        mw=x['mol_weight']
        if mw>1000:
            return False
        if not mol.HasSubstructMatch(Chem.MolFromSmarts('[!C;!c]')): #Rule 1.3
            return True
        elif not mol.HasSubstructMatch(Chem.MolFromSmarts('[!C;!c;!Cl;!Br;!F]'))\
        and not mol.HasSubstructMatch(Chem.MolFromSmarts('[Cl,Br,F]C[$(C=C),$(Cc)]')): #Rule 1.4
            return True
        elif not mol.HasSubstructMatch(Chem.MolFromSmarts('[!C;!c;!O;!Cl;!Br;!F]')): #Rule 1.5
            if mol.HasSubstructMatch(Chem.MolFromSmarts('COC'))\
            and not mol.HasSubstructMatch(Chem.MolFromSmarts('COOC'))\
            and not mol.HasSubstructMatch(Chem.MolFromSmarts('C1OC1')): #Rule 1.5.1 and 1.7
                return True
            elif mol.HasSubstructMatch(Chem.MolFromSmarts('[C;!$(C=O)][OH]'))\
            and not mol.HasSubstructMatch(Chem.MolFromSmarts('C=CCO'))\
            and not mol.HasSubstructMatch(Chem.MolFromSmarts('C#CCO'))\
            and not mol.HasSubstructMatch(Chem.MolFromSmarts('cCO')): #Rule 1.5.2, 1.5.3, and 1.7CCCCOCCOCCO
                return True
            elif mol.HasSubstructMatch(Chem.MolFromSmarts('[C;!$(CO)]=O'))\
            and not mol.HasSubstructMatch(Chem.MolFromSmarts('[$(cC),$(C=C)]C=O'))\
            and not mol.HasSubstructMatch(Chem.MolFromSmarts('[Cl,Br]C=O'))\
            and not mol.HasSubstructMatch(Chem.MolFromSmarts('[Cl,Br]CC=O')): #Rule 1.5.4 and 1.7
                return True
            else:
                return False
        elif not mol.HasSubstructMatch(Chem.MolFromSmarts('[!C;!N]'))\
        and mol.HasSubstructMatch(Chem.MolFromSmarts('C[NH,NH0]')): #Rule 1.6
            return True
        else: 
            return False
    return test
new_tests['Neutral Organics']=create_test()

In [33]:
#Nonionic Surfactants
# nonsurf1=Chem.MolFromSmarts('COCCO')
# nonsurf2=Chem.MolFromSmarts('COCCOC')
# def test(x):
#     mol=x['mol']
#     return mol.HasSubstructMatch(nonsurf1) or mol.HasSubstructMatch(nonsurf2)
import re
def test(x):
    smiles=x['smiles']
    if '(' in smiles:
        return False
    split_smiles=smiles.split('O')
    if len(split_smiles)==1:
        return False
    mol=x['mol']
    if not mol.HasSubstructMatch(Chem.MolFromSmiles('COC')) or mol.HasSubstructMatch(Chem.MolFromSmiles('C=O')):
        return False
    return not any([re.search(r'[^C]',c) for c in split_smiles])
new_tests['Nonionic Surfactants']=test

In [34]:
#Nonionic Surfactants
import math
def create_test():
    def test(x):
        mol=x['mol']
        atoms=[a for a in x['smiles'].lower() if a.isalpha()]
        if set(atoms)-set(['c','o']):
            return False
        return mol.HasSubstructMatch(Chem.MolFromSmarts('[CH3][CR0][CR0][CR0][CR0][CR0]')) and\
        atoms.count('o')>1 and\
        (math.floor(len(mol.GetSubstructMatches(Chem.MolFromSmarts('O[CH2][CH2]')))/2)+1)==len(mol.GetSubstructMatches(Chem.MolFromSmarts('[O]')))
    return test
new_tests['Nonionic Surfactants']=create_test()

In [76]:
#Polynitroaromatics (Acute toxicity) 

polynitroaromatic=Chem.MolFromSmarts('ON(=O)[$(c1c(N(O)=O)cccc1),$(c1cc(N(O)=O)ccc1),$(c1ccc(N(O)=O)cc1),$(c1cncc(N(O)=O)c1)]')
conditions = "mw < 1000 and logp <7"

polynitroaromatics_acute_test = ChemicalTest(
    name='Polynitroaromatics (Acute toxicity) ',
    smart_patterns=[polynitroaromatic],
    conditions=f"(mol.HasSubstructMatch(polynitroaromatic)) and {conditions}",    
    test_func=None
)

# Add the test to new_tests
polynitroaromatics_acute_test.add_to_new_tests(new_tests)

In [77]:
#Polynitroaromatics (Chronic toxicity)

polynitroaromatic=Chem.MolFromSmarts('ON(=O)[$(c1c(N(O)=O)cccc1),$(c1cc(N(O)=O)ccc1),$(c1ccc(N(O)=O)cc1),$(c1cncc(N(O)=O)c1)]')
conditions = "mw < 1000 and logp >=10"

polynitroaromatics_chronic_test = ChemicalTest(
    name='Polynitroaromatics (Chronic toxicity)',
    smart_patterns=[polynitroaromatic],
    conditions=f"(mol.HasSubstructMatch(polynitroaromatic)) and {conditions}",    
    test_func=None
)

# Add the test to new_tests
polynitroaromatics_chronic_test.add_to_new_tests(new_tests)

In [78]:
# Substituted Triazines (Acute toxicity) 
subtriazine=Chem.MolFromSmarts('[$(n1nnccc1.[!#1]),$(n1ncncc1.[!#1]),$(n1cncnc1.[!#1])]')#
conditions = "mw < 1000 and logp <5"

substituted_triazines_acute_test = ChemicalTest(
    name='Substituted Triazines (Acute toxicity)',
    smart_patterns=[subtriazine],
    conditions=f"(mol.HasSubstructMatch(subtriazine)) and {conditions}",    
    test_func=None
)

# Add the test to new_tests
substituted_triazines_acute_test.add_to_new_tests(new_tests)

In [79]:
# Substituted Triazines (Chronic toxicity)
subtriazine=Chem.MolFromSmarts('[$(n1nnccc1.[!#1]),$(n1ncncc1.[!#1]),$(n1cncnc1.[!#1])]')#
conditions = "mw < 1000 and logp <=8 and logp>5"

substituted_triazines_chronic_test = ChemicalTest(
    name='Substituted Triazines (Chronic toxicity)',
    smart_patterns=[subtriazine],
    conditions=f"(mol.HasSubstructMatch(subtriazine)) and {conditions}",    
    test_func=None
)

# Add the test to new_tests
substituted_triazines_chronic_test.add_to_new_tests(new_tests)

In [38]:
def convert_ppb(x): #OPERA results stored as mol/L
    ws=x['ws']
    mol_weight=x['mol_weight']
    return ws*mol_weight*10**6

In [40]:
#Triarylmethane Pigments/Dyes with Non-solubilizing Groups
def create_test():
    para_permutations='[NH2,O,$([NH1][CH3]),$([NH1][CH2][CH3]),$(N([CH3])[CH3]),$(N([CH3])[CH2][CH3]),$(N([CH2][CH3])[CH2][CH3])]'
    triphenylmethane=Chem.MolFromSmarts('[cH]1[cH]c({})[cH][cH]c1C(c2[cH][cH]c({})[cH][cH]2)=C3[CH]=[CH]C(=[NH,O])[CH]=[CH]3'.format(para_permutations,para_permutations))
    diphenylnaphthylmethane=Chem.MolFromSmarts('[cH]1[cH]c({})[cH][cH]c1C(c2[cH][cH]c({})[cH]3[cH][cH][cH][cH][cH]32)=C3[CH]=[CH]C(=[NH,O])[CH]=[CH]3'.format(para_permutations,para_permutations))
    def test(x):
        mol=x['mol']
        return convert_ppb(x)>1 and (mol.HasSubstructMatch(triphenylmethane) or (mol.HasSubstructMatch(diphenylnaphthylmethane)))
    return test
new_tests['Triarylmethane Pigments/Dyes with Non-solubilizing Groups']=create_test()

In [41]:
#beta-Naphthylamines, Sulfonated
def create_test():
    smarts=[]
    match_mols=[]
    prefix='[NH2]c1[cH1,$(cO)]'
    suffix='[cH][cH]1'
    for c1 in range(1,4):
        for c2 in range(c1+1,5):
            smarts.append(prefix+'c2'+'[cH]'*(c1-1)+'[cH,$(c[OH]),$(c[NH2])]'+'[cH]'*(c2-c1-1)+'c([$(S(=O)(=O)[OH]),$(S(=O)(=O)[CH2][CH2]S[OH3])])'+'[cH]'*(4-c2)+'c2'+suffix)
            smarts.append(prefix+'c2'+'[cH]'*(c1-1)+'c([$(S(=O)(=O)[OH]),$(S(=O)(=O)[CH2][CH2]S[OH3])])'+'[cH]'*(c2-c1-1)+'[cH1,$(c[OH]),$(c[NH2])]'+'[cH]'*(4-c2)+'c2'+suffix)
    match_mols=[Chem.MolFromSmarts(smart) for smart in smarts]
    def test(x):
        mol=x['mol']
        naph_matches=[True for match in match_mols[:] if mol.HasSubstructMatch(match) and match.HasSubstructMatch(mol)]
        return any(naph_matches)
    return test
new_tests['beta-Naphthylamines, Sulfonated']=create_test()

In [39]:
#Aldehydes (Acute)

formaldehyde=Chem.MolFromSmarts('[CH2](=O)') #Needs to be special case because buggy way RDKit handles hydrogens
aldehyde=Chem.MolFromSmarts('[CH1](=[O])[C,c]')

conditions = "mw < 1000 and logp <=6"

aldehydes_acute_test = ChemicalTest(
    name='Aldehydes (Acute)',
    smart_patterns=[formaldehyde, aldehyde],
    conditions=f"(mol.HasSubstructMatch(formaldehyde) or mol.HasSubstructMatch(aldehyde)) and {conditions}",    
    test_func=None
)

# Add the test to new_tests
aldehydes_acute_test.add_to_new_tests(new_tests)

In [80]:
#Aldehydes (Chronic)
formaldehyde=Chem.MolFromSmarts('[CH2](=O)') #Needs to be special case because buggy way RDKit handles hydrogens
aldehyde=Chem.MolFromSmarts('[CH1](=[O])[C,c]')
conditions = "mw < 1000 and logp >6"

aldehydes_chronic_test = ChemicalTest(
    name='Aldehydes (Chronic)',
    smart_patterns=[formaldehyde, aldehyde],
    conditions=f"(mol.HasSubstructMatch(formaldehyde) or mol.HasSubstructMatch(aldehyde)) and {conditions}",      
    test_func=None
)

# Add the test to new_tests
aldehydes_chronic_test.add_to_new_tests(new_tests)

In [81]:
#Imides (Acute toxicity)

imide=Chem.MolFromSmarts('C(=O)NC(=O)')
not_imide=Chem.MolFromSmarts('c1C(=O)NC(=O)ccccc1')
conditions = "mw < 1000 and logp <=5"

    
imides_acute_test = ChemicalTest(
    name='Imides (Acute toxicity)',
    smart_patterns=[imide, not_imide],
    conditions=f"(mol.HasSubstructMatch(imide) and not mol.HasSubstructMatch(not_imide) and {conditions}",    
    test_func=None
)

# Add the test to new_tests
imides_acute_test.add_to_new_tests(new_tests)
    

In [82]:
#Imides (Chronic toxicity)

imide=Chem.MolFromSmarts('C(=O)NC(=O)')
not_imide=Chem.MolFromSmarts('c1C(=O)NC(=O)ccccc1')
conditions = "mw < 1000 and logp > 5 and logp<8"

imides_chronic_test = ChemicalTest(
    name='Imides (Chronic toxicity)',
    smart_patterns=[imide, not_imide],
    conditions=f"(mol.HasSubstructMatch(imide) and not mol.HasSubstructMatch(not_imide) and {conditions}",    
    test_func=None
)

# Add the test to new_tests
imides_chronic_test.add_to_new_tests(new_tests)

In [83]:
#Hydrazines and related compounds

hydra1 = Chem.MolFromSmarts('[NX3][NX3]')
hydra2 = Chem.MolFromSmarts('[CX3]=[NX2][NX2]')
hydra3 = Chem.MolFromSmarts('[CX3](=O)[NX2][NX3]')
hydra4 = Chem.MolFromSmarts('[NX2][CX3](=O)[NX2][NX3]')

# Conditions for the Hydrazines test
conditions = "mw < 500"

# Create an instance of ChemicalTest for Hydrazines and Related Compounds
hydrazines_test = ChemicalTest(
    name='Hydrazines and Related Compounds',
    smart_patterns=[hydra1, hydra2, hydra3, hydra4],
    conditions=f"(mol.HasSubstructMatch(hydra1) or mol.HasSubstructMatch(hydra2) or mol.HasSubstructMatch(hydra3) or mol.HasSubstructMatch(hydra4)) and {conditions}",
    test_func=None
)

# Add the test to new_tests
hydrazines_test.add_to_new_tests(new_tests)


In [84]:
#Thiols (Acute toxicity)
thiol=Chem.MolFromSmarts('[C,c][SX2H]')
conditions = "mw < 1000 and logp >=6.5"
thiols_acute_test = ChemicalTest(
    name='Thiols (Acute toxicity)',
    smart_patterns=[thiol],
    conditions=f"(mol.HasSubstructMatch(thiol) and {conditions}",
    test_func=None
)

# Add the test to new_tests
thiols_acute_test.add_to_new_tests(new_tests)


In [85]:
#Thiols (Chronic toxicity)
thiol=Chem.MolFromSmarts('[C,c][SX2H]')
conditions = "mw < 1000 and logp >=6.5 and logp<9"
thiols_chronic_test = ChemicalTest(
    name='Thiols (Chronic toxicity)',
    smart_patterns=[thiol],
    conditions=f"(mol.HasSubstructMatch(thiol) and {conditions}",
    test_func=None
)

# Add the test to new_tests
thiols_chronic_test.add_to_new_tests(new_tests)

In [86]:
#Acrylamides

acrylamide1=Chem.MolFromSmarts('[CH2]=[CH1]C(=O)[NH,NH2]')
acrylamide2=Chem.MolFromSmarts('[CH2]=C([CH3])C(=O)[NH,NH2]')
conditions = "mw < 1000 and  logp<8"
acrylamides_test = ChemicalTest(
    name='Acrylamides',
    smart_patterns=[acrylamide1, acrylamide2],
    conditions=f"(mol.HasSubstructMatch(acrylamide1) or mol.HasSubstructMatch(acrylamide2)) and {conditions}",
    test_func=None
)

# Add the test to new_tests
acrylamides_test.add_to_new_tests(new_tests)

In [87]:
# Acrylates/Methacrylates (Acute toxicity)
acrylate=Chem.MolFromSmarts('[CH2]=[CH]C(=O)O')
methacrylate=Chem.MolFromSmarts('[CH2]=C([CH3])C(=O)O')
conditions = "logp<=5 and mw<1000"
acrylates_methacrylates_acute_test = ChemicalTest(
    name='Acrylates/Methacrylates (Acute toxicity)',
    smart_patterns=[acrylate, methacrylate],
    conditions=f"(mol.HasSubstructMatch(acrylate) or mol.HasSubstructMatch(methacrylate)) and {conditions}",
    test_func=None
)

# Add the test to new_tests
acrylates_methacrylates_acute_test.add_to_new_tests(new_tests)


In [88]:
# Acrylates/Methacrylates (Chronic toxicity)
acrylate=Chem.MolFromSmarts('[CH2]=[CH]C(=O)O')
methacrylate=Chem.MolFromSmarts('[CH2]=C([CH3])C(=O)O')
conditions = "logp>5 and logp<8 and mw<1000"

acrylates_methacrylates_chronic_test = ChemicalTest(
    name='Acrylates/Methacrylates (Chronic toxicity)',
    smart_patterns=[acrylate, methacrylate],
    conditions=f"(mol.HasSubstructMatch(acrylate) or mol.HasSubstructMatch(methacrylate)) and {conditions}",
    test_func=None
)

# Add the test to new_tests
acrylates_methacrylates_chronic_test.add_to_new_tests(new_tests)

In [89]:
len(new_tests)

20

In [90]:
new_tests

{'Epoxides': <__main__.ChemicalTest at 0x745140fe0e50>,
 'Aliphatic Amines': <__main__.ChemicalTest at 0x74514138f370>,
 'Alkoxysilanes': <__main__.ChemicalTest at 0x745140fe0790>,
 'Aminobenzothiazole Azo Dyes': <__main__.ChemicalTest at 0x745140f2e340>,
 'Benzotriazoles': <__main__.ChemicalTest at 0x745140f2e190>,
 'Dianilines': <__main__.ChemicalTest at 0x745140f2e8b0>,
 'Organotins (Acute toxicity)': <__main__.ChemicalTest at 0x745140fe0eb0>,
 'Polynitroaromatics (Acute toxicity) ': <__main__.ChemicalTest at 0x745140fe00d0>,
 'Polynitroaromatics (Chronic toxicity)': <__main__.ChemicalTest at 0x745140fe0a30>,
 'Substituted Triazines (Acute toxicity)': <__main__.ChemicalTest at 0x745141352160>,
 'Substituted Triazines (Chronic toxicity)': <__main__.ChemicalTest at 0x7451766e5700>,
 'Aldehydes (Chronic)': <__main__.ChemicalTest at 0x745140f2e430>,
 'Imides (Acute toxicity)': <__main__.ChemicalTest at 0x745140f2ed00>,
 'Imides (Chronic toxicity)': <__main__.ChemicalTest at 0x745140f2e8

In [51]:
#new_tests.update({k:q.query for k,q in tests.items()})

In [91]:
len(new_tests)

20

In [102]:
example = new_tests['Aliphatic Amines']
out = example.get_details()

In [103]:
[Chem.MolToSmarts(e) for e in out['smart_patterns'] if e is not None]

['[N&X3&H2&!$(NC=[O,N,S])&!$(NCN)][C&X3]',
 '[N&X3&H1&!$(NC=[O,N,S])&!$(NCN)](-,:C)[C&X3]',
 '[N&!$(NC=[O,N,S])&!$(NCN)](-,:C)(-,:C)[C&X3]']

In [95]:
[Chem.MolToSmarts(s) for s in out['smart_patterns'] if s not None]

SyntaxError: invalid syntax (<ipython-input-95-4f1e064bf080>, line 1)

In [55]:
new_tests['Aliphatic Amines']

<__main__.ChemicalTest at 0x74514134ffa0>

In [56]:
new_tests['Acrylates/Methacrylates (Chronic toxicity)']

<__main__.ChemicalTest at 0x74514135a670>

In [57]:
list(new_tests.keys())[:2]

['Epoxides', 'Aliphatic Amines']

In [58]:
list(new_tests.values())[:2]

[<__main__.ChemicalTest at 0x7451766e5e20>,
 <__main__.ChemicalTest at 0x74514134ffa0>]

In [60]:
for x, y in new_tests.items():
    print(x,y.get_details())

ArgumentError: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmarts(NoneType)
did not match C++ signature:
    MolToSmarts(RDKit::ROMol mol, bool isomericSmiles=True)

In [50]:
sorted(list(new_tests.keys()))[:10]

['Acid Chlorides',
 'Acrylamides',
 'Acrylates/Methacrylates (Acute toxicity)',
 'Acrylates/Methacrylates (Chronic toxicity)',
 'Aldehydes (Acute toxicity)',
 'Aldehydes (Chronic toxicity)',
 'Aliphatic Amines',
 'Alkoxysilanes',
 'Aluminum Compounds',
 'Aminobenzothiazole Azo Dyes']

In [83]:
import pymongo
from pymongo import MongoClient

In [92]:
client = MongoClient('mongodb://removed:27017/')

In [93]:
client.list_database_names()

['admin', 'config', 'genra_db', 'genra_dev_v5', 'local']

In [94]:
DB = client.genra_dev_v5

In [95]:
DB.list_collection_names()

['toxref_tr_fp',
 'httr_rj_heparg',
 'toxcast_assays',
 'compounds',
 'chms_fp',
 'toxval_v9',
 'toxref_effects',
 'physprop',
 'htpp_cr_fp_0',
 'toxcast_fp',
 'httr_cr_fp_0',
 'httr_rj_u2os',
 'toxcast_fp_1']

In [97]:
#[e for e in DB2.physprop.find({'dsstox_sid': 'DTXSID8059718'})]

In [98]:
loael_smiles=DB.compounds.find({},{'_id':0,'dsstox_sid':1,'smiles':1}).limit(10)

In [99]:
smiles_dict={record['dsstox_sid']:record['smiles'] for record in loael_smiles if record['smiles']}

In [100]:
smiles_dict

{'DTXSID7020001': 'NC1=NC2=C(C=C1)C1=CC=CC=C1N2',
 'DTXSID2020004': 'CC=NO',
 'DTXSID7020005': 'CC(N)=O',
 'DTXSID2020006': 'CC(=O)NC1=CC=C(O)C=C1',
 'DTXSID7020007': 'CC(=O)C1=CC=C(C=C1)S(=O)(=O)NC(=O)NC1CCCCC1',
 'DTXSID2020008': 'CC(C)=NNC1=NC=C(S1)C1=CC=C(O1)[N+]([O-])=O',
 'DTXSID7020009': 'CC#N',
 'DTXSID6020010': 'CC(C)=NO',
 'DTXSID6020012': 'CC(=O)NNC1=CC=C(CO)C=C1',
 'DTXSID1020013': 'CC(=O)NNC(=O)C1=CC=NC=C1'}

In [71]:
#[e for e in DB2.physprop.find({},{'_id':0,'dsstox_sid':1,'predicted_props.OPERA_LogP':1}).limit(10)]

In [101]:
DB.physprop.find_one({})

{'_id': ObjectId('59c92a22a8d0e8591b32a18f'),
 'predicted_props': {'ACD_FP': [-25.127],
  'NICEATM_MP': [5.14578],
  'NICEATM_VP': [12.158],
  'OPERA_AOH': [2.05998e-12, 2.05995e-12],
  'OPERA_HL': [3.32082e-05],
  'EPISUITE_BP': [118.18],
  'OPERA_WS': [5.8831],
  'ACD_Prop_Molar_Refractivity': [15.478],
  'ACD_Prop_Index_Of_Refraction': [1.393],
  'ACD_Prop_Surface_Tension': [28.315],
  'OPERA_BCF': [6.17193],
  'OPERA_RBiodeg': [0.0],
  'ACD_LogP_v_LogP_Classic': [0.813],
  'EPISUITE_LOGP': [-0.21],
  'EPISUITE_WS': [2.99993],
  'OPERA_BIODEG': [4.59228],
  'NICEATM_BCF': [2.13379],
  'OPERA_LogP': [0.267325],
  'NICEATM_BP': [114.24],
  'ACD_Prop_Parachor': [149.464],
  'ACD_Prop_Density': [0.912],
  'OPERA_VP': [49.9845],
  'OPERA_BP': [110.309],
  'ACD_BP': [19.843],
  'ACD_Prop_Molar_Volume': [64.794],
  'NICEATM_LogS': [1.18613],
  'NICEATM_LogP': [0.404076],
  'EPISUITE_MP': [-73.62],
  'OPERA_KM': [0.112998],
  'ACD_Prop_Polarizability': [6.136],
  'OPERA_MP': [31.3202],
  'O

In [102]:
loael_logp = [e for e in DB.physprop.find({'dsstox_sid': {'$in': list(smiles_dict.keys())}}, {"dsstox_sid":1, "predicted_props.OPERA_LogP" : 1, "_id":0})]

In [103]:
loael_logp[0]

{'predicted_props': {'OPERA_LogP': [0.267325]}, 'dsstox_sid': 'DTXSID2020004'}

In [104]:
logp_dict = {record['dsstox_sid']: record.get('predicted_props',{})['OPERA_LogP'][0] for record in loael_logp}

In [105]:
logp_dict

{'DTXSID2020004': 0.267325,
 'DTXSID7020005': -1.2057,
 'DTXSID2020006': 0.875449,
 'DTXSID7020007': 1.65144,
 'DTXSID2020008': 1.54363,
 'DTXSID7020009': -0.316894,
 'DTXSID6020010': 0.882667,
 'DTXSID6020012': 0.764281,
 'DTXSID1020013': -0.577089,
 'DTXSID7020001': 2.63969}

In [117]:
loael_ws = [e for e in DB.physprop.find({'dsstox_sid': {'$in': list(smiles_dict.keys())}}, {"dsstox_sid":1, "predicted_props.OPERA_WS" : 1, "_id":0})]

In [119]:
#loael_ws

In [120]:
ws_dict = {record['dsstox_sid']: record.get('predicted_props',{})['OPERA_WS'][0] for record in loael_ws}
ws_dict

{'DTXSID2020004': 5.8831,
 'DTXSID7020005': 19.0029,
 'DTXSID2020006': 1.11026,
 'DTXSID7020007': 0.00139472,
 'DTXSID2020008': 3.31981e-05,
 'DTXSID7020009': 8.59913,
 'DTXSID6020010': 1.69356,
 'DTXSID6020012': 0.686068,
 'DTXSID1020013': 0.380852,
 'DTXSID7020001': 0.015397}

In [121]:
DB.compounds.find_one({})

{'_id': ObjectId('609430c44ce714020b32837e'),
 'casrn': '26148-68-5',
 'chemspider_id': 56541.0,
 'dsstox_cid': 'DTXCID101',
 'dsstox_sid': 'DTXSID7020001',
 'gsid': 20001,
 'inchi_key': 'FJTNLJLPLJDTRM-UHFFFAOYSA-N',
 'iupac': '9H-Pyrido[2,3-b]indol-2-amine',
 'mol_weight': 183.214,
 'name': 'A-alpha-C',
 'pubchem_cid': 62805.0,
 'smiles': 'NC1=NC2=C(C=C1)C1=CC=CC=C1N2'}

In [122]:
loael_weight = [e for e in DB.compounds.find({'dsstox_sid': {'$in': list(smiles_dict.keys())}}, {"dsstox_sid":1, "mol_weight" : 1, "_id":0})]

In [123]:
loael_weight

[{'dsstox_sid': 'DTXSID1020013', 'mol_weight': 179.179},
 {'dsstox_sid': 'DTXSID2020004', 'mol_weight': 59.068},
 {'dsstox_sid': 'DTXSID2020006', 'mol_weight': 151.165},
 {'dsstox_sid': 'DTXSID2020008', 'mol_weight': 266.28},
 {'dsstox_sid': 'DTXSID6020010', 'mol_weight': 73.095},
 {'dsstox_sid': 'DTXSID6020012', 'mol_weight': 180.207},
 {'dsstox_sid': 'DTXSID7020001', 'mol_weight': 183.214},
 {'dsstox_sid': 'DTXSID7020005', 'mol_weight': 59.068},
 {'dsstox_sid': 'DTXSID7020007', 'mol_weight': 324.4},
 {'dsstox_sid': 'DTXSID7020009', 'mol_weight': 41.053}]

In [124]:

weight_dict={record['dsstox_sid']:record['mol_weight'] for record in loael_weight}

In [125]:
weight_dict

{'DTXSID1020013': 179.179,
 'DTXSID2020004': 59.068,
 'DTXSID2020006': 151.165,
 'DTXSID2020008': 266.28,
 'DTXSID6020010': 73.095,
 'DTXSID6020012': 180.207,
 'DTXSID7020001': 183.214,
 'DTXSID7020005': 59.068,
 'DTXSID7020007': 324.4,
 'DTXSID7020009': 41.053}

In [126]:
sids=set(logp_dict.keys())&set(ws_dict.keys())&set(weight_dict.keys())&set(smiles_dict.keys())

In [127]:
sids

{'DTXSID1020013',
 'DTXSID2020004',
 'DTXSID2020006',
 'DTXSID2020008',
 'DTXSID6020010',
 'DTXSID6020012',
 'DTXSID7020001',
 'DTXSID7020005',
 'DTXSID7020007',
 'DTXSID7020009'}

In [128]:

records=[]
for sid in sids:
    records.append({'dsstox_sid':sid,'smiles':smiles_dict[sid],'logp':logp_dict[sid],'ws':ws_dict[sid],'mol_weight':weight_dict[sid],'mol':Chem.MolFromSmiles(smiles_dict[sid])})
records=[record for record in records if record['mol']]

In [129]:
records[0]

{'dsstox_sid': 'DTXSID2020006',
 'smiles': 'CC(=O)NC1=CC=C(O)C=C1',
 'logp': 0.875449,
 'ws': 1.11026,
 'mol_weight': 151.165,
 'mol': <rdkit.Chem.rdchem.Mol at 0x7cf74c146760>}

In [130]:
import math
for record in records:
    #print(record)
    epa_categories=sorted([category for category,test in new_tests.items() if test(record)])
    if 'Neutral Organics' in epa_categories and len(epa_categories)>1:
        epa_categories.remove('Neutral Organics')
    record['categories']=tuple(epa_categories)
    

In [131]:
pd.DataFrame(records)

Unnamed: 0,dsstox_sid,smiles,logp,ws,mol_weight,mol,categories
0,DTXSID2020006,CC(=O)NC1=CC=C(O)C=C1,0.875449,1.11026,151.165,<rdkit.Chem.rdchem.Mol object at 0x7cf74c146760>,"(Phenols (Acute toxicity),)"
1,DTXSID7020007,CC(=O)C1=CC=C(C=C1)S(=O)(=O)NC(=O)NC1CCCCC1,1.65144,0.001395,324.4,<rdkit.Chem.rdchem.Mol object at 0x7cf74c146620>,()
2,DTXSID7020001,NC1=NC2=C(C=C1)C1=CC=CC=C1N2,2.63969,0.015397,183.214,<rdkit.Chem.rdchem.Mol object at 0x7cf74c148b20>,()
3,DTXSID1020013,CC(=O)NNC(=O)C1=CC=NC=C1,-0.577089,0.380852,179.179,<rdkit.Chem.rdchem.Mol object at 0x7cf74c148f80>,"(Hydrazines and Related Compounds,)"
4,DTXSID6020010,CC(C)=NO,0.882667,1.69356,73.095,<rdkit.Chem.rdchem.Mol object at 0x7cf74c148ad0>,()
5,DTXSID2020004,CC=NO,0.267325,5.8831,59.068,<rdkit.Chem.rdchem.Mol object at 0x7cf74c148c10>,()
6,DTXSID7020009,CC#N,-0.316894,8.59913,41.053,<rdkit.Chem.rdchem.Mol object at 0x7cf74c1428a0>,()
7,DTXSID2020008,CC(C)=NNC1=NC=C(S1)C1=CC=C(O1)[N+]([O-])=O,1.54363,3.3e-05,266.28,<rdkit.Chem.rdchem.Mol object at 0x7cf74c142030>,"(Hydrazines and Related Compounds,)"
8,DTXSID7020005,CC(N)=O,-1.2057,19.0029,59.068,<rdkit.Chem.rdchem.Mol object at 0x7cf74c142080>,()
9,DTXSID6020012,CC(=O)NNC1=CC=C(CO)C=C1,0.764281,0.686068,180.207,<rdkit.Chem.rdchem.Mol object at 0x7cf74c142760>,"(Hydrazines and Related Compounds,)"


In [132]:
tsca_df = pd.read_excel(interim_dir+'TSCA_OPERA_predictions_080722.xlsx')

In [133]:
records[0].keys()

dict_keys(['dsstox_sid', 'smiles', 'logp', 'ws', 'mol_weight', 'mol', 'categories'])

In [134]:
tsca_df2 = tsca_df[tsca_df['errors'].isnull()]

In [135]:
tsca_df2 = tsca_df2.set_index('dtxsid')

In [136]:
tsca_df2.columns

Index(['Unnamed: 0', 'PREFERRED_NAME', 'CASRN', 'INCHIKEY', 'IUPAC_NAME',
       'SMILES', 'INCHI_STRING', 'MOLECULAR_FORMULA', 'AVERAGE_MASS',
       'MONOISOTOPIC_MASS', 'DATA_SOURCES', 'NUMBER_OF_PUBMED_ARTICLES',
       'PUBCHEM_DATA_SOURCES', 'CPDAT_COUNT', 'MolWeight', 'nbAtoms',
       'nbHeavyAtoms', 'nbC', 'nbO', 'nbN', 'nbAromAtom', 'nbRing',
       'nbHeteroRing', 'Sp3Sp2HybRatio', 'nbRotBd', 'nbHBdAcc', 'ndHBdDon',
       'nbLipinskiFailures', 'TopoPolSurfAir', 'MolarRefract',
       'CombDipolPolariz', 'LogP_pred', 'LogP_predRange', 'AD_LogP',
       'AD_index_LogP', 'Conf_index_LogP', 'MP_pred', 'MP_predRange', 'AD_MP',
       'AD_index_MP', 'Conf_index_MP', 'BP_pred', 'BP_predRange', 'AD_BP',
       'AD_index_BP', 'Conf_index_BP', 'LogVP_pred', 'VP_predRange', 'AD_VP',
       'AD_index_VP', 'Conf_index_VP', 'LogWS_pred', 'WS_predRange', 'AD_WS',
       'AD_index_WS', 'Conf_index_WS', 'LogHL_pred', 'HL_predRange', 'AD_HL',
       'AD_index_HL', 'Conf_index_HL', 'RT_pred',

In [137]:
test_df = tsca_df2[['qsar_ready_smiles','LogP_pred','WS_pred_mol/L','MolWeight']].head()

In [138]:
test_df = test_df.reset_index()

In [139]:
test_df

Unnamed: 0,dtxsid,qsar_ready_smiles,LogP_pred,WS_pred_mol/L,MolWeight
0,DTXSID3060164,C1=CC=CC=C1C(C1C=CC=CC=1)C1C=CC=CC=1,5.76,4.073803e-07,244.125201
1,DTXSID7060837,ICCCI,3.02,0.0007413102,295.855896
2,DTXSID9025879,OC(=O)C=CC1C=CC(C=CC(O)=O)=CC=1,1.99,0.009120108,218.057909
3,DTXSID2026282,O=C(NC1=CC2=C(NC3C4=C(C=CC2=3)C(=O)C2C(=CC=CC=...,3.11,2.454709e-08,667.174336
4,DTXSID4052188,CC1(C)COC(C)(OC1)C1C=CC=CC=1,2.72,0.001148154,206.13068


In [140]:
test_df.columns = ['dsstox_sid', 'smiles', 'logp', 'ws', 'mol_weight']

In [141]:
test_df['mol'] = [Chem.MolFromSmiles(e) for e in test_df['smiles']]

In [142]:
test_df

Unnamed: 0,dsstox_sid,smiles,logp,ws,mol_weight,mol
0,DTXSID3060164,C1=CC=CC=C1C(C1C=CC=CC=1)C1C=CC=CC=1,5.76,4.073803e-07,244.125201,<rdkit.Chem.rdchem.Mol object at 0x7cf7474a37b0>
1,DTXSID7060837,ICCCI,3.02,0.0007413102,295.855896,<rdkit.Chem.rdchem.Mol object at 0x7cf7474b07b0>
2,DTXSID9025879,OC(=O)C=CC1C=CC(C=CC(O)=O)=CC=1,1.99,0.009120108,218.057909,<rdkit.Chem.rdchem.Mol object at 0x7cf7474b0030>
3,DTXSID2026282,O=C(NC1=CC2=C(NC3C4=C(C=CC2=3)C(=O)C2C(=CC=CC=...,3.11,2.454709e-08,667.174336,<rdkit.Chem.rdchem.Mol object at 0x7cf7474b0620>
4,DTXSID4052188,CC1(C)COC(C)(OC1)C1C=CC=CC=1,2.72,0.001148154,206.13068,<rdkit.Chem.rdchem.Mol object at 0x7cf7474b0990>


In [143]:
test_dict = test_df.to_dict('records')

In [144]:
test_dict

[{'dsstox_sid': 'DTXSID3060164',
  'smiles': 'C1=CC=CC=C1C(C1C=CC=CC=1)C1C=CC=CC=1',
  'logp': 5.76,
  'ws': 4.07380277804113e-07,
  'mol_weight': 244.125200512,
  'mol': <rdkit.Chem.rdchem.Mol at 0x7cf7474a37b0>},
 {'dsstox_sid': 'DTXSID7060837',
  'smiles': 'ICCCI',
  'logp': 3.02,
  'ws': 0.0007413102413009177,
  'mol_weight': 295.855896192,
  'mol': <rdkit.Chem.rdchem.Mol at 0x7cf7474b07b0>},
 {'dsstox_sid': 'DTXSID9025879',
  'smiles': 'OC(=O)C=CC1C=CC(C=CC(O)=O)=CC=1',
  'logp': 1.99,
  'ws': 0.009120108393559097,
  'mol_weight': 218.0579088,
  'mol': <rdkit.Chem.rdchem.Mol at 0x7cf7474b0030>},
 {'dsstox_sid': 'DTXSID2026282',
  'smiles': 'O=C(NC1=CC2=C(NC3C4=C(C=CC2=3)C(=O)C2C(=CC=CC=2C4=O)NC(=O)C2C=CC=CC=2)C2=C1C(=O)C1C=CC=CC=1C2=O)C1C=CC=CC=1',
  'logp': 3.11,
  'ws': 2.454708915685029e-08,
  'mol_weight': 667.174335520001,
  'mol': <rdkit.Chem.rdchem.Mol at 0x7cf7474b0620>},
 {'dsstox_sid': 'DTXSID4052188',
  'smiles': 'CC1(C)COC(C)(OC1)C1C=CC=CC=1',
  'logp': 2.72,
  'ws': 0

In [145]:
for record in test_dict:
    #print(record)
    epa_categories=sorted([category for category,test in new_tests.items() if test(record)])
    if 'Neutral Organics' in epa_categories and len(epa_categories)>1:
        epa_categories.remove('Neutral Organics')
    record['categories']=tuple(epa_categories)
    

In [146]:
pd.DataFrame(test_dict)

Unnamed: 0,dsstox_sid,smiles,logp,ws,mol_weight,mol,categories
0,DTXSID3060164,C1=CC=CC=C1C(C1C=CC=CC=1)C1C=CC=CC=1,5.76,4.073803e-07,244.125201,<rdkit.Chem.rdchem.Mol object at 0x7cf7474a37b0>,"(Neutral Organics,)"
1,DTXSID7060837,ICCCI,3.02,0.0007413102,295.855896,<rdkit.Chem.rdchem.Mol object at 0x7cf7474b07b0>,()
2,DTXSID9025879,OC(=O)C=CC1C=CC(C=CC(O)=O)=CC=1,1.99,0.009120108,218.057909,<rdkit.Chem.rdchem.Mol object at 0x7cf7474b0030>,()
3,DTXSID2026282,O=C(NC1=CC2=C(NC3C4=C(C=CC2=3)C(=O)C2C(=CC=CC=...,3.11,2.454709e-08,667.174336,<rdkit.Chem.rdchem.Mol object at 0x7cf7474b0620>,()
4,DTXSID4052188,CC1(C)COC(C)(OC1)C1C=CC=CC=1,2.72,0.001148154,206.13068,<rdkit.Chem.rdchem.Mol object at 0x7cf7474b0990>,"(Neutral Organics,)"


In [153]:
tsca_df2 = tsca_df2.reset_index()

In [154]:
epa_cats_df = tsca_df2[['dtxsid','qsar_ready_smiles','LogP_pred','WS_pred_mol/L','MolWeight']]

In [155]:
epa_cats_df

Unnamed: 0,dtxsid,qsar_ready_smiles,LogP_pred,WS_pred_mol/L,MolWeight
0,DTXSID3060164,C1=CC=CC=C1C(C1C=CC=CC=1)C1C=CC=CC=1,5.76,4.073803e-07,244.125201
1,DTXSID7060837,ICCCI,3.02,7.413102e-04,295.855896
2,DTXSID9025879,OC(=O)C=CC1C=CC(C=CC(O)=O)=CC=1,1.99,9.120108e-03,218.057909
3,DTXSID2026282,O=C(NC1=CC2=C(NC3C4=C(C=CC2=3)C(=O)C2C(=CC=CC=...,3.11,2.454709e-08,667.174336
4,DTXSID4052188,CC1(C)COC(C)(OC1)C1C=CC=CC=1,2.72,1.148154e-03,206.130680
...,...,...,...,...,...
13294,DTXSID5064209,OC(=O)CSC1=NC2=CC=CC=C2S1,1.54,9.772372e-04,224.991820
13295,DTXSID7062873,COC(=O)CCCCl,1.55,2.041738e-01,136.029107
13296,DTXSID9040342,CCC1COC(=O)O1,0.03,9.772372e-01,116.047344
13297,DTXSID3044889,CC(C)(C)C1=CC(=O)C(=CC1=O)C(C)(C)C,4.26,3.981072e-03,220.146330


In [156]:
epa_cats_df.columns = ['dsstox_sid', 'smiles', 'logp', 'ws', 'mol_weight']

In [157]:
epa_cats_df['mol'] = [Chem.MolFromSmiles(e) for e in epa_cats_df['smiles']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  epa_cats_df['mol'] = [Chem.MolFromSmiles(e) for e in epa_cats_df['smiles']]


In [158]:
epa_cats_df[epa_cats_df['mol'].isnull()]

Unnamed: 0,dsstox_sid,smiles,logp,ws,mol_weight,mol


In [159]:
epa_cats_df.dsstox_sid.nunique()

13299

In [160]:
epa_cats_df.shape

(13299, 6)

In [161]:
epa_cats_dict = epa_cats_df.to_dict('records')

In [168]:
new_tests

{'Aliphatic Amines': <function __main__.create_test.<locals>.test(x)>,
 'Alkoxysilanes': <function __main__.create_test.<locals>.test(x)>,
 'Aminobenzothiazole Azo Dyes': <function __main__.create_test.<locals>.test(x)>,
 'Anionic Surfactants': <function __main__.create_test.<locals>.test(x)>,
 'Benzotriazoles': <function __main__.create_test.<locals>.test(x)>,
 'Dianilines': <function __main__.create_test.<locals>.test(x)>,
 'Dithiocarbamates (Acute toxicity)': <function __main__.create_test.<locals>.test(x)>,
 'Dithiocarbamates (Chronic toxicity)': <function __main__.create_test.<locals>.test(x)>,
 'Ethylene Glycol Ethers': <function __main__.create_test.<locals>.test(x)>,
 'Neutral Organics': <function __main__.create_test.<locals>.test(x)>,
 'Nonionic Surfactants': <function __main__.create_test.<locals>.test(x)>,
 'Organotins (Acute toxicity)': <function __main__.create_test.<locals>.test(x)>,
 'Organotins (Chronic toxicity)': <function __main__.create_test.<locals>.test(x)>,
 'Po

In [162]:
for record in epa_cats_dict:
    #print(record)
    epa_categories=sorted([category for category,test in new_tests.items() if test(record)])
    if 'Neutral Organics' in epa_categories and len(epa_categories)>1:
        epa_categories.remove('Neutral Organics')
    record['categories']=tuple(epa_categories)

In [163]:
epa_cats_dict_df = pd.DataFrame(epa_cats_dict)

In [164]:
epa_cats_dict_df.columns

Index(['dsstox_sid', 'smiles', 'logp', 'ws', 'mol_weight', 'mol',
       'categories'],
      dtype='object')

In [165]:
epa_cats_dict_df['categories'].value_counts()

()                                                                   5699
(Neutral Organics,)                                                  2667
(Esters (Acute toxicity),)                                           1310
(Phenols (Acute toxicity),)                                           502
(Anilines (Acute toxicity),)                                          407
                                                                     ... 
(Aldehydes (Chronic toxicity), Phenols (Acute toxicity))                1
(Substituted Triazines (Acute toxicity), Thiols (Acute toxicity))       1
(Aldehydes (Acute toxicity), Nonionic Surfactants)                      1
(Azides (Chronic toxicity),)                                            1
(Acrylamides, Cationic (quaternary ammonium) surfactants)               1
Name: categories, Length: 143, dtype: int64

In [166]:
epa_dict = {k:v for k,v in zip(epa_cats_dict_df['dsstox_sid'],epa_cats_dict_df['categories'] )}

In [167]:
epa_dict = {k:v for k,v in epa_dict.items() if len(v) != 0}

#### Add NCC tag to the main TSCA set

In [71]:
tsca_df = tsca_df.drop('Unnamed: 0', axis = 1)

In [72]:
tsca_df['NCC'] = tsca_df['dtxsid'].apply(lambda x : epa_dict[x] if x in epa_dict.keys() else np.nan) 
        

In [73]:
tsca_df['NCC'].value_counts(dropna = False)

NaN                                                                                                6647
(Neutral Organics,)                                                                                2667
(Esters (Acute toxicity),)                                                                         1310
(Phenols (Acute toxicity),)                                                                         502
(Anilines (Acute toxicity),)                                                                        407
                                                                                                   ... 
(Aldehydes (Chronic toxicity), Phenols (Acute toxicity))                                              1
(Hydrazines and Related Compounds, Polynitroaromatics (Acute toxicity))                               1
(Substituted Triazines (Acute toxicity), Thiols (Acute toxicity))                                     1
(Alkoxysilanes, Anilines (Acute toxicity))                      

In [76]:
tsca_df.dtxsid.nunique()

14247

In [77]:
tsca_df.shape

(14247, 94)

#### Tag DTXSID by vendor number to give an indicator of procurability

In [78]:
vendors = pd.read_excel(raw_dir+'TSCA_Active_ChemicalVendors_Summary_0218_2022.xlsx')

In [79]:
vendors.rename(columns = {'DTXSID': 'dtxsid'}, inplace = True)

In [80]:
vendors.columns

Index(['dtxsid', 'PREFERRED_NAME', 'CASRN', 'INCHIKEY', 'PUBCHEM_CID',
       'VENDOR_COUNT', 'IN_SIGMA-ALDRICH'],
      dtype='object')

In [85]:
vendors = vendors[vendors['dtxsid'].isin(tsca_df['dtxsid'].tolist())]

In [87]:
vendors.shape

(14726, 7)

In [88]:
vendors.dtxsid.nunique()

13996

In [97]:
vendors2 = vendors.groupby('dtxsid').agg({'VENDOR_COUNT': 'mean', 'IN_SIGMA-ALDRICH': 'mean'}).reset_index()

In [98]:
vendors2

Unnamed: 0,dtxsid,VENDOR_COUNT,IN_SIGMA-ALDRICH
0,DTXSID001007489,0.0,0.0
1,DTXSID001014212,,
2,DTXSID001014274,0.0,0.0
3,DTXSID001014286,,
4,DTXSID001014393,5.0,0.0
...,...,...,...
13991,DTXSID90979094,8.0,0.0
13992,DTXSID90988262,0.0,0.0
13993,DTXSID90988560,3.0,0.0
13994,DTXSID90997177,,


In [101]:
final_input_df = pd.merge(tsca_df, vendors2[['dtxsid','VENDOR_COUNT', 'IN_SIGMA-ALDRICH']],  on = 'dtxsid', how = 'left')

In [102]:
final_input_df

Unnamed: 0,dtxsid,PREFERRED_NAME,CASRN,INCHIKEY,IUPAC_NAME,SMILES,INCHI_STRING,MOLECULAR_FORMULA,AVERAGE_MASS,MONOISOTOPIC_MASS,...,errors,qsar_ready_smiles,HLC,WS_pred_mol/L,WS_pred_mg/L,test_track,physical_form,NCC,VENDOR_COUNT,IN_SIGMA-ALDRICH
0,DTXSID3060164,Benzhydrylbenzene,519-73-3,AAAQKTZKLRYKHR-UHFFFAOYSA-N,"1,1',1''-Methanetriyltribenzene",C1=CC=C(C=C1)C(C1=CC=CC=C1)C1=CC=CC=C1,InChI=1S/C19H16/c1-4-10-16(11-5-1)19(17-12-6-2...,C19H16,244.337,244.125201,...,,C1=CC=CC=C1C(C1C=CC=CC=1)C1C=CC=CC=1,2.630268e-06,4.073803e-07,0.099452,B,solid,"(Neutral Organics,)",75.0,1.0
1,DTXSID7060837,"Propane, 1,3-diiodo-",627-31-6,AAAXMNYUNVCMCJ-UHFFFAOYSA-N,"1,3-Diiodopropane",ICCCI,InChI=1S/C3H6I2/c4-2-1-3-5/h1-3H2,C3H6I2,295.890,295.855890,...,,ICCCI,2.454709e-03,7.413102e-04,219.321006,B,liquid,,72.0,1.0
2,DTXSID9025879,"3,3'-(p-Phenylene)diacrylic acid",16323-43-6,AAFXQFIGKBLKMC-UHFFFAOYSA-N,"3,3'-(1,4-Phenylene)di(prop-2-enoic acid)",OC(=O)C=CC1=CC=C(C=CC(O)=O)C=C1,InChI=1S/C12H10O4/c13-11(14)7-5-9-1-2-10(4-3-9...,C12H10O4,218.208,218.057909,...,,OC(=O)C=CC1C=CC(C=CC(O)=O)=CC=1,6.025596e-10,9.120108e-03,1988.711764,B,solid,,13.0,0.0
3,DTXSID2026282,C.I. Vat brown 3,131-92-0,AAKMSGQPNUGLAZ-UHFFFAOYSA-N,"N,N'-(5,10,15,17-Tetraoxo-10,15,16,17-tetrahyd...",O=C(NC1=CC=CC2=C1C(=O)C1=C(C3=C(C=C1)C1=C(N3)C...,InChI=1S/C42H23N3O6/c46-37-24-14-7-8-15-25(24)...,C42H23N3O6,665.661,665.158685,...,,O=C(NC1=CC2=C(NC3C4=C(C=CC2=3)C(=O)C2C(=CC=CC=...,2.630268e-09,2.454709e-08,0.016377,B,solid,,13.5,0.0
4,DTXSID4052188,"2,5,5-Trimethyl-2-phenyl-1,3-dioxane",5406-58-6,AALXTPRRKXUUOM-UHFFFAOYSA-N,"2,5,5-Trimethyl-2-phenyl-1,3-dioxane",CC1(C)COC(C)(OC1)C1=CC=CC=C1,"InChI=1S/C13H18O2/c1-12(2)9-14-13(3,15-10-12)1...",C13H18O2,206.285,206.130680,...,,CC1(C)COC(C)(OC1)C1C=CC=CC=1,9.772372e-05,1.148154e-03,236.669687,B,solid,"(Neutral Organics,)",11.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14242,DTXSID5064209,2-(Carboxymethylthio)benzothiazole,6295-57-4,ZZUQWNYNSKJLPI-UHFFFAOYSA-N,"[(1,3-Benzothiazol-2-yl)sulfanyl]acetic acid",OC(=O)CSC1=NC2=CC=CC=C2S1,InChI=1S/C9H7NO2S2/c11-8(12)5-13-9-10-6-3-1-2-...,C9H7NO2S2,225.280,224.991821,...,,OC(=O)CSC1=NC2=CC=CC=C2S1,3.388442e-09,9.772372e-04,219.870381,B,solid,,79.0,1.0
14243,DTXSID7062873,"Butanoic acid, 4-chloro-, methyl ester",3153-37-5,ZZUYIRISBMWFMV-UHFFFAOYSA-N,Methyl 4-chlorobutanoate,COC(=O)CCCCl,"InChI=1S/C5H9ClO2/c1-8-5(7)3-2-4-6/h2-4H2,1H3",C5H9ClO2,136.580,136.029107,...,,COC(=O)CCCCl,2.818383e-04,2.041738e-01,27773.578977,B,liquid,"(Esters (Acute toxicity),)",71.0,1.0
14244,DTXSID9040342,"4-Ethyl-1,3-dioxolan-2-one",4437-85-8,ZZXUZKXVROWEIF-UHFFFAOYSA-N,"4-Ethyl-1,3-dioxolan-2-one",CCC1COC(=O)O1,"InChI=1/C5H8O3/c1-2-4-3-7-5(6)8-4/h4H,2-3H2,1H3",C5H8O3,116.116,116.047344,...,,CCC1COC(=O)O1,2.570396e-08,9.772372e-01,113405.784063,B,liquid,"(Neutral Organics,)",49.0,1.0
14245,DTXSID3044889,"2,5-Di-tert-butylcyclohexa-2,5-diene-1,4-dione",2460-77-7,ZZYASVWWDLJXIM-UHFFFAOYSA-N,"2,5-Di-tert-butylcyclohexa-2,5-diene-1,4-dione",CC(C)(C)C1=CC(=O)C(=CC1=O)C(C)(C)C,"InChI=1S/C14H20O2/c1-13(2,3)9-7-12(16)10(8-11(...",C14H20O2,220.312,220.146330,...,,CC(C)(C)C1=CC(=O)C(=CC1=O)C(C)(C)C,5.370318e-04,3.981072e-03,876.418325,B,solid,,61.0,1.0


In [103]:
writer = pd.ExcelWriter(interim_dir+'tsca_struct_input_080722.xlsx', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.

final_input_df.to_excel(writer)

writer.save()
