# EPA CompTox Chemistry Dashboard

Lookup CAS numbers in EPA CompTox Dashboard.

See: https://comptox.epa.gov/dashboard.

In [1]:
import logging

import pandas as pd

from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import PandasTools
from rdkit.Chem import Descriptors
PandasTools.RenderImagesInAllDataFrames()

from tqdm import tqdm_notebook
tqdm_notebook().pandas()




In [2]:
# Set up a logger object...

logger = logging.getLogger(__name__)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('[%(asctime)s %(name)s %(levelname)s] %(message)s', datefmt='%d/%m/%y %H:%M:%S'))
logger.addHandler(handler)
logger.setLevel(logging.INFO)

In [3]:
from EPA import comptox_lookup, comptox_link

### Config

### Reload CAS numbers

In [11]:
#cas_numbers = open('cas_numbers.txt', 'r').read().strip().split('\n')
filename = "C:\\Users\\bet\\Documents\\Inditex\\Compound inventory\\Annotated compounds\\REACH\\AnnexVI_inventory\\20170516\\annex_vi_clp_table_en.split.TallAndSkinny.tsv"
with open(filename, encoding='utf8') as infile:
    cas_numbers = set([x.split('\t')[4] for x in infile])

len(cas_numbers)

3962

### Lookup CAS numbers

In [13]:
records = [y for y in (comptox_lookup(x) for x in tqdm_notebook(cas_numbers)) if y is not None]

len(records)

CAS number '': problem extracting name.
CAS number '' (name = 'None'): no structure available.
CAS number '68919-02-8' (name = 'Gases, petroleum, fluidized catalytic cracker fractionation off'): no structure available.
CAS number '64741-65-7' (name = 'Naphtha, petroleum, heavy alkylate'): no structure available.
CAS number '68308-03-2' (name = 'Tail gas, petroleum, gas oil catalytic cracking absorber'): no structure available.
CAS number '8065-36-9' (name = 'Bufencarb'): no structure available.
CAS number '70592-77-7' (name = 'Distillates, petroleum, light vacuum'): no structure available.
CAS number '68476-32-4' (name = 'Fuel oil, residues-straight-run gas oils, high-sulfur'): no structure available.
CAS number '68919-39-1' (name = 'Natural gas condensates'): no structure available.
CAS number '68308-10-1' (name = 'Tail gas, petroleum, straight-run distillate hydrodesulfurizer, hydrogen sulfide-free'): no structure available.
CAS number '65996-92-1' (name = 'Distillates, coal tar'): n

CAS number '64741-81-7' (name = 'Distillates, petroleum, heavy thermal cracked'): no structure available.
CAS number '65996-86-3' (name = 'Extract oils, coal, tar base'): no structure available.
CAS number '68477-34-9' (name = 'Distillates, petroleum, C3-5, 2-methyl-2-butene-rich'): no structure available.
CAS number '72623-85-9' (name = 'Lubricating oils, petroleum, C20-50, hydrotreated neutral oil-based, high-viscosity'): no structure available.
CAS number '65996-87-4' (name = 'Extract residues, coal, tar oil alk.'): no structure available.
CAS number '64742-28-5' (name = 'Distillates (petroleum), chemically neutralized light paraffinic'): no structure available.
CAS number '12510-42-8' (name = 'Erionite'): no structure available.
CAS number '64741-51-1' (name = 'Distillates, petroleum, heavy paraffinic'): no structure available.
CAS number '92062-15-2' (name = 'Solvent naphtha, (petroleum), hydrotreated light, naphthenic'): no structure available.
CAS number '139189-30-3' (name = 'P

CAS number '68410-98-0' (name = 'Distillates, petroleum, hydrotreated heavy naphtha, deisohexanizer overheads'): no structure available.
CAS number '64741-91-9' (name = 'Distillates, petroleum, solvent-refined middle'): no structure available.
CAS number '64742-19-4' (name = 'Distillates, petroleum, acid-treated light naphthenic'): no structure available.
CAS number '68513-66-6' (name = 'Residues, petroleum, alkylation splitter, C4-rich'): no structure available.
CAS number '11138-47-9' (name = 'Sodium perborate'): no structure available.
CAS number '64741-52-2' (name = 'Distillates, petroleum, light naphthenic'): no structure available.
CAS number '91465-08-6' (name = 'lambda-Cyhalothrin'): will regenerate SMILES from conntab.
CAS number '68334-30-5' (name = 'Fuels, diesel'): no structure available.
CAS number '64742-89-8' (name = 'Solvent naphtha, petroleum, light aliph.'): no structure available.
CAS number '68475-58-1' (name = 'Alkanes, C2-3'): no structure available.
CAS number '6

CAS number '61789-60-4' (name = 'Pitch'): no structure available.
CAS number '68513-03-1' (name = 'Naphtha, petroleum, light catalytic reformed, arom.-free'): no structure available.
CAS number '68477-94-1' (name = 'Gases, petroleum, gas recovery plant depropanizer overheads'): no structure available.
CAS number '68814-89-1' (name = 'Extracts (petroleum), heavy paraffinic distillates, solvent-deasphalted'): no structure available.
CAS number '68410-97-9' (name = 'Distillates, petroleum, light distillate hydrotreating process, low-boiling'): no structure available.
CAS number '68989-88-8' (name = 'Gases, petroleum, crude distn. and catalytic cracking'): no structure available.
CAS number '64742-61-6' (name = 'Slack wax, petroleum'): no structure available.
CAS number '26523-78-4' (name = 'Nonylphenolphosphite(3:1)'): no structure available.
CAS number '68478-32-0' (name = 'Tail gas, petroleum, saturate gas plant mixed stream, C4-rich'): no structure available.
CAS number '14507-36-9' (n

2281

In [50]:
comptox_df = pd.DataFrame(records)

comptox_df.shape

(2281, 7)

In [51]:
# Check for missing names or structures...

for x in ('name', 'inchi', 'inchikey', 'smiles'): assert (comptox_df[x].isnull() | (comptox_df[x] == '')).sum() == 0

In [52]:
# Add link to CompTox dashboard...

comptox_df['report'] = comptox_df['cas'].apply(comptox_link)

In [53]:
# Add RDKit molecule...

### PandasTools.AddMoleculeColumnToFrame(comptox_df, smilesCol='smiles', molCol='mol')

comptox_df['mol'] = [Chem.MolFromMolBlock(x) for x in comptox_df['conntab']]
comptox_df['MW'] = [Descriptors.MolWt(x) if not x == None else None for x in comptox_df['mol']]
comptox_df['logP'] = [Descriptors.MolLogP(x) if not x == None else None for x in comptox_df['mol']]
comptox_df['TPSA'] = [Descriptors.TPSA(x) if not x == None else None for x in comptox_df['mol']]

In [71]:
comptox_df.drop(['inchi', 'conntab', 'synonyms', 'mol'], axis=1).head()

ValueError: labels ['inchi' 'conntab' 'synonyms' 'mol'] not contained in axis

In [74]:
outFile = "C:\\Users\\bet\\Documents\\Inditex\\Compound inventory\\Annotated compounds\\REACH\\AnnexVI_inventory\\20170516\\annex_vi_clp_table_en.split.TallAndSkinny.smi_desc.tsv"
outF = open(outFile, 'w')
comptox_df.drop(['inchi', 'conntab', 'synonyms', 'mol'], axis=1, inplace=True)
comptox_df.to_csv(outFile, sep='\t', float_format='%.2f', index=False)
outF.close()