# Creating the starting inventory and identifying the TSCA substances for degradation

- Created by: Grace Patlewicz
- Last date modified: 30 April 2024
- Merging OPERA predictions and inventory with PFAS Atlas assignments. Identifying TSCA substances for which degradation products need to be identified

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import openpyxl
from rdkit.Chem.Draw import IPythonConsole, MolsToGridImage

#Show mols in dataframes
from rdkit.Chem import PandasTools
from rdkit import Chem
from rdkit.Chem.Draw import MolsToGridImage
from IPython.core.display import HTML
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import DataStructs
import os
from scipy.spatial.distance import pdist, squareform


In [2]:
TOP = os.getcwd().replace('notebooks', '')
raw_dir = TOP + 'data/raw/'
interim_dir = TOP + 'data/interim/'
external_dir = TOP + 'data/external/'
processed_dir = TOP + 'data/processed/'
figures_dir = TOP + 'reports/figures/'

Import new starting inventory prepared by AW and reviewed by TB et al

In [5]:
df = pd.read_excel(raw_dir+'DSSTox_PFAS8a7v3_20240426.xlsx')

  warn("Workbook contains no default style, apply openpyxl's default")


In [9]:
df = df[[ 'DSSTox_Substance_Id','DSSTox_QC-Level', 'Substance_Name',
       'Substance_CASRN', 'Substance_Type', 'Substance_Note',
       'Structure_SMILES', 'Structure_InChI', 'Structure_InChIKey',
       'Structure_Formula', 'Structure_MolWt', 'Structure_SMILES_2D-QSAR']]

In [10]:
df.shape

(13054, 12)

In [11]:
df.rename(columns = {'DSSTox_Substance_Id' : 'dtxsid',  'Structure_SMILES':'smiles'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns = {'DSSTox_Substance_Id' : 'dtxsid',  'Structure_SMILES':'smiles'}, inplace = True)


There are 13,054 substances on this inventory represemted by 13,042 unique SMILES.

In [14]:
df.smiles.nunique()

13042

Pulling in the TSCA chemicals downloaded from public Dashboard 30 April 2024

In [18]:
tsca_active = pd.read_excel(raw_dir+'Chemical List TSCA_ACTIVE_NCTI_0224-2024-04-30.xlsx')

In [19]:
tsca_inactive = pd.read_excel(raw_dir+'Chemical List TSCA_INACTIVE_NCTI_0224-2024-04-30.xlsx')

In [22]:
inactive_dtx = tsca_inactive['DTXSID'].tolist()

In [23]:
active_dtx = tsca_active['DTXSID'].tolist()

In [29]:
len(inactive_dtx)

34387

In [31]:
#df['dtxsid']

In [34]:
df['TSCA_STATUS'] = df['dtxsid'].apply(lambda x: 'active' if x in active_dtx else ('inactive' if x in inactive_dtx else np.nan))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['TSCA_STATUS'] = df['dtxsid'].apply(lambda x: 'active' if x in active_dtx else ('inactive' if x in inactive_dtx else np.nan))


In [36]:
df['TSCA_STATUS'].value_counts()

inactive    326
active      237
Name: TSCA_STATUS, dtype: int64

There are 326 inactives and 237 actives on the TSCA inventory that are PFAS.

Pulling in the OPERA predictions and the PFAS Atlas assignments

In [37]:
opera = pd.read_csv(raw_dir+'DSSTox_PFAS8a7v3_20240426_V2000-sdf_OPERA2.9Pred.csv')

In [39]:
opera.columns

Index(['MoleculeID', 'MolWeight', 'nbAtoms', 'nbHeavyAtoms', 'nbC', 'nbO',
       'nbN', 'nbAromAtom', 'nbRing', 'nbHeteroRing',
       ...
       'Conf_index_CoMPARA_Bind', 'CATMoS_VT_pred', 'CATMoS_NT_pred',
       'CATMoS_EPA_pred', 'CATMoS_GHS_pred', 'CATMoS_LD50_pred',
       'CATMoS_LD50_predRange', 'AD_CATMoS', 'AD_index_CATMoS',
       'Conf_index_CATMoS'],
      dtype='object', length=150)

In [40]:
opera.shape

(12781, 150)

In [41]:
opera.rename(columns = {'MoleculeID': 'dtxsid'}, inplace = True)

In [44]:
df1 = pd.merge(df, opera, on = 'dtxsid', how = 'left')

In [51]:
atlas = pd.read_csv(external_dir+'new_inv_pfas_atlas_300424.csv', index_col=[0])

In [54]:
atlas = atlas[['Canonical_QSARr', 'Molecule name','First_Class', 'Second_Class' ]]

In [56]:
atlas.rename(columns = {'Canonical_QSARr': 'QSAR_READY_SMILES', 'Molecule name': 'dtxsid'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  atlas.rename(columns = {'Canonical_QSARr': 'QSAR_READY_SMILES', 'Molecule name': 'dtxsid'}, inplace = True)


In [60]:
atlas = atlas[['dtxsid', 'First_Class', 'Second_Class','QSAR_READY_SMILES']]

In [62]:
df1 = pd.merge(df1, atlas, on = 'dtxsid', how = 'left')

In [69]:
df1[df1['TSCA_STATUS'].notnull()].to_csv(interim_dir+'april2024_tsca_pfas.csv')