# Processing biodegradation predictions generated by Catalogic to extract out the TSCA simulated degradates

- Created by: Grace Patlewicz
- Last modified: 30 April 2024
- Change: Processing Catalogic predictions to extract out unique degradation products generated by substances on the TSCA inventory

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import openpyxl
from rdkit.Chem.Draw import IPythonConsole, MolsToGridImage

#Show mols in dataframes
from rdkit.Chem import PandasTools
from rdkit import Chem
from rdkit.Chem.Draw import MolsToGridImage
from IPython.core.display import HTML
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import DataStructs
import os
from scipy.spatial.distance import pdist, squareform

In [2]:
TOP = os.getcwd().replace('notebooks', '')
raw_dir = TOP + 'data/raw/'
interim_dir = TOP + 'data/interim/'
external_dir = TOP + 'data/external/'
processed_dir = TOP + 'data/processed/'
figures_dir = TOP + 'reports/figures/'

In [3]:
df = pd.read_csv(raw_dir+'catalogic_output_260424.txt', sep = '\t')

  df = pd.read_csv(raw_dir+'catalogic_output_260424.txt', sep = '\t')


In [4]:
df['Parent'] = df['Chemical name']

In [5]:
df['dtxsid'] = df['Parent']

In [6]:
df.Parent.nunique()

13000

In [7]:
df['Parent'] = df['Parent'].replace(np.nan).ffill()

In [8]:
df.Parent.nunique()

13000

In [9]:
df.shape

(1408276, 23)

In [10]:
df['Parent_id'] = df['dtxsid'].apply(lambda x: 'Parent' if pd.notnull((x)) else np.nan)

In [11]:
df.Parent_id.value_counts()

Parent    13000
Name: Parent_id, dtype: int64

In [22]:
df.to_csv(interim_dir+'catalogic_300424.csv')

In [13]:
#df = df.sort_values(by = ['Parent_id', 'Parent'], ascending = [True, True])

In [56]:
df_sorted = df.sort_values(by='Parent')

In [57]:
df_sorted

Unnamed: 0,#,ID of metabolite,Level of generation,Predecessor ID,CAS,Chemical name,SMILES,BOD_Observed [28.00 days],Observed map/metabolite,Transformation name,...,Transformation type,Transformation probability,Used probability,Quantity of metabolite,Total quantity of metabolite,Number of repetitions,Status,Parent,dtxsid,Parent_id
1408275,13000,8,1,1,,,FC(Cl)=C(F)SCc1ccccc1,,,Dehalogenation|Dehydrohalogenation,...,[phase I],0.00100,0.000958,0.000958,0.000958,1,Low probability,DTXSID001002889,,
1408251,13000,34,4,32,,,OC(=O)CSC(F)(F)C(F)Cl,,,Oxidation|Aldehyde oxidation,...,[phase I],1.00000,1.000000,0.009468,0.025770,2,Exists in the map,DTXSID001002889,,
1408250,13000,32,3,31,,,O=CCSC(F)(F)C(F)Cl,,,Cleavage|C-C bond cleavage|Reductive C-C bond ...,...,[phase I],1.00000,1.000000,0.000000,0.000000,2,Exists in the map,DTXSID001002889,,
1408249,13000,31,2,3,,,OC(=CC=CC(=O)CSC(F)(F)C(F)Cl)C(O)=O,,,Cleavage|Ring cleavage|Aromatic ring cleavage,...,[phase I],1.00000,1.000000,0.000000,0.000000,2,Exists in the map,DTXSID001002889,,
1408248,13000,3,1,1,,,Oc1cccc(CSC(F)(F)C(F)Cl)c1O,,,Hydroxylation|Aromatic ring hydroxylation,...,[phase I],0.03081,0.029520,0.000000,0.000000,2,Exists in the map,DTXSID001002889,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,1,8,1,1,,,FC1C(F)=C(F)C(F)(F)C1(F)F,,,Dehalogenation|Dehydrohalogenation,...,[phase I],0.00100,0.000999,0.000999,0.001997,2,Low probability,DTXSID90998543,,
3,1,6,1,1,,,FC1C(F)=C(F)C(F)(F)C1(F)F,,,Dehalogenation|Dehydrohalogenation,...,[phase I],0.00100,0.000999,0.000999,0.001997,2,Low probability,DTXSID90998543,,
2,1,4,1,1,,,FC1C(F)(F)C(F)=C(F)C1(F)F,,,Dehalogenation|Dehydrohalogenation,...,[phase I],0.00100,0.000999,0.000999,0.001997,2,Low probability,DTXSID90998543,,
1,1,2,1,1,,,FC1C(F)(F)C(F)=C(F)C1(F)F,,,Dehalogenation|Dehydrohalogenation,...,[phase I],0.00100,0.000999,0.000999,0.001997,2,Low probability,DTXSID90998543,,


In [58]:
newlst = []

# Iterate over each group
for i, group in df_sorted.groupby('Parent'):
    subset = group[[ 'Parent', 'Parent_id']]
    # Define a counter for generating unique identifiers
    counter = 1
    # Iterate over rows in the subset
    for index, row in subset.iterrows():
        if row['Parent_id'] == 'Parent':
            newlst.append(row['Parent'])
        else:
            # Append with a unique identifier based on the counter
            newlst.append(row['Parent'] + '_m_' + str(counter))
            # Increment the counter
            counter += 1  

In [59]:
len(newlst)

1408276

In [62]:
df_sorted['ID'] = newlst

In [65]:
df_sorted1 = df_sorted.sort_values(by = 'ID')

In [66]:
tsca_df = pd.read_csv(interim_dir+'april2024_tsca_pfas.csv')

In [68]:
tsca_dtx = tsca_df['dtxsid'].tolist()

In [71]:
tsca_degradation = df_sorted1[df_sorted1['Parent'].isin(tsca_dtx)]

In [72]:
tsca_degradation

Unnamed: 0,#,ID of metabolite,Level of generation,Predecessor ID,CAS,Chemical name,SMILES,BOD_Observed [28.00 days],Observed map/metabolite,Transformation name,...,Transformation probability,Used probability,Quantity of metabolite,Total quantity of metabolite,Number of repetitions,Status,Parent,dtxsid,Parent_id,ID
1356137,12599,1,0,-,213974-85-7,DTXSID001343190,FC(F)(CCOB(OCCC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)...,No data,,,...,,,0.997000,0.997000,1,,DTXSID001343190,DTXSID001343190,Parent,DTXSID001343190
1356138,12599,2,1,1,,,FC(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)CCOB(O...,,,Dehalogenation|Reductive dehalogenation,...,0.0010,0.000999,0.000999,0.002997,3,Low probability,DTXSID001343190,,,DTXSID001343190_m_1
1356140,12599,6,1,1,,,FC(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)CCOB(O...,,,Dehalogenation|Reductive dehalogenation,...,0.0010,0.000999,0.000999,0.002997,3,Low probability,DTXSID001343190,,,DTXSID001343190_m_2
1356139,12599,4,1,1,,,FC(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)CCOB(O...,,,Dehalogenation|Reductive dehalogenation,...,0.0010,0.000999,0.000999,0.002997,3,Low probability,DTXSID001343190,,,DTXSID001343190_m_3
1327200,12437,1,0,-,1250867-58-3,DTXSID001358896,CCCCCCCC[N+]1(.[O-]S(=O)(=O)C(F)(F)C(F)F)=CN(C...,No data,,,...,,,0.000000,0.000000,1,,DTXSID001358896,DTXSID001358896,Parent,DTXSID001358896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,10,17,4,13,,,O[Si](Cl)(Cl)C(F)=C(F)F,,,Dehalogenation|Dehydrohalogenation,...,0.0010,0.000999,0.000025,0.000050,2,Low probability,DTXSID90962360,,,DTXSID90962360_m_5
351,10,5,1,1,,,C[Si](Cl)(Cl)C(F)=C(F)F,,,Dehalogenation|Dehydrohalogenation,...,0.0010,0.000996,0.000996,0.001992,2,Low probability,DTXSID90962360,,,DTXSID90962360_m_6
352,10,7,1,1,,,C[Si](Cl)(Cl)C(F)(F)C(O)(F)F,,,Hydroxylation|Aliphatic hydroxylation,...,0.0010,0.000996,0.000996,0.000996,1,Low probability,DTXSID90962360,,,DTXSID90962360_m_7
353,10,8,1,1,,,C[Si](O)(Cl)Cl,,,Cleavage|C-Si bond cleavage|Hydrolytic C-Si bo...,...,0.0006,0.000598,0.000598,0.000598,1,Low probability,DTXSID90962360,,,DTXSID90962360_m_8


In [73]:
def smi_inchi(x):
    mol = Chem.MolFromSmiles(x)
    clean_smi = Chem.MolToSmiles(mol, isomericSmiles=False)
    clean_mol = Chem.MolFromSmiles(clean_smi)
    inchi =     Chem.MolToInchi(clean_mol)
    inchi_key = Chem.InchiToInchiKey(inchi)
    return inchi_key

In [74]:
inchi_keys = {}
errors = []
for k,v in zip(tsca_degradation['ID'], tsca_degradation['SMILES']):
    try:
        X = smi_inchi(v)
    except Exception as error:
        errors.append(error)
    else:
        inchi_keys[k] = X

[19:14:19] SMILES Parse Error: syntax error while parsing: CCCCCCCC[N+]1(.[O-]S(=O)(=O)C(F)(F)C(F)F)=CN(C)C=C1
[19:14:19] SMILES Parse Error: Failed parsing SMILES 'CCCCCCCC[N+]1(.[O-]S(=O)(=O)C(F)(F)C(F)F)=CN(C)C=C1' for input: 'CCCCCCCC[N+]1(.[O-]S(=O)(=O)C(F)(F)C(F)F)=CN(C)C=C1'






























































































































































































































































































































































































































































































[19:14:20] SMILES Parse Error: syntax error while parsing: O=C(C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F)[O-].[Cr+3](.[O-]C(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)F).[O-]C(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)

In [75]:
df_inchi = pd.DataFrame(inchi_keys.items(), columns = ['ID', 'INCHI_KEY'])

In [77]:
df_inchi.shape

(38683, 2)

In [78]:
tsca_degradation.shape

(39512, 25)

In [80]:
tsca_degradation1 = pd.merge(tsca_degradation, df_inchi, on = 'ID', how = 'left')

In [82]:
tsca_degradation1.shape

(39512, 26)

In [85]:
tsca_degradation2 = tsca_degradation1[tsca_degradation1.INCHI_KEY.notnull()]

In [86]:
pfas_tsca_grpby = tsca_degradation2.groupby('INCHI_KEY')['ID'].apply(lambda x : x.tolist()).reset_index()

In [88]:
pfas_tsca = pd.merge(pfas_tsca_grpby, tsca_degradation2[['INCHI_KEY', 'SMILES']], on = 'INCHI_KEY', how = 'left').drop_duplicates(subset = ['INCHI_KEY'])

In [89]:
pfas_tsca

Unnamed: 0,INCHI_KEY,ID,SMILES
0,AACZGCBUSNLCIK-UHFFFAOYSA-N,"[DTXSID60887450_m_58, DTXSID60887450_m_89, DTX...",OC(=O)CC(=O)C(=CC(O)=O)OS(=O)(=O)C(F)(F)C(F)(F...
3,AAIAEOPDGOKSRA-UHFFFAOYSA-N,"[DTXSID40883016_m_22, DTXSID40883016_m_28, DTX...",CCCCP(O)(=O)(CCCC).[N-](C)S(=O)(=O)C(F)(F)C(F)...
7,AALULFMCJUWILT-UHFFFAOYSA-N,"[DTXSID30880554_m_136, DTXSID4070378_m_150]",CCN(CO)S(=O)(=O)CCC(F)(F)C(F)(F)C(F)(F)C(F)(F)...
9,AAOIVJJZEDSKPA-UHFFFAOYSA-N,"[DTXSID0072772_m_6, DTXSID00881275_m_5, DTXSID...",CNCCC=O
23,AAPLIUHOKVUFCC-UHFFFAOYSA-N,[DTXSID90880116_m_19],C[Si](C)(C)O
...,...,...,...
38674,ZZKOWTIHKUWUOI-UHFFFAOYSA-N,[DTXSID8067365_m_7],OC(F)(F)C(F)(F)Oc1ccc(N(=O)=O)cc1
38675,ZZQYDYODFHABLC-UHFFFAOYSA-N,[DTXSID80883353_m_1],FC1(Cl)CCC1(F)F
38676,ZZWKTJZMSCLBFA-UHFFFAOYSA-N,"[DTXSID90886557_m_120, DTXSID90886557_m_130]",CC(C)(C(C(C(=O)C(F)(F)C(F)(F)C(F)(F)F)C(O)=O)C...
38678,ZZXFGOSSISMSPC-UHFFFAOYSA-N,"[DTXSID60865157_m_277, DTXSID60865157_m_278, D...",Oc1c(F)c(F)c(F)c(OC(F)(C(=O)F)C(F)(F)F)c1F


In [90]:
import re

In [91]:
pfas_dict = {k:Chem.MolFromSmiles(v) for k,v in zip(pfas_tsca['INCHI_KEY'], pfas_tsca['SMILES'])}

In [92]:
def chain_length(mol, ch=30):
    mysr = 'C(F)(F)'
    mylst = []
    for n in range(1, ch):
        a = mol.HasSubstructMatch(Chem.MolFromSmarts(''.join(mysr * n)))
        mylst.append(a)
    return mylst.index(False)

In [93]:
errors = []
mydict = {}
for k,v in pfas_dict.items():
    try:
        mydict[k] = chain_length(v)
    except Exception as error:
        errors.append(k)

In [95]:
len(mydict)

4245

In [96]:
ch_df = pd.DataFrame(list(mydict.items()), columns = ['INCHI_KEY', 'chain_length'])

In [97]:
ch_df

Unnamed: 0,INCHI_KEY,chain_length
0,AACZGCBUSNLCIK-UHFFFAOYSA-N,5
1,AAIAEOPDGOKSRA-UHFFFAOYSA-N,4
2,AALULFMCJUWILT-UHFFFAOYSA-N,6
3,AAOIVJJZEDSKPA-UHFFFAOYSA-N,0
4,AAPLIUHOKVUFCC-UHFFFAOYSA-N,0
...,...,...
4240,ZZKOWTIHKUWUOI-UHFFFAOYSA-N,2
4241,ZZQYDYODFHABLC-UHFFFAOYSA-N,1
4242,ZZWKTJZMSCLBFA-UHFFFAOYSA-N,3
4243,ZZXFGOSSISMSPC-UHFFFAOYSA-N,1


In [98]:
pfas_tsca1 = pd.merge(pfas_tsca, ch_df, on = 'INCHI_KEY', how = 'left')

In [99]:
pfas_tsca2 = pfas_tsca1[~((pfas_tsca1['chain_length'].isnull()) | (pfas_tsca1['chain_length'] ==0) ) ]

In [101]:
pfas_tsca2.to_csv(external_dir+'pfas_tsca_degradation.csv')

In [111]:
pfas_tsca2.columns

Index(['INCHI_KEY', 'ID', 'SMILES', 'chain_length'], dtype='object')

In [112]:
pfas_tsca2.columns = ['dtxsid', 'ID', 'SMILES', 'chain_length']

In [115]:
pfas_tsca2.shape

(3388, 4)

In [103]:
atlas_deg = pd.read_csv(external_dir+'new_inv_pfas_tsca_deg_atlas_020524.csv')

In [107]:
atlas_deg = atlas_deg[['Molecule name','First_Class', 'Second_Class', 'Canonical_QSARr']]

In [110]:
atlas_deg.columns = ['dtxsid', 'First_Class', 'Second_Class', 'QSAR_READY_SMILES']

In [116]:
pfas_tsca3 = pd.merge(pfas_tsca2, atlas_deg, on = 'dtxsid', how = 'left')

In [117]:
opera_df = pd.read_csv(processed_dir+'pfas_april2024_tsca_degradation-sdf_OPERA2.9Pred.csv')

In [119]:
opera_df.rename(columns = {'MoleculeID': 'dtxsid'}, inplace = True)

In [120]:
opera_df.shape

(3220, 150)

In [123]:
pfas_tsca4 = pd.merge(pfas_tsca3, opera_df, on = 'dtxsid', how = 'left')

In [155]:
chk = []
chk2 = []
for i, row in pfas_tsca4.iterrows():
    if len(row['ID']) == 1 :
        if '_m_' not  in row['ID'][0]:
            chk.append(row['ID'])
    elif len(row['ID']) >1 and '_m_' not in row['ID'][0]:
        chk2.append(row['ID'])
   

Removing metabolites that are actually just parents as well as degradates that match with existing parents. That would remove 461 from the list>

In [150]:
len(chk)

426

In [156]:
len(chk2)

35

In [161]:
426+35

461

In [163]:
pfas_tsca5 = pfas_tsca4[~pfas_tsca4['ID'].isin(chk+chk2)]

In [164]:
pfas_tsca5.shape

(2927, 156)

In [168]:
pfas_tsca5.rename(columns = {'SMILES': 'smiles'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pfas_tsca5.rename(columns = {'SMILES': 'smiles'}, inplace = True)


So there are 2927 unique degradates that originate from the TSCA list that don't include the TSCA parents.

In [169]:
pfas_tsca5.to_csv(processed_dir+'pfas_tsca_degs_wcats.csv')