#### Considering additional chemicals beyond the centroids 

Created by: Grace Patlewicz <br>
Last modified: 06 Nov  2023<br

This notebook captures several threads but the main focus early on captures the manor in which structurally diverse substances are selected from the overall PFAS Landscape using the MaxMin approach as implemented in RDKit. A couple of bespoke functions were created to more systematically apply the approach to all terminal categories that comprised more than 5 members.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import openpyxl
from rdkit.Chem.Draw import IPythonConsole, MolsToGridImage

#Show mols in dataframes
from rdkit.Chem import PandasTools
from rdkit import Chem
from rdkit.Chem.Draw import MolsToGridImage
from IPython.core.display import HTML
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import DataStructs
import os
from scipy.spatial.distance import pdist, squareform
import sys

In [2]:
TOP = os.getcwd().replace('notebooks', '')
raw_dir = TOP + 'data/raw/'
interim_dir = TOP + 'data/interim/'
external_dir = TOP + 'data/external/'
figures_dir = TOP + 'reports/figures/'

In [3]:
LIB = TOP+'src/models/'
if not LIB in sys.path: 
    sys.path.insert(0,LIB)

In [4]:
from model_functions import *

In [5]:
df = pd.read_excel(interim_dir+'mediod_new_universe_061123.xlsx', index_col = [0])

In [6]:
df.head()

Unnamed: 0,dtxsid,DSSTox_QC-Level,Substance_Name,Substance_CASRN,Substance_Type,smiles,Structure_InChI,Structure_InChIKey,Structure_Formula,Structure_MolWt,...,prelim_final_centroid,prelim_final_dist,level3_cluster_centroid,lvl_cluster_d,lvl3_centroid,lvl3_centroid2,lvl3_analogues,lvl3_Dist,final_centroid,final_dist
0,DTXSID6071908,DSSTox_High,N-Ethyl-N-((pentadecafluoroheptyl)sulphonyl)gl...,68957-63-1,Single Compound,CCN(CC(O)=O)S(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)...,"InChI=1S/C11H8F15NO4S/c1-2-27(3-4(28)29)32(30,...",IOQHTIQMCISFKB-UHFFFAOYSA-N,C11H8F15NO4S,535.22,...,DTXSID6071908,0.0,,,,,,,DTXSID6071908,0.0
1,DTXSID8071358,DSSTox_High,N-Methyl-perfluoro-1-heptanesulfonamide,68259-14-3,Single Compound,CNS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)...,"InChI=1S/C8H4F15NO2S/c1-24-27(25,26)8(22,23)6(...",KDHCALLFPWZTPN-UHFFFAOYSA-N,C8H4F15NO2S,463.16,...,DTXSID6071908,0.64,,,,,,,DTXSID6071908,0.64
2,DTXSID1071907,DSSTox_High,N-Ethylpentadecafluoro-1-heptanesulfonamide,68957-62-0,Single Compound,CCNS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F...,"InChI=1S/C9H6F15NO2S/c1-2-25-28(26,27)9(23,24)...",WMOMXEHEPXLIAV-UHFFFAOYSA-N,C9H6F15NO2S,477.19,...,DTXSID6071908,0.615385,,,,,,,DTXSID6071908,0.615385
3,DTXSID001026645,DSSTox_Low,"1,1,2,2,3,3,4,4,5,5,6,6,7,7,7-Pentadecafluoro-...",167398-54-1,Single Compound,OCCNS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(...,"InChI=1S/C9H6F15NO3S/c10-3(11,4(12,13)6(16,17)...",LJJUSVBJKZXMCP-UHFFFAOYSA-N,C9H6F15NO3S,493.19,...,DTXSID6071908,0.660714,,,,,,,DTXSID6071908,0.660714
4,DTXSID301026644,DSSTox_Low,(Perfluoroheptyl)(sulfonylamino)acetic acid,1003194-00-0,Single Compound,OC(=O)CNS(=O)(=O)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C...,"InChI=1S/C9H4F15NO4S/c10-3(11,4(12,13)6(16,17)...",LAAXCIZXZCKWFF-UHFFFAOYSA-N,C9H4F15NO4S,507.17,...,DTXSID6071908,0.592593,,,,,,,DTXSID6071908,0.592593


In [7]:
df.columns

Index(['dtxsid', 'DSSTox_QC-Level', 'Substance_Name', 'Substance_CASRN',
       'Substance_Type', 'smiles', 'Structure_InChI', 'Structure_InChIKey',
       'Structure_Formula', 'Structure_MolWt', 'Structure_SMILES_2D-QSAR',
       'Canonical_QSARr', 'TSCA_STATUS', 'chain_length', 'category',
       'MolWeight', 'LogP_pred', 'LogP_predRange', 'AD_LogP', 'AD_index_LogP',
       'Conf_index_LogP', 'MP_pred', 'MP_predRange', 'AD_MP', 'AD_index_MP',
       'Conf_index_MP', 'BP_pred', 'BP_predRange', 'AD_BP', 'AD_index_BP',
       'Conf_index_BP', 'LogVP_pred', 'VP_predRange', 'AD_VP', 'AD_index_VP',
       'Conf_index_VP', 'LogWS_pred', 'WS_predRange', 'AD_WS', 'AD_index_WS',
       'Conf_index_WS', 'LogHL_pred', 'HL_predRange', 'AD_HL', 'AD_index_HL',
       'Conf_index_HL', 'ID', 'tsca_match', 'subcategory', 'cluster',
       'cluster_centroid', 'centroid_x', 'centroid2', 'centroid_y',
       'analogues', 'Dist', 'level2_cluster_centroid', 'cluster_d',
       'lvl2_centroid', 'lvl2_centro

In [8]:
df.final_centroid.nunique()

90

In [9]:
df.final_centroid.unique().tolist()[0:5]

['DTXSID6071908',
 'DTXSID0071889',
 'DTXSID50387373',
 'DTXSID20440585',
 'DTXSID90558000']

In [10]:
df['group'] = df[['category','subcategory', 'cluster_d',  'lvl_cluster_d']].apply(tuple, axis = 1)
df['group_str']=[str(e) for e in df['group'] ]

In [12]:
#df[df['final_centroid'] == 'DTXSID6071908']['group_str']

In [13]:
cats = df['group_str'].unique().tolist()

In [14]:
def content_max(df,  k = None):
    content = {}
    tdf = df[df['group_str'] == k]
    content['t_dict'] = {k:Chem.MolFromSmiles(v) for k,v in zip(tdf['dtxsid'], tdf['smiles'])}
    content['fp_dict'] = [AllChem.GetMorganFingerprintAsBitVect(i, 3, 1024) for i in content['t_dict'].values()]
    content['nfp_dict'] = len(content['fp_dict'])
    content['t_index'] = {k:v for k,v in enumerate(content['t_dict'].keys())} 
    content['k'] = k
    
    return content

In [15]:
lte_summaries = []
for k in cats:
    try:
        lte_summaries.append(content_max(df, k))
    except Exception as error:
        print(k)



In [16]:
lte_summaries[0].keys()

dict_keys(['t_dict', 'fp_dict', 'nfp_dict', 't_index', 'k'])

In [17]:
from rdkit.SimDivFilters import MaxMinPicker

In [18]:
def picker_func(fps, n1, start_with  = 1, n2 = 3, label = None, fps_index = None):
    picker_dict = {}
    mmp = MaxMinPicker()
    picker_dict['label'] = label
    picker_dict['indices'] = mmp.LazyBitVectorPick(fps,n1, start_with+n2, [0])
    picker_dict['diverse_chems'] = [fps_index[x] for x in picker_dict['indices']]
    return picker_dict

In [19]:
picker_lst = []
for i in range(len(lte_summaries)):
    if lte_summaries[i]['nfp_dict'] > 5:
        picker_lst.append(picker_func(lte_summaries[i]['fp_dict'], n1 =lte_summaries[i]['nfp_dict'], n2 = 3, label = lte_summaries[i]['k'], fps_index = lte_summaries[i]['t_index']  ))


In [20]:
mylst2 = []
for i in range(len(picker_lst)):
    mylst2.append(picker_lst[i]['diverse_chems'])


In [21]:
mylst2 = [e for a in mylst2 for e in a]

In [22]:
len(mylst2)

344

In [23]:
picker_lst[0]

{'label': "('FASA based PFAA precursors', 'gte7', nan, nan)",
 'indices': <rdkit.rdBase._vecti at 0x7f4c2a79b4b0>,
 'diverse_chems': ['DTXSID6071908',
  'DTXSID40226388',
  'DTXSID1071080',
  'DTXSID901348359']}

In [24]:
#picker_lst

Which categories had a MaxMin approach applied?

In [24]:
pick_cats = [picker_lst[i]['label'] for i in range(len(picker_lst))]

In [25]:
len(pick_cats)

86

Which did not and how large were they?

In [26]:
[e for e in cats if e not in pick_cats]

["('PFAA precursors', 'lt7', 1.0, 2.0)",
 "('PFAA precursors', 'lt7', 1.0, 3.0)",
 "('Unable to open ring(s)', 'lt7', nan, nan)",
 "('unclassified', 'gte7', 3.0, 1.0)"]

In [27]:
cat_dict = {k:len(v) for k,v in df.groupby('group_str')}

In [28]:
{e:cat_dict[e] for e in cat_dict.keys() if e in [e for e in cats if e not in pick_cats]}

{"('PFAA precursors', 'lt7', 1.0, 2.0)": 4,
 "('PFAA precursors', 'lt7', 1.0, 3.0)": 5,
 "('Unable to open ring(s)', 'lt7', nan, nan)": 2,
 "('unclassified', 'gte7', 3.0, 1.0)": 5}

In [29]:
df['MaxMin_picks'] = df['dtxsid'].apply(lambda x: 1 if x in mylst2 else 0)

In [30]:
df.MaxMin_picks.value_counts()

0    15070
1      344
Name: MaxMin_picks, dtype: int64

In [31]:
df.groupby('group_str').agg({'MaxMin_picks' : 'sum'})

Unnamed: 0_level_0,MaxMin_picks
group_str,Unnamed: 1_level_1
"('FASA based PFAA precursors', 'gte7', nan, nan)",4
"('FASA based PFAA precursors', 'lt7', nan, nan)",4
"('Fluorotelomer PFAA precursors', 'gte7', nan, nan)",4
"('Fluorotelomer PFAA precursors', 'lt7', 1.0, nan)",4
"('Fluorotelomer PFAA precursors', 'lt7', 2.0, 1.0)",4
...,...
"('unclassified', 'lt7', 2.0, 3.0)",4
"('unclassified', 'lt7', 3.0, 1.0)",4
"('unclassified', 'lt7', 3.0, 2.0)",4
"('unclassified', 'lt7', 3.0, 3.0)",4


In [32]:
writer = pd.ExcelWriter(interim_dir+'final_universe_wMaxMin_061123.xlsx', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.

df.to_excel(writer)


writer.save()

In [33]:
df.MaxMin_picks.value_counts()

0    15070
1      344
Name: MaxMin_picks, dtype: int64