#### Considering additional chemicals beyond the centroids 

Created by: Grace Patlewicz <br>
Last modified: 10th May 2024<br>
Change: Updated the MaxMin calculations for the April 2024 inventory.

This notebook captures several threads but the main focus early on captures the manor in which structurally diverse substances are selected from the overall PFAS Landscape using the MaxMin approach as implemented in RDKit. A couple of bespoke functions were created to more systematically apply the approach to all terminal categories that comprised more than 5 members.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import openpyxl
from rdkit.Chem.Draw import IPythonConsole, MolsToGridImage

#Show mols in dataframes
from rdkit.Chem import PandasTools
from rdkit import Chem
from rdkit.Chem.Draw import MolsToGridImage
from IPython.core.display import HTML
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import DataStructs
import os
from scipy.spatial.distance import pdist, squareform
import sys

In [2]:
TOP = os.getcwd().replace('notebooks', '')
raw_dir = TOP + 'data/raw/'
interim_dir = TOP + 'data/interim/'
external_dir = TOP + 'data/external/'
figures_dir = TOP + 'reports/figures/'

In [3]:
LIB = TOP+'src/models/'
if not LIB in sys.path: 
    sys.path.insert(0,LIB)

In [4]:
from model_functions import *

In [5]:
df = pd.read_excel(interim_dir+'mediod_new_universe_090524.xlsx', index_col = [0])

In [6]:
df.head()

Unnamed: 0,dtxsid,DSSTox_QC-Level,Substance_Name,Substance_CASRN,Substance_Type,Substance_Note,smiles,Structure_InChI,Structure_InChIKey,Structure_Formula,...,level3_cluster_centroid,lvl_cluster_d,lvl3_centroid,lvl3_centroid2,lvl3_analogues,lvl3_Dist,final_centroid,final_dist,group,group_str
0,IROQAHVXXUQBOS-UHFFFAOYSA-N,,,,,,CCCN(CCNC(=O)c1ccc(Cc2ccc(C(O)=O)cc2)cc1)S(=O)...,,,,...,,,,,,,IROQAHVXXUQBOS-UHFFFAOYSA-N,0.0,"('Aromatic PFASs', 'gte7', nan, nan)","('Aromatic PFASs', 'gte7', nan, nan)"
1,DTXSID90897582,DSSTox_High,4-(Perfluorononyl)oxybenzenesulfonate sodium,91998-13-9,Single Compound,,[Na+].[O-]S(=O)(=O)C1=CC=C(OC(F)(F)C(F)(F)C(F)...,"InChI=1S/C15H5F19O4S.Na/c16-7(17,8(18,19)10(22...",WEKFOCTWTWPVEG-UHFFFAOYSA-M,C15H4F19NaO4S,...,,,,,,,IROQAHVXXUQBOS-UHFFFAOYSA-N,0.788889,"('Aromatic PFASs', 'gte7', nan, nan)","('Aromatic PFASs', 'gte7', nan, nan)"
2,DTXSID90896257,Public_Low,(2S)-2-{Bis[4-(heptadecafluorooctyl)phenyl][(t...,914391-37-0,Single Compound,,CC[Si](CC)(CC)OC([C@@H]1CCCN1)(C1=CC=C(C=C1)C(...,"InChI=1S/C39H31F34NOSi/c1-4-76(5-2,6-3)75-23(2...",PARYPNCCAOUHML-QFIPXVFZSA-N,C39H31F34NOSi,...,,,,,,,IROQAHVXXUQBOS-UHFFFAOYSA-N,0.814815,"('Aromatic PFASs', 'gte7', nan, nan)","('Aromatic PFASs', 'gte7', nan, nan)"
3,DTXSID90896196,Public_Low,"2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,9-Heptadecaflu...",157584-47-9,Single Compound,,FC(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F...,"InChI=1S/C21H10F17NO/c22-14(23,13(40)39(11-7-3...",IDJHICNQDCLWAB-UHFFFAOYSA-N,C21H10F17NO,...,,,,,,,IROQAHVXXUQBOS-UHFFFAOYSA-N,0.788889,"('Aromatic PFASs', 'gte7', nan, nan)","('Aromatic PFASs', 'gte7', nan, nan)"
4,DTXSID90896095,Public_Low,"3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,10-Heptadeca...",113823-56-6,Single Compound,,CC1=CC=C(C=C1)S(=O)(=O)OCCC(F)(F)C(F)(F)C(F)(F...,"InChI=1S/C17H11F17O3S/c1-8-2-4-9(5-3-8)38(35,3...",WGCSEECKLMOBLT-UHFFFAOYSA-N,C17H11F17O3S,...,,,,,,,IROQAHVXXUQBOS-UHFFFAOYSA-N,0.791667,"('Aromatic PFASs', 'gte7', nan, nan)","('Aromatic PFASs', 'gte7', nan, nan)"


In [7]:
df.columns

Index(['dtxsid', 'DSSTox_QC-Level', 'Substance_Name', 'Substance_CASRN',
       'Substance_Type', 'Substance_Note', 'smiles', 'Structure_InChI',
       'Structure_InChIKey', 'Structure_Formula',
       ...
       'level3_cluster_centroid', 'lvl_cluster_d', 'lvl3_centroid',
       'lvl3_centroid2', 'lvl3_analogues', 'lvl3_Dist', 'final_centroid',
       'final_dist', 'group', 'group_str'],
      dtype='object', length=194)

In [8]:
df.final_centroid.nunique()

128

In [9]:
df.final_centroid.unique().tolist()[0:5]

['IROQAHVXXUQBOS-UHFFFAOYSA-N',
 'DTXSID80382098',
 'DTXSID50895206',
 'DTXSID60448262',
 'SMKKJMHYQXYWND-UHFFFAOYSA-N']

In [10]:
#df['group'] = df[['category','subcategory', 'cluster_d',  'lvl_cluster_d']].apply(tuple, axis = 1)
#df['group_str']=[str(e) for e in df['group'] ]

In [11]:
#df[df['final_centroid'] == 'DTXSID6071908']['group_str']

In [12]:
cats = df['group_str'].unique().tolist()

In [13]:
def content_max(df,  k = None):
    content = {}
    tdf = df[df['group_str'] == k]
    content['t_dict'] = {k:Chem.MolFromSmiles(v) for k,v in zip(tdf['dtxsid'], tdf['smiles'])}
    content['fp_dict'] = [AllChem.GetMorganFingerprintAsBitVect(i, 3, 1024) for i in content['t_dict'].values()]
    content['nfp_dict'] = len(content['fp_dict'])
    content['t_index'] = {k:v for k,v in enumerate(content['t_dict'].keys())} 
    content['k'] = k
    
    return content

In [14]:
lte_summaries = []
for k in cats:
    try:
        lte_summaries.append(content_max(df, k))
    except Exception as error:
        print(k)

In [15]:
lte_summaries[0].keys()

dict_keys(['t_dict', 'fp_dict', 'nfp_dict', 't_index', 'k'])

In [16]:
from rdkit.SimDivFilters import MaxMinPicker

In [17]:
def picker_func(fps, n1, start_with  = 1, n2 = 3, label = None, fps_index = None):
    picker_dict = {}
    mmp = MaxMinPicker()
    picker_dict['label'] = label
    picker_dict['indices'] = mmp.LazyBitVectorPick(fps,n1, start_with+n2, [0])
    picker_dict['diverse_chems'] = [fps_index[x] for x in picker_dict['indices']]
    return picker_dict

In [18]:
picker_lst = []
for i in range(len(lte_summaries)):
    if lte_summaries[i]['nfp_dict'] > 5:
        picker_lst.append(picker_func(lte_summaries[i]['fp_dict'], n1 =lte_summaries[i]['nfp_dict'], n2 = 3, label = lte_summaries[i]['k'], fps_index = lte_summaries[i]['t_index']  ))


In [19]:
mylst2 = []
for i in range(len(picker_lst)):
    mylst2.append(picker_lst[i]['diverse_chems'])


In [20]:
mylst2 = [e for a in mylst2 for e in a]

In [21]:
len(mylst2)

484

In [22]:
picker_lst[0]

{'label': "('Aromatic PFASs', 'gte7', nan, nan)",
 'indices': <rdkit.rdBase._vecti at 0x767a6ad35570>,
 'diverse_chems': ['IROQAHVXXUQBOS-UHFFFAOYSA-N',
  'DTXSID701026741',
  'DTXSID30290775',
  'DTXSID301041127']}

In [23]:
#picker_lst

Which categories had a MaxMin approach applied?

In [24]:
pick_cats = [picker_lst[i]['label'] for i in range(len(picker_lst))]

In [25]:
len(pick_cats)

121

Which did not and how large were they?

In [26]:
[e for e in cats if e not in pick_cats]

["('Other PFASs, cyclic', 'gte7', 1.0, nan)",
 "('Other PFASs, cyclic', 'gte7', 2.0, nan)",
 "('Other PFASs, cyclic', 'gte7', 3.0, nan)",
 "('PFAAs, cyclic', 'gte7', nan, nan)",
 "('PFAAs, cyclic', 'lt7', 3.0, 1.0)",
 "('unclassified', 'lt7', 2.0, 1.0)",
 "('unclassified', 'lt7', 2.0, 2.0)"]

In [27]:
cat_dict = {k:len(v) for k,v in df.groupby('group_str')}

In [28]:
{e:cat_dict[e] for e in cat_dict.keys() if e in [e for e in cats if e not in pick_cats]}

{"('Other PFASs, cyclic', 'gte7', 1.0, nan)": 2,
 "('Other PFASs, cyclic', 'gte7', 2.0, nan)": 4,
 "('Other PFASs, cyclic', 'gte7', 3.0, nan)": 2,
 "('PFAAs, cyclic', 'gte7', nan, nan)": 1,
 "('PFAAs, cyclic', 'lt7', 3.0, 1.0)": 4,
 "('unclassified', 'lt7', 2.0, 1.0)": 5,
 "('unclassified', 'lt7', 2.0, 2.0)": 2}

In [29]:
df['MaxMin_picks'] = df['dtxsid'].apply(lambda x: 1 if x in mylst2 else 0)

In [30]:
df.MaxMin_picks.value_counts()

0    15041
1      484
Name: MaxMin_picks, dtype: int64

In [31]:
df.groupby('group_str').agg({'MaxMin_picks' : 'sum'})

Unnamed: 0_level_0,MaxMin_picks
group_str,Unnamed: 1_level_1
"('Aromatic PFASs', 'gte7', nan, nan)",4
"('Aromatic PFASs', 'lt7', 1.0, nan)",4
"('Aromatic PFASs', 'lt7', 2.0, 1.0)",4
"('Aromatic PFASs', 'lt7', 2.0, 2.0)",4
"('Aromatic PFASs', 'lt7', 2.0, 3.0)",4
...,...
"('unclassified', 'lt7', 2.0, 3.0)",4
"('unclassified', 'lt7', 3.0, 1.0)",4
"('unclassified', 'lt7', 3.0, 2.0)",4
"('unclassified', 'lt7', 3.0, 3.0)",4


In [32]:
writer = pd.ExcelWriter(interim_dir+'final_universe_wMaxMin_100524.xlsx', engine='xlsxwriter')

# Convert the dataframe to an XlsxWriter Excel object.

df.to_excel(writer)


writer.save()

In [33]:
df.MaxMin_picks.value_counts()

0    15041
1      484
Name: MaxMin_picks, dtype: int64