## Are there specific features that are overrepresented in the Others and unclassified primary categories.

Prepared by: Grace Patlewicz <br>
Created on: 6th November 2023 <br>
Last modified on: 10th May 2024
Investigating whether there are any specific features that ought to be teased out for the very broad Aromatics and unclassified categories. Note these broad categories pertain to the PFAS-Atlas categorisations performed on the updated April 2024 inventory.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import openpyxl
from rdkit.Chem.Draw import IPythonConsole, MolsToGridImage

#Show mols in dataframes
from rdkit.Chem import PandasTools
from rdkit import Chem
from rdkit.Chem.Draw import MolsToGridImage
from IPython.core.display import HTML
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import DataStructs
import os
from scipy.spatial.distance import pdist, squareform


In [2]:
TOP = os.getcwd().replace('notebooks', '')
raw_dir = TOP + 'data/raw/'
interim_dir = TOP + 'data/interim/'
external_dir = TOP + 'data/external/'
processed_dir = TOP + 'data/processed/'
figures_dir = TOP + 'reports/figures/'

In [3]:
df = pd.read_excel(interim_dir+'mediod_new_universe_090524.xlsx' ,  index_col = [0])

In [4]:
pfas_txps = pd.read_csv(interim_dir+'output_new_pfas__PFASTxP_april.txt', sep = ';')
pfas_txps = pfas_txps[pfas_txps['M_CORINA_SYMPHONY_ERRORS_[STRING]'] == 'No errors']
pfas_txps = pfas_txps.iloc[:,:-2]

In [5]:
pfas_txps.shape

(15602, 130)

In [6]:
pfas_txps

Unnamed: 0,M_NAME,pfas_atom:element_metal_metalloid_CF,pfas_bond:aromatic_FCc1c,pfas_bond:C#N_nitrile_generic_CF,pfas_bond:C(=O)N_carboxamide_(NHR)_C(=O)CF,pfas_bond:C(=O)N_carboxamide_(NR2)_C(=O)CF,pfas_bond:C(=O)N_carboxamide_generic_C(=O)CF,pfas_bond:C(=O)O_carboxylicAcid_alkenyl_CF,pfas_bond:C(=O)O_carboxylicAcid_generic_CF,pfas_bond:C(=O)O_carboxylicEster_acyclic_C(=O)CF,...,pfas_chain:polyF_nocap_CFCHFCF,pfas_ring:aromatic_benzene_CF,pfas_ring:fused_[5_6]_indane_F,pfas_ring:generic_CF,pfas_ring:hetero_[3]_O_epoxide_CF,pfas_ring:hetero_[3]_Z_generic_F,pfas_ring:hetero_[5]_Z_1-Z_CF,pfas_ring:hetero_[5]_Z_1-Z_F,pfas_ring:hetero_[6]_Z_1-_CF,pfas_ring:hetero_[6]_Z_1-_F
0,DTXSID90998543,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
1,DTXSID90996549,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,DTXSID90991670,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,DTXSID90990360,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,DTXSID90987417,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15607,ZYYKXQFFIHPGPH-UHFFFAOYSA-N,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15608,ZZKOWTIHKUWUOI-UHFFFAOYSA-N,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15609,ZZWKTJZMSCLBFA-UHFFFAOYSA-N,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15610,ZZXFGOSSISMSPC-UHFFFAOYSA-N,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [7]:
pfas_txps_wcats = pd.merge(df[['dtxsid', 'category']], pfas_txps, left_on = 'dtxsid', right_on = 'M_NAME', how = 'left')

In [8]:
pfas_txps_wcats.drop(['M_NAME'], axis = 1, inplace = True)

In [9]:
pfas_txps_wcats.shape

(15525, 131)

In [10]:
pfas_txps_wcats['pfas_atom:element_metal_metalloid_CF'].isnull().sum()

0

In [11]:
pfas_txps_wcats.category.nunique()

18

In [12]:
pfas_txps_wcats1 = pfas_txps_wcats.set_index('dtxsid')


In [13]:
df2 = pd.concat([pd.get_dummies(pfas_txps_wcats1['category']), pfas_txps_wcats1], axis = 1)

In [14]:
#df2.columns.tolist()

In [15]:
df2.drop(['category'], axis = 1, inplace = True)

In [19]:
df2.columns.tolist().index

df2.columns.tolist().index("unclassified")

17

In [26]:
#df2.iloc[:,18:]

In [32]:
import scipy.stats as stats

In [33]:
def create_enrich(df):
    res = []
    for e in df.columns[:18]:
        for i, column in df.iloc[:,18:].items():
            #print(i,column)
            if pd.crosstab(df.loc[:,e], column).shape == (2,2):
                a = pd.crosstab(df[e],column)
                #print(a)
                tn, fp, fn, tp = a.iloc[0,0], a.iloc[0,1], a.iloc[1,0], a.iloc[1,1]
                tested_active = (tp + fp)
                all_tested = tp+fn+tn+fp
                oddsratio, pvalue = stats.fisher_exact(a,alternative='greater')
                res.append([e, i, oddsratio, pvalue, tp, fn, fp, tn, tested_active, all_tested])
    return res

In [43]:
pd.crosstab(df2['Aromatic PFASs'], df2['pfas_ring:aromatic_benzene_CF'])

pfas_ring:aromatic_benzene_CF,0,1
Aromatic PFASs,Unnamed: 1_level_1,Unnamed: 2_level_1
0,11504,7
1,3373,641


In [35]:
def convert(lst):
    df = pd.DataFrame(lst)
    df.rename(columns = {0: 'Category', 1: 'TxP', 2: 'Odds Ratio', 3 : 'P-Value', 4: 'TP', 5: 'FN', 6: 'FP', 7: 'TN', 8: 'Tested Active', 9: 'All Tested'}, inplace = True)
    df = df.sort_values(by=['Odds Ratio','P-Value'], ascending = [True, False])
    return df

In [36]:
def final_df(df):
    return df[(df['Odds Ratio'] >= 3) & (df['P-Value'] < 0.05 ) & (df['TP'] >=3)]

In [37]:
txp_enrich = create_enrich(df2)

In [39]:
txp_enrich = convert(txp_enrich)

In [41]:
txp_enrich[txp_enrich['Category'] == 'Aromatic PFASs'].sort_values(by=['Odds Ratio','P-Value'], ascending = [False, False])

Unnamed: 0,Category,TxP,Odds Ratio,P-Value,TP,FN,FP,TN,Tested Active,All Tested
121,Aromatic PFASs,pfas_ring:fused_[5_6]_indane_F,inf,4.641993e-26,43,3971,0,11511,43,15525
1,Aromatic PFASs,pfas_bond:aromatic_FCc1c,499.563822,0.000000e+00,1035,2979,8,11503,1043,15525
120,Aromatic PFASs,pfas_ring:aromatic_benzene_CF,312.314769,0.000000e+00,641,3373,7,11504,648,15525
47,Aromatic PFASs,pfas_bond:S(=O)O_sulfonicEster_acyclic_S-C_(ch...,9.418550,8.231541e-84,257,3757,83,11428,340,15525
127,Aromatic PFASs,pfas_ring:hetero_[6]_Z_1-_CF,7.207814,8.244886e-19,62,3952,25,11486,87,15525
...,...,...,...,...,...,...,...,...,...,...
44,Aromatic PFASs,pfas_bond:quatN_alkyl_acyclic_CF,0.000000,1.000000e+00,0,4014,57,11454,57,15525
68,Aromatic PFASs,pfas_chain:alkyne_ethyne_generic_F,0.000000,1.000000e+00,0,4014,2,11509,2,15525
81,Aromatic PFASs,pfas_chain:FT_n2_OP,0.000000,1.000000e+00,0,4014,108,11403,108,15525
84,Aromatic PFASs,pfas_chain:FT_n2_X,0.000000,1.000000e+00,0,4014,48,11463,48,15525


In [44]:
txp_enrich1 = final_df(txp_enrich)
txp_enrich1

Unnamed: 0,Category,TxP,Odds Ratio,P-Value,TP,FN,FP,TN,Tested Active,All Tested
2203,unclassified,pfas_bond:C~Z_CF2CF2-Z,3.013647e+00,3.466725e-30,258,191,4666,10410,4924,15525
925,PFAAs,pfas_bond:COC_diether_FCOC(F)C(F)C(F)OC,3.030862e+00,2.152663e-02,6,852,34,14633,40,15525
790,"PFAA precursors, cyclic",pfas_bond:CC(=O)C_ketone_alkene_generic_CF,3.065224e+00,3.038693e-02,5,519,47,14954,52,15525
791,"PFAA precursors, cyclic",pfas_bond:CC(=O)C_ketone_generic_CF,3.074132e+00,6.769889e-11,52,472,519,14482,571,15525
1230,PolyFCA derivatives,pfas_chain:FT_n1_C=O,3.075636e+00,2.843564e-05,22,1377,73,14053,95,15525
...,...,...,...,...,...,...,...,...,...,...
565,PASF-based substances,pfas_bond:S~N_generic_CF,1.050496e+03,0.000000e+00,1432,35,527,13531,1959,15525
526,PASF-based substances,pfas_bond:C~Z_CF2CF2-Z,1.494747e+03,0.000000e+00,1464,3,3460,10598,4924,15525
1760,Si PFASs,pfas_chain:FT_n2_Si,1.555694e+03,0.000000e+00,230,186,12,15097,242,15525
566,PASF-based substances,pfas_bond:S=O_sulfonyl_generic_CF,1.106228e+04,0.000000e+00,1466,1,1645,12413,3111,15525


In [45]:
txp_enrich1[txp_enrich1['Category'] == 'unclassified'].sort_values(by=['Odds Ratio','P-Value'], ascending = [False, False])

Unnamed: 0,Category,TxP,Odds Ratio,P-Value,TP,FN,FP,TN,Tested Active,All Tested
2239,unclassified,pfas_bond:S(=O)O_sulfonicAcid_acyclic_(chain)_SCF,10.810426,2.534492e-60,106,343,419,14657,525,15525
2193,unclassified,pfas_atom:element_metal_metalloid_CF,6.993836,2.838075e-07,13,436,64,15012,77,15525
2266,unclassified,pfas_chain:FT_n1_OP,5.668929,0.0004851772,7,442,42,15034,49,15525
2243,unclassified,pfas_bond:S=O_sulfonyl_generic_CF,4.322675,1.2864139999999999e-48,227,222,2884,12192,3111,15525
2274,unclassified,pfas_chain:FT_n2_OP,4.284897,6.687069e-05,12,437,96,14980,108,15525
2208,unclassified,pfas_bond:CC(=O)C_dione_(1_3-)_CF,3.758477,6.421885e-05,14,435,128,14948,142,15525
2297,unclassified,pfas_chain:perF-linear_cap_C6_excl_mod,3.328462,6.045905e-21,104,345,1252,13824,1356,15525
2203,unclassified,pfas_bond:C~Z_CF2CF2-Z,3.013647,3.466725e-30,258,191,4666,10410,4924,15525


In [46]:
txp_enrich1[txp_enrich1['Category'] == 'Aromatic PFAAs'].sort_values(by=['Odds Ratio','P-Value'], ascending = [False, False])

Unnamed: 0,Category,TxP,Odds Ratio,P-Value,TP,FN,FP,TN,Tested Active,All Tested


In [48]:

txp_enrich1[txp_enrich1['Category'] == 'PFAA precursors'].sort_values(by=['Odds Ratio','P-Value'], ascending = [False, False])

Unnamed: 0,Category,TxP,Odds Ratio,P-Value,TP,FN,FP,TN,Tested Active,All Tested
675,PFAA precursors,pfas_bond:COH_alcohol_pri-alkyl_CF,30.204442,3.069639e-107,146,1327,51,14001,197,15525
710,PFAA precursors,pfas_chain:alkeneLinear_mono-ene_ethylene_gene...,9.170233,4.0095100000000006e-159,366,1107,489,13563,855,15525
674,PFAA precursors,pfas_bond:COH_alcohol_generic_OCCF,6.38701,2.5546319999999997e-77,219,1254,374,13678,593,15525
662,PFAA precursors,pfas_bond:CC(=O)C_ketone_generic_CF,6.148586,2.965218e-71,207,1266,364,13688,571,15525
660,PFAA precursors,pfas_bond:CC(=O)C_dione_(1_3-)_CF,5.855665,2.2208589999999998e-19,53,1420,89,13963,142,15525
661,PFAA precursors,pfas_bond:CC(=O)C_ketone_alkene_generic_CF,5.551269,1.176965e-07,19,1454,33,14019,52,15525
657,PFAA precursors,pfas_bond:C=O_acyl_halide_CF,5.107797,2.5834860000000002e-23,73,1400,142,13910,215,15525
711,PFAA precursors,pfas_chain:alkeneLinear_mono-ene_ethylene_gene...,4.299711,2.191749e-51,206,1267,512,13540,718,15525
677,PFAA precursors,pfas_bond:COH_alcohol_ter-alkyl_OC(CF)(CF)C,3.827403,0.01124052,6,1467,15,14037,21,15525
672,PFAA precursors,pfas_bond:COC_ether_alkenyl_OCF,3.731432,1.744419e-10,41,1432,107,13945,148,15525


In [47]:
txp_enrich1.to_csv(external_dir+'PFAS_TxP_enrichments_primary_categories_100524.csv')