## Are there specific features that are overrepresented in the 'Other aliphatics' and 'unclassified' primary categories?

Prepared by: Grace Patlewicz <br>
Created on: 6th November 2023 <br>
This notebook investigates whether there are any specific features that ought to be teased out for the very broad 'Other aliphatics' and 'unclassified' categories.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import openpyxl
from rdkit.Chem.Draw import IPythonConsole, MolsToGridImage

#Show mols in dataframes
from rdkit.Chem import PandasTools
from rdkit import Chem
from rdkit.Chem.Draw import MolsToGridImage
from IPython.core.display import HTML
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import DataStructs
import os
from scipy.spatial.distance import pdist, squareform


In [2]:
# Project root: the current working directory with every occurrence of the
# substring 'notebooks' removed.
# NOTE(review): presumably the notebook is launched from a `notebooks/` folder
# inside the project, so that TOP ends with a path separator - confirm.
TOP = os.getcwd().replace('notebooks', '')

# Data / report folders, all expressed as plain strings with a trailing slash
# so that file names can be appended with `+`.
raw_dir, interim_dir, external_dir, processed_dir, figures_dir = (
    TOP + subdir
    for subdir in (
        'data/raw/',
        'data/interim/',
        'data/external/',
        'data/processed/',
        'reports/figures/',
    )
)

In [3]:
# Load the categorisation workbook from the interim data folder, using the
# first spreadsheet column as the DataFrame index.
df = pd.read_excel(
    interim_dir + 'secondary_categorisations_new_universe_061123.xlsx',
    index_col=[0],
)

In [7]:
# Read the semicolon-separated fingerprint export, keep only the rows whose
# error column reports 'No errors', and discard the last two columns.
pfas_txps = (
    pd.read_csv(interim_dir + 'new_inv_4TxPs_PFAS_out.txt', sep=';')
    .loc[lambda frame: frame['M_CORINA_SYMPHONY_ERRORS_[STRING]'] == 'No errors']
    .iloc[:, :-2]
)

In [8]:
# Dimensions (rows, columns) of the fingerprint table after the error filter
# and the removal of the last two columns.
pfas_txps.shape

(15415, 130)

In [9]:
# Display the filtered fingerprint table: one identifier column (M_NAME)
# followed by the fingerprint columns.
pfas_txps

Unnamed: 0,M_NAME,pfas_atom:element_metal_metalloid_CF,pfas_bond:aromatic_FCc1c,pfas_bond:C#N_nitrile_generic_CF,pfas_bond:C(=O)N_carboxamide_(NHR)_C(=O)CF,pfas_bond:C(=O)N_carboxamide_(NR2)_C(=O)CF,pfas_bond:C(=O)N_carboxamide_generic_C(=O)CF,pfas_bond:C(=O)O_carboxylicAcid_alkenyl_CF,pfas_bond:C(=O)O_carboxylicAcid_generic_CF,pfas_bond:C(=O)O_carboxylicEster_acyclic_C(=O)CF,...,pfas_chain:polyF_nocap_CFCHFCF,pfas_ring:aromatic_benzene_CF,pfas_ring:fused_[5_6]_indane_F,pfas_ring:generic_CF,pfas_ring:hetero_[3]_O_epoxide_CF,pfas_ring:hetero_[3]_Z_generic_F,pfas_ring:hetero_[5]_Z_1-Z_CF,pfas_ring:hetero_[5]_Z_1-Z_F,pfas_ring:hetero_[6]_Z_1-_CF,pfas_ring:hetero_[6]_Z_1-_F
0,DTXSID70951853,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
1,DTXSID60883371,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,DTXSID10445908,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,DTXSID20635191,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,DTXSID00192272,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15410,ZZKOWTIHKUWUOI-UHFFFAOYSA-N,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15411,ZZQYDYODFHABLC-UHFFFAOYSA-N,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
15412,ZZWKTJZMSCLBFA-UHFFFAOYSA-N,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15413,ZZXFGOSSISMSPC-UHFFFAOYSA-N,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [10]:
# Attach the fingerprint columns to each (dtxsid, category) pair from `df`.
# A left join keeps every row of `df`, matching its 'dtxsid' against 'M_NAME'.
pfas_txps_wcats = df[['dtxsid', 'category']].merge(
    pfas_txps,
    how='left',
    left_on='dtxsid',
    right_on='M_NAME',
)

In [12]:
# Remove the join key brought in from the fingerprint table.
# Rebinding the name (rather than dropping in place) together with
# errors='ignore' keeps this cell idempotent: re-running it after 'M_NAME' has
# already been removed no longer raises a KeyError.
pfas_txps_wcats = pfas_txps_wcats.drop(columns=['M_NAME'], errors='ignore')

In [13]:
# Dimensions (rows, columns) of the merged category + fingerprint table.
pfas_txps_wcats.shape

(15414, 131)

In [14]:
# Count missing values in one fingerprint column. Because the merge was a left
# join, a non-zero count would indicate rows of `df` without a matching
# fingerprint record.
pfas_txps_wcats['pfas_atom:element_metal_metalloid_CF'].isnull().sum()

0

In [15]:
# Number of distinct primary categories. create_enrich treats the first 11
# columns of its input as category indicators, so this count is expected to
# match that number.
pfas_txps_wcats.category.nunique()

11

In [16]:
# Use the substance identifier as the row index of the merged table.
pfas_txps_wcats1 = pfas_txps_wcats.set_index(keys='dtxsid')


In [17]:
# One indicator column per category, placed in front of the merged table so
# that the category indicators occupy the leading columns.
df2 = pd.concat(
    [pd.get_dummies(pfas_txps_wcats1['category']), pfas_txps_wcats1],
    axis='columns',
)

In [21]:
#df2.columns.tolist()

In [22]:
# The original text column is redundant once the indicator columns exist.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps the cell
# idempotent: running it a second time no longer raises a KeyError.
df2 = df2.drop(columns=['category'], errors='ignore')

In [23]:
import scipy.stats as stats

In [49]:
def create_enrich(df, n_categories=11):
    """Run a one-sided Fisher exact test for every category / feature pair.

    The first ``n_categories`` columns of ``df`` are treated as binary category
    indicators and every remaining column as a binary feature indicator.

    Parameters
    ----------
    df : pandas.DataFrame
        Category indicator columns first, feature columns afterwards.
    n_categories : int, default 11
        Number of leading columns that are category indicators. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    list of list
        One row per tested pair:
        ``[category, feature, odds_ratio, p_value, tp, fn, fp, tn,
        tested_active, all_tested]``. Pairs whose contingency table is not
        2x2 (e.g. a feature that is constant) are skipped.
    """
    res = []
    category_cols = df.columns[:n_categories]
    feature_block = df.iloc[:, n_categories:]
    for e in category_cols:
        for i, column in feature_block.items():
            # Build the contingency table once and reuse it for both the shape
            # check and the test (rows: category value, columns: feature value).
            a = pd.crosstab(df[e], column)
            if a.shape != (2, 2):
                continue
            # Row 0 = not in category, row 1 = in category;
            # column 0 = feature absent, column 1 = feature present.
            tn, fp, fn, tp = a.iloc[0, 0], a.iloc[0, 1], a.iloc[1, 0], a.iloc[1, 1]
            tested_active = (tp + fp)
            all_tested = tp + fn + tn + fp
            # 'greater' tests for over-representation of the feature in the category.
            oddsratio, pvalue = stats.fisher_exact(a, alternative='greater')
            res.append([e, i, oddsratio, pvalue, tp, fn, fp, tn, tested_active, all_tested])
    return res

In [50]:
#pd.crosstab(df2['unclassified'], df2['pfas_chain:perF-linear_cap_C11_excl_mod'])

In [51]:
def convert(lst):
    """Turn the list of enrichment rows into a labelled, sorted DataFrame.

    Parameters
    ----------
    lst : list of list
        Rows of ten values in the order produced by ``create_enrich``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Category', 'TxP', 'Odds Ratio', 'P-Value', 'TP', 'FN', 'FP',
        'TN', 'Tested Active', 'All Tested', sorted by ascending odds ratio
        and, within ties, descending p-value. An empty input yields an empty
        frame that still carries these columns.
    """
    column_names = [
        'Category', 'TxP', 'Odds Ratio', 'P-Value',
        'TP', 'FN', 'FP', 'TN', 'Tested Active', 'All Tested',
    ]
    # Naming the columns at construction time (instead of renaming 0..9
    # afterwards) guarantees the sort keys exist even when `lst` is empty.
    df = pd.DataFrame(lst, columns=column_names)
    return df.sort_values(by=['Odds Ratio', 'P-Value'], ascending=[True, False])

In [52]:
def final_df(df, min_odds_ratio=3, max_p_value=0.05, min_tp=3):
    """Keep only the enrichment rows that pass all three thresholds.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Odds Ratio', 'P-Value' and 'TP'.
    min_odds_ratio : float, default 3
        Inclusive lower bound on 'Odds Ratio'.
    max_p_value : float, default 0.05
        Exclusive upper bound on 'P-Value'.
    min_tp : int, default 3
        Inclusive lower bound on 'TP'.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` satisfying every condition, in their original order.
        The defaults reproduce the previously hard-coded cut-offs.
    """
    keep = (
        (df['Odds Ratio'] >= min_odds_ratio)
        & (df['P-Value'] < max_p_value)
        & (df['TP'] >= min_tp)
    )
    return df[keep]

In [53]:
# Run the Fisher enrichment test for every category / fingerprint pair in df2;
# the result is a list with one row per tested pair.
txp_enrich = create_enrich(df2)

In [54]:
# Turn the list of result rows into a labelled, sorted DataFrame.
# Note that this rebinds `txp_enrich` from a list to a DataFrame.
txp_enrich = convert(txp_enrich)

In [55]:
# All tested fingerprints for the 'unclassified' category, strongest odds
# ratio first (ties ordered by descending p-value).
(
    txp_enrich
    .loc[txp_enrich['Category'] == 'unclassified']
    .sort_values(by=['Odds Ratio', 'P-Value'], ascending=[False, False])
)

Unnamed: 0,Category,TxP,Odds Ratio,P-Value,TP,FN,FP,TN,Tested Active,All Tested
1376,unclassified,pfas_chain:FT_n3_N,29.375695,1.140187e-15,17,769,11,14617,28,15414
1363,unclassified,pfas_chain:FT_n1_OP,7.561140,9.128208e-08,14,772,35,14593,49,15414
1336,unclassified,pfas_bond:S(=O)O_sulfonicAcid_acyclic_(chain)_SCF,6.018822,5.810384e-43,115,671,405,14223,520,15414
1332,unclassified,pfas_bond:PC_phosphorus_organo_generic_CF,5.407050,5.046931e-07,16,770,56,14572,72,15414
1388,unclassified,pfas_chain:perF-linear_cap_C11_excl_mod,4.107210,3.077364e-06,18,768,83,14545,101,15414
...,...,...,...,...,...,...,...,...,...,...
1411,unclassified,pfas_ring:fused_[5_6]_indane_F,0.000000,1.000000e+00,0,786,40,14588,40,15414
1413,unclassified,pfas_ring:hetero_[3]_O_epoxide_CF,0.000000,1.000000e+00,0,786,62,14566,62,15414
1415,unclassified,pfas_ring:hetero_[5]_Z_1-Z_CF,0.000000,1.000000e+00,0,786,76,14552,76,15414
1416,unclassified,pfas_ring:hetero_[5]_Z_1-Z_F,0.000000,1.000000e+00,0,786,78,14550,78,15414


In [56]:
# Keep only the enrichments that pass the final_df thresholds on odds ratio,
# p-value and TP count, then display them.
txp_enrich1 = final_df(txp_enrich)
txp_enrich1

Unnamed: 0,Category,TxP,Odds Ratio,P-Value,TP,FN,FP,TN,Tested Active,All Tested
907,Side-chain aromatics,pfas_bond:C(=O)N_carboxamide_(NR2)_C(=O)CF,3.016558,1.137739e-10,71,3723,73,11547,144,15414
608,PFAA precursors,pfas_chain:perF-branch_isopropyl_F,3.022827,2.261074e-05,22,123,853,14416,875,15414
1389,unclassified,pfas_chain:perF-linear_cap_C12_plus,3.025160,7.393521e-15,78,708,514,14114,592,15414
1399,unclassified,pfas_chain:perF-linear_cap_C9_plus,3.037849,4.417905e-26,152,634,1070,13558,1222,15414
606,PFAA precursors,pfas_chain:perF-branch_isopropyl,3.050931,6.485030e-06,25,120,976,14293,1001,15414
...,...,...,...,...,...,...,...,...,...,...
431,Other aliphatics,pfas_bond:quatN_alkyl_acyclic_CF,inf,2.097602e-15,56,8386,0,6972,56,15414
1024,Side-chain aromatics,pfas_ring:fused_[5_6]_indane_F,inf,3.799070e-25,40,3754,0,11620,40,15414
50,FASA based PFAA precursors,pfas_bond:S=O_sulfonyl_generic_CF,inf,1.384020e-144,204,0,2917,12293,3121,15414
49,FASA based PFAA precursors,pfas_bond:S~N_generic_CF,inf,4.860458e-187,204,0,1768,13442,1972,15414


In [57]:
# Enriched fingerprints for the 'unclassified' category, strongest odds ratio
# first (ties ordered by descending p-value).
(
    txp_enrich1
    .loc[txp_enrich1['Category'] == 'unclassified']
    .sort_values(by=['Odds Ratio', 'P-Value'], ascending=[False, False])
)

Unnamed: 0,Category,TxP,Odds Ratio,P-Value,TP,FN,FP,TN,Tested Active,All Tested
1376,unclassified,pfas_chain:FT_n3_N,29.375695,1.140187e-15,17,769,11,14617,28,15414
1363,unclassified,pfas_chain:FT_n1_OP,7.56114,9.128208e-08,14,772,35,14593,49,15414
1336,unclassified,pfas_bond:S(=O)O_sulfonicAcid_acyclic_(chain)_SCF,6.018822,5.810384e-43,115,671,405,14223,520,15414
1332,unclassified,pfas_bond:PC_phosphorus_organo_generic_CF,5.40705,5.046931e-07,16,770,56,14572,72,15414
1388,unclassified,pfas_chain:perF-linear_cap_C11_excl_mod,4.10721,3.077364e-06,18,768,83,14545,101,15414
1340,unclassified,pfas_bond:S=O_sulfonyl_generic_CF,3.776462,5.019026e-66,367,419,2754,11874,3121,15414
1335,unclassified,pfas_bond:S(=O)N_sulfonylamide_SCF,3.541862,1.041976e-47,251,535,1711,12917,1962,15414
1339,unclassified,pfas_bond:S~N_generic_CF,3.541521,7.693987999999999e-48,252,534,1720,12908,1972,15414
1399,unclassified,pfas_chain:perF-linear_cap_C9_plus,3.037849,4.417905e-26,152,634,1070,13558,1222,15414
1389,unclassified,pfas_chain:perF-linear_cap_C12_plus,3.02516,7.393521e-15,78,708,514,14114,592,15414


In [58]:
# Enriched fingerprints for the 'Other aliphatics' category, strongest odds
# ratio first (ties ordered by descending p-value).
(
    txp_enrich1
    .loc[txp_enrich1['Category'] == 'Other aliphatics']
    .sort_values(by=['Odds Ratio', 'P-Value'], ascending=[False, False])
)

Unnamed: 0,Category,TxP,Odds Ratio,P-Value,TP,FN,FP,TN,Tested Active,All Tested
389,Other aliphatics,pfas_bond:C#N_nitrile_generic_CF,inf,1.624615e-12,45,8397,0,6972,45,15414
431,Other aliphatics,pfas_bond:quatN_alkyl_acyclic_CF,inf,2.097602e-15,56,8386,0,6972,56,15414
406,Other aliphatics,pfas_bond:CN_amine_pri-NH2_alkyl_CF,38.192711,1.941128e-11,46,8396,1,6971,47,15414
423,Other aliphatics,pfas_bond:F~Z_heteroatom_SF,34.021069,3.570541e-10,41,8401,1,6971,42,15414
505,Other aliphatics,pfas_chain:polyF_nocap_CFCH2CF,15.791413,3.904337e-12,57,8385,3,6969,60,15414
414,Other aliphatics,pfas_bond:COC_ether_alkenyl_OCF,13.331044,2.046149e-27,143,8299,9,6963,152,15414
510,Other aliphatics,pfas_ring:hetero_[3]_O_epoxide_CF,7.752922,1.286247e-09,56,8386,6,6966,62,15414
422,Other aliphatics,pfas_bond:F~Z_heteroatom_S,7.300387,1.243364e-22,148,8294,17,6955,165,15414
427,Other aliphatics,pfas_bond:P=O_phosphate_alkyl_ester_CF,6.6283,1.208608e-05,32,8410,4,6968,36,15414
396,Other aliphatics,pfas_bond:C(=O)O_carboxylicEster_acyclic_OCCF,6.143495,4.4940349999999997e-38,282,8160,39,6933,321,15414


In [59]:
# Persist the filtered enrichment table as CSV.
# NOTE(review): the file is written to the data/external folder although it is
# produced by this notebook - confirm that is the intended location.
txp_enrich1.to_csv(
    external_dir + 'PFAS_TxP_enrichments_primary_categories_061123.csv'
)