# Exergy Calculator
#### Must have an Excel (.xlsx) or CSV file with a "SMILES" column and an identifier column ("CASN" by default).
###### Can also include "ID", "Formula" columns

In [11]:
import sys
import pathlib
import os
# Add parent directory to Python path so it can find Exergy_Bones
parent_dir = pathlib.Path().resolve().parent
sys.path.insert(0, str(parent_dir))

# Now your imports will work
from rdkit import Chem
from rdkit.Chem import Draw
import pandas as pd
from Exergy_Bones.group_loader import load_smarts_groups_from_excel
import glob
from padelpy import padeldescriptor

# PaDEL fingerprint descriptor XML files, taken in sorted filename order.
xml_files = sorted(glob.glob("../Exergy_Bones/fingerprints_xml/*.xml"))

# Fingerprint labels. They are paired with xml_files purely by position, so this
# list is assumed to follow the sorted filename order of the XML files.
FP_list = ['AtomPairs2DCount',
 'AtomPairs2D',
 'EState',
 'CDKextended',
 'CDK',
 'CDKgraphonly',
 'KlekotaRothCount',
 'KlekotaRoth',
 'MACCS',
 'PubChem',
 'SubstructureCount',
 'Substructure']

# zip() stops at the shorter input, so a wrong working directory or a missing
# XML file would otherwise yield an empty/partial mapping and only fail later
# with a confusing KeyError. Fail here with an explicit message instead.
if len(xml_files) != len(FP_list):
    raise ValueError(
        f"Expected {len(FP_list)} fingerprint XML files in "
        f"'../Exergy_Bones/fingerprints_xml/' but found {len(xml_files)}; "
        "check that the notebook is run from its own folder."
    )

# Label -> descriptor XML path, plus the SMARTS group database used for counting.
fp = dict(zip(FP_list, xml_files))
smarts_db = load_smarts_groups_from_excel()

Read the data file

In [12]:
# Path to the input table (.xlsx or .csv)
file_path="../Data/processed/Exergy_Clean_Test.csv" #full dataset
#test sample: ./Data/raw/Test_sample_excel.xlsx

# Dispatch on the actual file extension (case-insensitive) rather than on a
# substring match anywhere in the path.
file_suffix = pathlib.Path(file_path).suffix.lower()
if file_suffix == '.xlsx':
    df = pd.read_excel(file_path)
elif file_suffix == '.csv':
    df = pd.read_csv(file_path)
else:
    # Without this branch `df` would stay undefined and the failure would only
    # surface as a NameError in a later cell.
    raise ValueError(
        f"Unsupported input file type {file_suffix!r} for {file_path!r}; "
        "expected '.xlsx' or '.csv'."
    )

# Column holding the SMILES strings and column holding the chemical identifier.
smilecol_name = 'SMILES'
chemidcol_name = 'CASN'

# Both columns are read by the following cells, so report a missing one now.
missing_columns = [name for name in (smilecol_name, chemidcol_name) if name not in df.columns]
if missing_columns:
    raise KeyError(f"Input file {file_path!r} is missing required column(s): {missing_columns}")
print("done")

done


# Generate PaDEL fingerprints

In [13]:
# Write the SMILES / identifier pairs as a tab-separated, header-less .smi file
# that PaDEL reads as its molecule input.
smi_file = '../Data/exergy_test.smi'
df[[smilecol_name,chemidcol_name]].to_csv(smi_file, sep='\t', index=False, header=False)

# Compute the hit and count variants of the atom-pair fingerprint.
for fingerprint in ['AtomPairs2D','AtomPairs2DCount']:

    # Output goes to ../Data/, the same folder the .smi file was written to and
    # the folder the next cell reads the fingerprint CSVs from. (The original
    # './Data/' paths pointed at a different directory than the file written above.)
    fingerprint_output_file = f'../Data/exergy_test_{fingerprint}.csv'
    fingerprint_descriptortypes = fp[fingerprint]

    padeldescriptor(mol_dir=smi_file,
                    d_file=fingerprint_output_file,
                    descriptortypes= fingerprint_descriptortypes,
                    detectaromaticity=True,
                    standardizenitro=True,
                    standardizetautomers=True,
                    threads=2,
                    removesalt=True,
                    log=True,
                    fingerprints=True)

C:\Users\mmarc\Documents\Exergy\NoteBooks


In [18]:
# Names of all groups held by the SMARTS database.
group_names = [group.name for group in smarts_db.groups]

# PaDEL atom-pair tables: the count variant and the hit variant.
padelcount = pd.read_csv('../Data/exergy_test_AtomPairs2DCount.csv')
padelhit = pd.read_csv('../Data/exergy_test_AtomPairs2D.csv')

# Group name -> name of its "(calc)" column. Every such column starts at 0,
# so rows skipped below (no usable SMILES) keep a zero count.
calc_column_for = {gname: gname + " (calc)" for gname in group_names}
for calc_column in calc_column_for.values():
    df[calc_column] = 0

for idx, row in df.iterrows():
    smi, casrn = row[smilecol_name], row[chemidcol_name]
    # Only analyse rows whose SMILES is a non-blank string.
    if isinstance(smi, str) and smi.strip():
        # Hand the analyser the PaDEL rows whose 'Name' equals this identifier.
        count_rows = padelcount[padelcount['Name']==casrn]
        hit_rows = padelhit[padelhit['Name']==casrn]
        matches = smarts_db.analyze_smiles(smi, count_rows, hit_rows)
        counts_by_group = {match["name"]: match["count"] for match in matches}
        # Groups absent from the analysis result are recorded as 0.
        for gname, calc_column in calc_column_for.items():
            df.at[idx, calc_column] = counts_by_group.get(gname, 0)

# Attach the exergy columns using the group counts computed above.
from Exergy_Bones.exergy_calc import attach_exergy_columns

df_exergy = attach_exergy_columns(df, smarts_db)

df_exergy.head()

Unnamed: 0,No.,Chemical Name,SMILES,CASN,Formula,Group 1 (calc),Group 2 (calc),Group 3 (calc),Group 4 (calc),Group 5 (calc),...,Group 73 (calc),Group 74 (calc),Group 75 (calc),Group 76 (calc),Group 77 (calc),Group 78 (calc),Eq4_term,Sum_H_groups,Sum_S_groups,Exergy (kJ/mol)
0,1,abietic acid,CC(C)C1=CC2=CC[C@@H]3[C@@]([C@H]2CC1)(CCC[C@@]...,514-10-3,C20H30O2,0,0,0,1,0,...,0,0,0,0,0,0,11750.67,-592.5103,-1.6362,11645.99273
1,2,acetic acid,CC(=O)O,64-19-7,C2H4O2,0,0,0,1,0,...,0,0,0,0,0,0,1296.69,-405.9447,-0.1452,934.03668
2,3,acrylic acid,C=CC(=O)O,79-10-7,C3H4O2,0,0,0,1,0,...,0,0,0,0,0,0,1706.95,-368.5643,-0.1636,1387.16304
3,4,adiponitrile,C(CCC#N)CC#N,111-69-3,C6H8N2,0,0,0,0,0,...,0,0,0,0,0,0,3406.68,169.2373,-0.3015,3665.809525
4,5,alpha-methylstyrene,CC(=C)C1=CC=CC=C1,98-83-9,C9H10,0,0,0,0,0,...,0,0,0,0,0,0,4872.84,106.7226,-0.2826,5063.81979


Verify against the benchmarking set

In [19]:
# Put the columns of df whose name lacks 'calc' next to the 'calc' columns of
# df_exergy, so the two sets can be compared column by column.
reference_columns = [name for name in df.columns if 'calc' not in name]
calculated_columns = [name for name in df_exergy.columns if 'calc' in name]
combined = pd.concat([df[reference_columns], df_exergy[calculated_columns]], axis=1)

In [20]:
# Print the name of every group (1..78) where at least one row's reference
# count differs from the calculated count.
print("Group contains incorrect results")
for group_number in range(1, 79):
    col_true = f'Group {group_number}'
    col_cal = f'Group {group_number} (calc)'
    differs = combined[col_true] != combined[col_cal]
    subdf = combined[differs]
    if subdf.shape[0]:
        print(col_true)

Group contains incorrect results
Group 19
Group 26
Group 30
Group 52
Group 66


In [21]:
# Show the rows where the reference and calculated counts disagree for one group.
groupindex = 52
col_true = f'Group {groupindex}'
col_cal = f'Group {groupindex} (calc)'
shown_columns = [col_true, col_cal, 'SMILES', 'CASN', 'Chemical Name']
combined.loc[combined[col_true] != combined[col_cal], shown_columns]

Unnamed: 0,Group 52,Group 52 (calc),SMILES,CASN,Chemical Name
1920,1,0,CC(C)(C)CC(C)(C)S,141-59-3,tert-octyl mercaptan
2489,0,1,CCC1=C(C=C(C(=C1)C)C)C,17851-27-3,"1,2,4-trimethyl-6-ethylbenzene"


In [22]:
# Display only the columns of df_exergy whose name contains 'calc'.
calc_only_columns = [name for name in df_exergy.columns if 'calc' in name]
df_exergy[calc_only_columns]

Unnamed: 0,Group 1 (calc),Group 2 (calc),Group 3 (calc),Group 4 (calc),Group 5 (calc),Group 6 (calc),Group 7 (calc),Group 8 (calc),Group 9 (calc),Group 10 (calc),...,Group 69 (calc),Group 70 (calc),Group 71 (calc),Group 72 (calc),Group 73 (calc),Group 74 (calc),Group 75 (calc),Group 76 (calc),Group 77 (calc),Group 78 (calc)
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2605,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2606,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2607,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Export the result table to an Excel workbook, leaving out the index column.
output_path = "../Data/processed/Test_output.xlsx"
df_exergy.to_excel(excel_writer=output_path, index=False)

  df_exergy.to_excel(output_path, index=False)


In [24]:

# Display the exergy result table as the cell output.
df_exergy

Unnamed: 0,No.,Chemical Name,SMILES,CASN,Formula,Group 1 (calc),Group 2 (calc),Group 3 (calc),Group 4 (calc),Group 5 (calc),...,Group 73 (calc),Group 74 (calc),Group 75 (calc),Group 76 (calc),Group 77 (calc),Group 78 (calc),Eq4_term,Sum_H_groups,Sum_S_groups,Exergy (kJ/mol)
0,1,abietic acid,CC(C)C1=CC2=CC[C@@H]3[C@@]([C@H]2CC1)(CCC[C@@]...,514-10-3,C20H30O2,0,0,0,1,0,...,0,0,0,0,0,0,11750.670,-592.5103,-1.6362,11645.992730
1,2,acetic acid,CC(=O)O,64-19-7,C2H4O2,0,0,0,1,0,...,0,0,0,0,0,0,1296.690,-405.9447,-0.1452,934.036680
2,3,acrylic acid,C=CC(=O)O,79-10-7,C3H4O2,0,0,0,1,0,...,0,0,0,0,0,0,1706.950,-368.5643,-0.1636,1387.163040
3,4,adiponitrile,C(CCC#N)CC#N,111-69-3,C6H8N2,0,0,0,0,0,...,0,0,0,0,0,0,3406.680,169.2373,-0.3015,3665.809525
4,5,alpha-methylstyrene,CC(=C)C1=CC=CC=C1,98-83-9,C9H10,0,0,0,0,0,...,0,0,0,0,0,0,4872.840,106.7226,-0.2826,5063.819790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604,3143,"3,5-xylenol",CC1=CC(=CC(=C1)O)C,108-68-9,C8H10O,0,0,0,0,0,...,0,0,0,0,0,0,4464.565,-166.4570,-0.4509,4432.543835
2605,3145,"2,6-xylenol",CC1=C(C(=CC=C1)C)O,576-26-1,C8H10O,0,0,0,0,0,...,0,0,0,0,0,0,4464.565,-165.1900,-0.4563,4435.420845
2606,3146,"2,5-xylenol",CC1=CC(=C(C=C1)C)O,95-87-4,C8H10O,0,0,0,0,0,...,0,0,0,0,0,0,4464.565,-165.8235,-0.4536,4433.982340
2607,3147,"2,3-xylenol",CC1=C(C(=CC=C1)O)C,526-75-0,C8H10O,0,0,0,0,0,...,0,0,0,0,0,0,4464.565,-165.8235,-0.4536,4433.982340
