# Exergy Calculator
#### Requires an Excel (.xlsx) or CSV (.csv) input file with a "SMILES" column.

In [21]:
# Load 
import sys, pathlib
import os
from rdkit import Chem
from rdkit.Chem import Draw
import pandas as pd

from Exergy_Bones.group_loader import load_smarts_groups_from_excel
import glob
from padelpy import padeldescriptor
# Collect the PaDEL fingerprint descriptor XML files, sorted by path so they can
# be paired positionally with FP_list below.
xml_files = sorted(glob.glob("./Exergy_Bones/fingerprints_xml/*.xml"))

# Fingerprint names, one per XML file.
# NOTE(review): the pairing assumes the alphabetical order of the XML file names
# matches the order of this list - confirm against the files in fingerprints_xml/.
FP_list = [
    'AtomPairs2DCount',
    'AtomPairs2D',
    'EState',
    'CDKextended',
    'CDK',
    'CDKgraphonly',
    'KlekotaRothCount',
    'KlekotaRoth',
    'MACCS',
    'PubChem',
    'SubstructureCount',
    'Substructure',
]

# zip() stops at the shorter input, so a missing or extra XML file (or running the
# notebook from the wrong working directory) would otherwise silently drop or
# misalign fingerprints. Make that visible right away.
if len(xml_files) != len(FP_list):
    print(f"⚠ Expected {len(FP_list)} fingerprint XML files in ./Exergy_Bones/fingerprints_xml/ "
          f"but found {len(xml_files)}. Fingerprint names may be missing or mapped to the wrong file.")

# Map fingerprint name -> path of its descriptor XML file.
fp = dict(zip(FP_list, xml_files))
# Build the group database used by the rest of the notebook (called with the
# loader's defaults; presumably it reads the SMARTS group definitions from an
# Excel file, per its name - verify in Exergy_Bones.group_loader).
# Later cells read `smarts_db.groups` (each item has a `.name`) and call
# `smarts_db.analyze_smiles(...)`.
smarts_db = load_smarts_groups_from_excel()

# Enter your file path

In [22]:
# ---- USER SETTINGS: edit these before running the cells below ----

# Path to the input table (.xlsx or .csv)
file_path = './Data/processed/Exergy_Clean_Test.csv'  # full dataset

# Names of the columns to read from the input table
smilecol_name = "SMILES"    # SMILES strings (REQUIRED)
chemidcol_name = "CASN"     # molecule identifiers (OPTIONAL - set to None if there is no such column)

In [24]:
# Load the input table named by `file_path` into `df` and make sure the columns
# the rest of the notebook relies on are present.
#
# Raises:
#   FileNotFoundError - `file_path` does not exist.
#   ValueError        - unsupported file extension, or the SMILES column is missing.

# Check if file exists
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}\nPlease check the file path and try again.")

# Load based on file type. The extension is compared on the lower-cased path so
# that upper- or mixed-case extensions (e.g. "DATA.CSV") are accepted too.
file_path_lower = file_path.lower()
if file_path_lower.endswith('.xlsx'):
    print("Loading Excel file...")
    df = pd.read_excel(file_path)
elif file_path_lower.endswith('.csv'):
    print("Loading CSV file...")
    df = pd.read_csv(file_path)
else:
    raise ValueError(f"Unsupported file type. Please use .xlsx or .csv files.\nYour file: {file_path}")

# Validate SMILES column (REQUIRED)
if smilecol_name not in df.columns:
    raise ValueError(f"Column '{smilecol_name}' not found in file.\nAvailable columns: {list(df.columns)}")

# Handle optional ID column
if chemidcol_name is None or chemidcol_name not in df.columns:
    if chemidcol_name is not None:
        print(f"⚠ Column '{chemidcol_name}' not found. Auto-generating molecule IDs...")
    else:
        print("⚠ No ID column specified. Auto-generating molecule IDs...")

    # Create auto-generated IDs ("Molecule_1", "Molecule_2", ...) and use them
    # as the identifier column from here on.
    df['Molecule_ID'] = [f"Molecule_{i+1}" for i in range(len(df))]
    chemidcol_name = 'Molecule_ID'
    print(f"✓ Created '{chemidcol_name}' column")

# The identifier is written next to each SMILES and later used to match the
# PaDEL fingerprint rows back to molecules, so missing or repeated identifiers
# make that match ambiguous. Warn instead of failing so the run can continue.
id_values = df[chemidcol_name]
n_missing_ids = int(id_values.isna().sum())
n_duplicate_ids = int(id_values.dropna().duplicated().sum())
if n_missing_ids or n_duplicate_ids:
    print(f"⚠ Column '{chemidcol_name}' has {n_missing_ids} missing and {n_duplicate_ids} duplicated value(s); "
          "fingerprints are matched to molecules by this identifier, so those rows may be matched incorrectly.")

# Show summary
print(f"✓ Successfully loaded {len(df)} molecules")
print(f"✓ Using '{smilecol_name}' for SMILES and '{chemidcol_name}' for identifiers")
print("\nFirst few rows:")
display(df[[chemidcol_name, smilecol_name]].head(3))

Loading CSV file...
✓ Successfully loaded 2609 molecules
✓ Using 'SMILES' for SMILES and 'CASN' for identifiers

First few rows:


Unnamed: 0,CASN,SMILES
0,514-10-3,CC(C)C1=CC2=CC[C@@H]3[C@@]([C@H]2CC1)(CCC[C@@]...
1,64-19-7,CC(=O)O
2,79-10-7,C=CC(=O)O


## Generate PaDEL descriptors


In [25]:

# Base name shared by the generated .smi file and the PaDEL fingerprint CSVs under ./Data/
exergy_smi = 'exergy_test'

In [3]:
# Write the molecules to a tab-separated .smi file (SMILES <tab> identifier, no
# header) and run PaDEL on it once per fingerprint type.
#
# Rows without a usable SMILES string (missing, non-string, or blank) are left
# out of the .smi file: the group-counting step skips exactly those rows, so
# their fingerprints would never be looked up anyway.
smi_path = f'./Data/{exergy_smi}.smi'
has_smiles = df[smilecol_name].map(lambda s: isinstance(s, str) and bool(s.strip())).astype(bool)
df.loc[has_smiles, [smilecol_name, chemidcol_name]].to_csv(smi_path, sep='\t', index=False, header=False)

for fingerprint in ['AtomPairs2D', 'AtomPairs2DCount']:
    # One output CSV per fingerprint, e.g. ./Data/<exergy_smi>_AtomPairs2D.csv
    fingerprint_output_file = f'./Data/{exergy_smi}_{fingerprint}.csv'
    # Descriptor XML file registered for this fingerprint name.
    fingerprint_descriptortypes = fp[fingerprint]

    # PaDEL options are kept exactly as originally configured.
    padeldescriptor(mol_dir=smi_path,
                    d_file=fingerprint_output_file,
                    descriptortypes=fingerprint_descriptortypes,
                    detectaromaticity=True,
                    standardizenitro=True,
                    standardizetautomers=True,
                    threads=2,
                    removesalt=True,
                    log=True,
                    fingerprints=True)

In [26]:
# Count every SMARTS group for every molecule, store the counts on `df` as
# "<group name> (calc)" columns, then compute the exergy columns from them.

# Project helper that adds the exergy columns (imported in the cell that uses it).
from Exergy_Bones.exergy_calc import attach_exergy_columns

group_names = [g.name for g in smarts_db.groups]
calc_columns = [gname + " (calc)" for gname in group_names]

# PaDEL fingerprint tables written by the fingerprint-generation step; each row
# is identified by its 'Name' column.
padelcount = pd.read_csv(f'./Data/{exergy_smi}_AtomPairs2DCount.csv')
padelhit = pd.read_csv(f'./Data/{exergy_smi}_AtomPairs2D.csv')

# Index the PaDEL rows by molecule name once, instead of scanning both tables
# with a boolean mask for every molecule. Each value is the sub-frame of rows
# carrying that name (all columns kept, including 'Name').
count_rows_by_name = {name: rows for name, rows in padelcount.groupby('Name', sort=False)}
hit_rows_by_name = {name: rows for name, rows in padelhit.groupby('Name', sort=False)}
# Empty frames with the right columns, passed when a molecule has no PaDEL row.
no_count_rows = padelcount.iloc[0:0]
no_hit_rows = padelhit.iloc[0:0]

# One list of per-group counts per molecule, in the row order of `df`.
group_counts = []
for smi, casrn in zip(df[smilecol_name], df[chemidcol_name]):
    # Molecules without a usable SMILES string keep a count of 0 for every group.
    if not isinstance(smi, str) or not smi.strip():
        group_counts.append([0] * len(group_names))
        continue
    res = smarts_db.analyze_smiles(
        smi,
        count_rows_by_name.get(casrn, no_count_rows),
        hit_rows_by_name.get(casrn, no_hit_rows),
    )
    # `res` is iterated as records with "name" and "count" keys; groups that are
    # not reported count as 0.
    count_map = {r["name"]: r["count"] for r in res}
    group_counts.append([count_map.get(gname, 0) for gname in group_names])

# Attach all count columns in one pass. Assignment is positional so it does not
# depend on the index labels of `df`.
counts_table = pd.DataFrame(group_counts, columns=calc_columns)
for position, column in enumerate(calc_columns):
    df[column] = counts_table.iloc[:, position].to_numpy()

# now run your existing attach_exergy_columns
df_exergy = attach_exergy_columns(df, smarts_db)

df_exergy.head()

Unnamed: 0,No.,Chemical Name,SMILES,CASN,Formula,Group 1 (calc),Group 2 (calc),Group 3 (calc),Group 4 (calc),Group 5 (calc),...,Group 73 (calc),Group 74 (calc),Group 75 (calc),Group 76 (calc),Group 77 (calc),Group 78 (calc),Eq4_term,Sum_H_groups,Sum_S_groups,Exergy (kJ/mol)
0,1,abietic acid,CC(C)C1=CC2=CC[C@@H]3[C@@]([C@H]2CC1)(CCC[C@@]...,514-10-3,C20H30O2,0,0,0,1,0,...,0,0,0,0,0,0,11750.67,-592.5103,-1.6362,11645.99273
1,2,acetic acid,CC(=O)O,64-19-7,C2H4O2,0,0,0,1,0,...,0,0,0,0,0,0,1296.69,-405.9447,-0.1452,934.03668
2,3,acrylic acid,C=CC(=O)O,79-10-7,C3H4O2,0,0,0,1,0,...,0,0,0,0,0,0,1706.95,-368.5643,-0.1636,1387.16304
3,4,adiponitrile,C(CCC#N)CC#N,111-69-3,C6H8N2,0,0,0,0,0,...,0,0,0,0,0,0,3406.68,169.2373,-0.3015,3665.809525
4,5,alpha-methylstyrene,CC(=C)C1=CC=CC=C1,98-83-9,C9H10,0,0,0,0,0,...,0,0,0,0,0,0,4872.84,106.7226,-0.2826,5063.81979


In [28]:
# Preview the per-group count columns. A bare expression that is not the last
# line of a cell is evaluated but never rendered, so display() is used here.
# str() keeps the filter from raising on non-string column labels.
group_count_columns = [col for col in df_exergy.columns if 'calc' in str(col)]
display(df_exergy[group_count_columns].head())

# Full result table as the cell output (pandas renders it truncated).
df_exergy

Unnamed: 0,No.,Chemical Name,SMILES,CASN,Formula,Group 1 (calc),Group 2 (calc),Group 3 (calc),Group 4 (calc),Group 5 (calc),...,Group 73 (calc),Group 74 (calc),Group 75 (calc),Group 76 (calc),Group 77 (calc),Group 78 (calc),Eq4_term,Sum_H_groups,Sum_S_groups,Exergy (kJ/mol)
0,1,abietic acid,CC(C)C1=CC2=CC[C@@H]3[C@@]([C@H]2CC1)(CCC[C@@]...,514-10-3,C20H30O2,0,0,0,1,0,...,0,0,0,0,0,0,11750.670,-592.5103,-1.6362,11645.992730
1,2,acetic acid,CC(=O)O,64-19-7,C2H4O2,0,0,0,1,0,...,0,0,0,0,0,0,1296.690,-405.9447,-0.1452,934.036680
2,3,acrylic acid,C=CC(=O)O,79-10-7,C3H4O2,0,0,0,1,0,...,0,0,0,0,0,0,1706.950,-368.5643,-0.1636,1387.163040
3,4,adiponitrile,C(CCC#N)CC#N,111-69-3,C6H8N2,0,0,0,0,0,...,0,0,0,0,0,0,3406.680,169.2373,-0.3015,3665.809525
4,5,alpha-methylstyrene,CC(=C)C1=CC=CC=C1,98-83-9,C9H10,0,0,0,0,0,...,0,0,0,0,0,0,4872.840,106.7226,-0.2826,5063.819790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604,3143,"3,5-xylenol",CC1=CC(=CC(=C1)O)C,108-68-9,C8H10O,0,0,0,0,0,...,0,0,0,0,0,0,4464.565,-166.4570,-0.4509,4432.543835
2605,3145,"2,6-xylenol",CC1=C(C(=CC=C1)C)O,576-26-1,C8H10O,0,0,0,0,0,...,0,0,0,0,0,0,4464.565,-165.1900,-0.4563,4435.420845
2606,3146,"2,5-xylenol",CC1=CC(=C(C=C1)C)O,95-87-4,C8H10O,0,0,0,0,0,...,0,0,0,0,0,0,4464.565,-165.8235,-0.4536,4433.982340
2607,3147,"2,3-xylenol",CC1=C(C(=CC=C1)O)C,526-75-0,C8H10O,0,0,0,0,0,...,0,0,0,0,0,0,4464.565,-165.8235,-0.4536,4433.982340


## Save output file

In [None]:
# Write the result table to an Excel file (without the DataFrame index).
output_path = "./Data/processed/Output.xlsx"

# Create the destination folder if it is missing so the save does not fail on a
# fresh checkout; a bare file name has no folder part and needs no directory.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df_exergy.to_excel(output_path, index=False)
print(f"✓ Saved {len(df_exergy)} rows to {output_path}")