In [None]:
import pandas as pd

# path
csv_path = "/home/omid/Documents/Data/phenolics.csv"

# Load CSV
df = pd.read_csv(csv_path, header=None)  # header=None if your CSV has no column names
df.columns = ["Compound"]  # Name the column

# Check first rows and total compounds
print(df.head())
print(f"Total compounds: {len(df)}")


           Compound
0          Compound
1      Caffeic acid
2  Chlorogenic acid
3   o-Coumaric acid
4   m-Coumaric acid
Total compounds: 101


In [39]:
import pandas as pd
import os

# Path to your CSV (adjust if needed)
csv_path = "/home/omid/Documents/Data/phenolics.csv"

# Load the CSV
df = pd.read_csv(csv_path)
print("First 5 rows of your data:")
print(df.head())

# Extract compound names as a Python list
compounds = df['Compound'].tolist()
print(f"\nTotal compounds: {len(compounds)}")
print("Some example compounds:", compounds[:5])

# 4Save a simple text file with compounds for MOPAC input
output_txt = "/home/omid/Documents/Data/phenolics_list.txt"
with open(output_txt, "w") as f:
    for cmpd in compounds:
        f.write(cmpd + "\n")
print(f"\nCompound names saved to: {output_txt}")


First 5 rows of your data:
           Compound
0      Caffeic acid
1  Chlorogenic acid
2   o-Coumaric acid
3   m-Coumaric acid
4   p-Coumaric acid

Total compounds: 100
Some example compounds: ['Caffeic acid', 'Chlorogenic acid', 'o-Coumaric acid', 'm-Coumaric acid', 'p-Coumaric acid']

Compound names saved to: /home/omid/Documents/Data/phenolics_list.txt


In [4]:
import os
import pandas as pd
import subprocess
import pubchempy as pcp     
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools

    

In [5]:
data_path = "/home/omid/Documents/Data/phenolics.csv"
mopac_path = "/opt/mopac/bin/mopac"  # mopac_path
output_folder = "/home/omid/Documents/Data/mopac_inputs/"
os.makedirs(output_folder, exist_ok=True)

In [6]:
# --- Load compounds ---
df = pd.read_csv(data_path)
compounds = df['Compound'].tolist()

# Obtaining the XYZ coordinates with Open Babel 

In [7]:
import os
import subprocess
import pandas as pd
import pubchempy as pcp
import time
import re   

In [8]:
# -----------------------------
# CONFIG
# -----------------------------
BASE_DIR = "/home/omid/Documents/Data"
INPUT_CSV = f"{BASE_DIR}/phenolics.csv"
MOPAC_BIN = "/opt/mopac/bin/mopac"

XYZ_DIR = f"{BASE_DIR}/xyz"
MOP_DIR = f"{BASE_DIR}/mopac_inputs"
OUT_DIR = f"{BASE_DIR}/mopac_outputs"

os.makedirs(XYZ_DIR, exist_ok=True)
os.makedirs(MOP_DIR, exist_ok=True)
os.makedirs(OUT_DIR, exist_ok=True)


In [9]:
# -----------------------------
# HELPERS
# -----------------------------
def get_smiles(name):
    compounds = pcp.get_compounds(name, "name")
    if not compounds:
        raise ValueError("SMILES not found")
    return compounds[0].isomeric_smiles

def smiles_to_xyz(smiles, xyz_file):
    cmd = ["obabel", f"-:{smiles}", "--gen3d", "-O", xyz_file]
    subprocess.run(cmd, check=True)

def xyz_to_mop(xyz_file, mop_file, title):
    with open(xyz_file) as f:
        lines = f.readlines()[2:]  # skip atom count + comment

    with open(mop_file, "w") as f:
        f.write("PM7 EF PRECISE\n")
        f.write(f"{title}\n")
        for line in lines:
            f.write(line)

def run_mopac(mop_file):
    subprocess.run([MOPAC_BIN, mop_file], cwd=OUT_DIR, check=True)

def parse_arc(arc_file):
    hof = homo = lumo = dipole = None

    with open(arc_file) as f:
        text = f.read()

    m = re.search(r"FINAL HEAT OF FORMATION\s+=\s+([-.\d]+)", text)
    if m: hof = float(m.group(1))

    m = re.search(r"HOMO\s+=\s+([-.\d]+)", text)
    if m: homo = float(m.group(1))

    m = re.search(r"LUMO\s+=\s+([-.\d]+)", text)
    if m: lumo = float(m.group(1))

    m = re.search(r"DIPOLE MOMENT\s+=\s+([-.\d]+)", text)
    if m: dipole = float(m.group(1))

    return hof, homo, lumo, dipole

In [10]:
# -----------------------------
# MAIN
# -----------------------------
df = pd.read_csv(INPUT_CSV)
results = []

for name in df["Compound"]:
    safe = name.replace(" ", "_").replace("/", "_")

    try:
        print(f"Processing: {name}")

        smiles = get_smiles(name)
        xyz = f"{XYZ_DIR}/{safe}.xyz"
        mop = f"{MOP_DIR}/{safe}.mop"

        smiles_to_xyz(smiles, xyz)
        xyz_to_mop(xyz, mop, name)

        run_mopac(mop)

        arc = f"{OUT_DIR}/{safe}.arc"
        hof, homo, lumo, dipole = parse_arc(arc)

        results.append([name, hof, homo, lumo, dipole])
        time.sleep(1)

    except Exception as e:
        print(f"FAILED: {name} → {e}")
        results.append([name, None, None, None, None])

Processing: Caffeic acid


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Caffeic_acid.mop" ended normally on Jan  2, 2026, at 22:38.

FAILED: Caffeic acid → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Caffeic_acid.arc'
Processing: Chlorogenic acid


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Chlorogenic_acid.mop" ended normally on Jan  2, 2026, at 22:38.

FAILED: Chlorogenic acid → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Chlorogenic_acid.arc'
Processing: o-Coumaric acid


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/o-Coumaric_acid.mop" ended normally on Jan  2, 2026, at 22:38.

FAILED: o-Coumaric acid → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/o-Coumaric_acid.arc'
Processing: m-Coumaric acid


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/m-Coumaric_acid.mop" ended normally on Jan  2, 2026, at 22:38.

FAILED: m-Coumaric acid → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/m-Coumaric_acid.arc'
Processing: p-Coumaric acid


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/p-Coumaric_acid.mop" ended normally on Jan  2, 2026, at 22:38.

FAILED: p-Coumaric acid → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/p-Coumaric_acid.arc'
Processing: Ferulic acid


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Ferulic_acid.mop" ended normally on Jan  2, 2026, at 22:38.

FAILED: Ferulic acid → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Ferulic_acid.arc'
Processing: Isoferulic acid


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Isoferulic_acid.mop" ended normally on Jan  2, 2026, at 22:38.

FAILED: Isoferulic acid → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Isoferulic_acid.arc'
Processing: trans-Cinnamic acid


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/trans-Cinnamic_acid.mop" ended normally on Jan  2, 2026, at 22:38.

FAILED: trans-Cinnamic acid → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/trans-Cinnamic_acid.arc'
Processing: Gallic acid


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Gallic_acid.mop" ended normally on Jan  2, 2026, at 22:38.

FAILED: Gallic acid → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Gallic_acid.arc'
Processing: Protocatechuic acid


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Protocatechuic_acid.mop" ended normally on Jan  2, 2026, at 22:38.

FAILED: Protocatechuic acid → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Protocatechuic_acid.arc'
Processing: 2,4-Hydroxybenzoic acid
FAILED: 2,4-Hydroxybenzoic acid → SMILES not found
Processing: o-Hydroxybenzoic acid


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/o-Hydroxybenzoic_acid.mop" ended normally on Jan  2, 2026, at 22:38.

FAILED: o-Hydroxybenzoic acid → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/o-Hydroxybenzoic_acid.arc'
Processing: m-Hydroxybenzoic acid


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/m-Hydroxybenzoic_acid.mop" ended normally on Jan  2, 2026, at 22:38.

FAILED: m-Hydroxybenzoic acid → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/m-Hydroxybenzoic_acid.arc'
Processing: p-Hydroxybenzoic acid


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/p-Hydroxybenzoic_acid.mop" ended normally on Jan  2, 2026, at 22:38.

FAILED: p-Hydroxybenzoic acid → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/p-Hydroxybenzoic_acid.arc'
Processing: Syringic acid


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Syringic_acid.mop" ended normally on Jan  2, 2026, at 22:39.

FAILED: Syringic acid → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Syringic_acid.arc'
Processing: Vanillic acid


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Vanillic_acid.mop" ended normally on Jan  2, 2026, at 22:39.

FAILED: Vanillic acid → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Vanillic_acid.arc'
Processing: Benzoic acid


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Benzoic_acid.mop" ended normally on Jan  2, 2026, at 22:39.

FAILED: Benzoic acid → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Benzoic_acid.arc'
Processing: (−)-Epigallocatechin gallate (EGCG)
FAILED: (−)-Epigallocatechin gallate (EGCG) → SMILES not found
Processing: (−)-Epicatechin gallate (ECG)
FAILED: (−)-Epicatechin gallate (ECG) → SMILES not found
Processing: (−)-Epigallocatechin (EGC)
FAILED: (−)-Epigallocatechin (EGC) → SMILES not found
Processing: (−)-Epicatechin (EC)
FAILED: (−)-Epicatechin (EC) → SMILES not found
Processing: (+)-Catechin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/(+)-Catechin.mop" ended normally on Jan  2, 2026, at 22:39.

FAILED: (+)-Catechin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/(+)-Catechin.arc'
Processing: Myricetin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Myricetin.mop" ended normally on Jan  2, 2026, at 22:39.

FAILED: Myricetin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Myricetin.arc'
Processing: Quercetin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Quercetin.mop" ended normally on Jan  2, 2026, at 22:39.

FAILED: Quercetin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Quercetin.arc'
Processing: Morin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Morin.mop" ended normally on Jan  2, 2026, at 22:39.

FAILED: Morin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Morin.arc'
Processing: Kaempferol


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Kaempferol.mop" ended normally on Jan  2, 2026, at 22:39.

FAILED: Kaempferol → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Kaempferol.arc'
Processing: Galangin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Galangin.mop" ended normally on Jan  2, 2026, at 22:39.

FAILED: Galangin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Galangin.arc'
Processing: Quercetin-3-glucoside


  return compounds[0].isomeric_smiles
1 molecule converted
          Fri Jan  2 22:39:31 2026  Job: '/home/omid/Documents/Data/mopac_inputs/Quercetin-3-glucoside' started successfully




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Quercetin-3-glucoside.mop" ended normally on Jan  2, 2026, at 22:39.

FAILED: Quercetin-3-glucoside → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Quercetin-3-glucoside.arc'
Processing: Quercetin-3-rutinoside (rutin)
FAILED: Quercetin-3-rutinoside (rutin) → SMILES not found
Processing: Quercetin-3-rhamnoside (quercitrin)
FAILED: Quercetin-3-rhamnoside (quercitrin) → SMILES not found
Processing: Kaempferol-3-glucoside (astragalin)
FAILED: Kaempferol-3-glucoside (astragalin) → SMILES not found
Processing: Quercetin-3-glucoside-7-rhamnoside


  return compounds[0].isomeric_smiles
1 molecule converted
          Fri Jan  2 22:39:48 2026  Job: '/home/omid/Documents/Data/mopac_inputs/Quercetin-3-glucoside-7-rhamnoside' started successfully




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Quercetin-3-glucoside-7-rhamnoside.mop" ended normally on Jan  2, 2026, at 22:40.

FAILED: Quercetin-3-glucoside-7-rhamnoside → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Quercetin-3-glucoside-7-rhamnoside.arc'
Processing: Flavonol (ck)
FAILED: Flavonol (ck) → SMILES not found
Processing: Butein


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Butein.mop" ended normally on Jan  2, 2026, at 22:40.

FAILED: Butein → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Butein.arc'
Processing: Phloretin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Phloretin.mop" ended normally on Jan  2, 2026, at 22:40.

FAILED: Phloretin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Phloretin.arc'
Processing: Sappanchalcone


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Sappanchalcone.mop" ended normally on Jan  2, 2026, at 22:40.

FAILED: Sappanchalcone → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Sappanchalcone.arc'
Processing: Carthamin


  return compounds[0].isomeric_smiles
1 molecule converted
          Fri Jan  2 22:40:52 2026  Job: '/home/omid/Documents/Data/mopac_inputs/Carthamin' started successfully




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Carthamin.mop" ended normally on Jan  2, 2026, at 22:41.

FAILED: Carthamin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Carthamin.arc'
Processing: trans-chalcone (ck)
FAILED: trans-chalcone (ck) → SMILES not found
Processing: Luteolin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Luteolin.mop" ended normally on Jan  2, 2026, at 22:41.

FAILED: Luteolin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Luteolin.arc'
Processing: Baicalein


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Baicalein.mop" ended normally on Jan  2, 2026, at 22:41.

FAILED: Baicalein → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Baicalein.arc'
Processing: Apigenin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Apigenin.mop" ended normally on Jan  2, 2026, at 22:41.

FAILED: Apigenin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Apigenin.arc'
Processing: Chrysin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Chrysin.mop" ended normally on Jan  2, 2026, at 22:41.

FAILED: Chrysin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Chrysin.arc'
Processing: Luteolin-7-glucoside


  return compounds[0].isomeric_smiles
1 molecule converted
          Fri Jan  2 22:41:57 2026  Job: '/home/omid/Documents/Data/mopac_inputs/Luteolin-7-glucoside' started successfully




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Luteolin-7-glucoside.mop" ended normally on Jan  2, 2026, at 22:42.

FAILED: Luteolin-7-glucoside → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Luteolin-7-glucoside.arc'
Processing: Apigenin-8-glucoside (vitexin)
FAILED: Apigenin-8-glucoside (vitexin) → SMILES not found
Processing: Apigenin-7-glucoside (apigetrin)
FAILED: Apigenin-7-glucoside (apigetrin) → SMILES not found
Processing: Baicalein-7-glucuronide (baicalin)
FAILED: Baicalein-7-glucuronide (baicalin) → SMILES not found
Processing: Flavone (ck)
FAILED: Flavone (ck) → SMILES not found
Processing: Naringenin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Naringenin.mop" ended normally on Jan  2, 2026, at 22:42.

FAILED: Naringenin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Naringenin.arc'
Processing: Hesperetin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Hesperetin.mop" ended normally on Jan  2, 2026, at 22:42.

FAILED: Hesperetin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Hesperetin.arc'
Processing: Naringenin-7-rutinoside (naringin)
FAILED: Naringenin-7-rutinoside (naringin) → SMILES not found
Processing: Hesperetin-7-rutinoside (hesperidin)
FAILED: Hesperetin-7-rutinoside (hesperidin) → SMILES not found
Processing: Flavanone (ck)
FAILED: Flavanone (ck) → SMILES not found
Processing: Genistein


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Genistein.mop" ended normally on Jan  2, 2026, at 22:42.

FAILED: Genistein → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Genistein.arc'
Processing: Daidzein


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Daidzein.mop" ended normally on Jan  2, 2026, at 22:42.

FAILED: Daidzein → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Daidzein.arc'
Processing: Glycitein


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Glycitein.mop" ended normally on Jan  2, 2026, at 22:42.

FAILED: Glycitein → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Glycitein.arc'
Processing: Genistein-7-glucoside (genistin)
FAILED: Genistein-7-glucoside (genistin) → SMILES not found
Processing: Daidzein-7-glucoside (daidzin)
FAILED: Daidzein-7-glucoside (daidzin) → SMILES not found
Processing: Isoflavone (ck)
FAILED: Isoflavone (ck) → SMILES not found
Processing: Catechin 3-O-gallate (monomer)
FAILED: Catechin 3-O-gallate (monomer) → SMILES not found
Processing: Procyanidin B-1 (dimer)
FAILED: Procyanidin B-1 (dimer) → SMILES not found
Processing: Procyanidin B-2 digallate (dimer)
FAILED: Procyanidin B-2 digallate (dimer) → SMILES not found
Processing: Procyanidin C-1 (trimer)
FAILED: Procyanidin C-1 (trimer) → SMILES not found
Processing: Chinese tannin
FAILED: Chinese tannin → SMILES not found
Processing: Corilagin (monomeric ell

  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Piceatannol.mop" ended normally on Jan  2, 2026, at 22:42.

FAILED: Piceatannol → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Piceatannol.arc'
Processing: Resveratrol


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Resveratrol.mop" ended normally on Jan  2, 2026, at 22:42.

FAILED: Resveratrol → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Resveratrol.arc'
Processing: Piceatannol-3V-glucoside
FAILED: Piceatannol-3V-glucoside → SMILES not found
Processing: Resveratrol-3-glucoside
FAILED: Resveratrol-3-glucoside → SMILES not found
Processing: Resveratrol-4V-glucoside
FAILED: Resveratrol-4V-glucoside → SMILES not found
Processing: trans-Stilbene


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/trans-Stilbene.mop" ended normally on Jan  2, 2026, at 22:42.

FAILED: trans-Stilbene → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/trans-Stilbene.arc'
Processing: Curcumine


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Curcumine.mop" ended normally on Jan  2, 2026, at 22:43.

FAILED: Curcumine → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Curcumine.arc'
Processing: Demethoxycurcumine
FAILED: Demethoxycurcumine → SMILES not found
Processing: Bisdemethoxycurcumine
FAILED: Bisdemethoxycurcumine → SMILES not found
Processing: Esculetin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Esculetin.mop" ended normally on Jan  2, 2026, at 22:43.

FAILED: Esculetin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Esculetin.arc'
Processing: Scopoletin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Scopoletin.mop" ended normally on Jan  2, 2026, at 22:43.

FAILED: Scopoletin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Scopoletin.arc'
Processing: Esculetin-6-glucoside
FAILED: Esculetin-6-glucoside → SMILES not found
Processing: 5-Methoxyfuranocoumarin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/5-Methoxyfuranocoumarin.mop" ended normally on Jan  2, 2026, at 22:43.

FAILED: 5-Methoxyfuranocoumarin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/5-Methoxyfuranocoumarin.arc'
Processing: Coumarin (ck)
FAILED: Coumarin (ck) → SMILES not found
Processing: Matairesinol


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Matairesinol.mop" ended normally on Jan  2, 2026, at 22:43.

FAILED: Matairesinol → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Matairesinol.arc'
Processing: Secoisolariciresinol


  return compounds[0].isomeric_smiles
1 molecule converted
          Fri Jan  2 22:43:40 2026  Job: '/home/omid/Documents/Data/mopac_inputs/Secoisolariciresinol' started successfully




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Secoisolariciresinol.mop" ended normally on Jan  2, 2026, at 22:43.

FAILED: Secoisolariciresinol → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Secoisolariciresinol.arc'
Processing: Arctigenin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Arctigenin.mop" ended normally on Jan  2, 2026, at 22:44.

FAILED: Arctigenin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Arctigenin.arc'
Processing: Magnolol


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Magnolol.mop" ended normally on Jan  2, 2026, at 22:44.

FAILED: Magnolol → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Magnolol.arc'
Processing: Purpurin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Purpurin.mop" ended normally on Jan  2, 2026, at 22:44.

FAILED: Purpurin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Purpurin.arc'
Processing: Pseudopurpurin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Pseudopurpurin.mop" ended normally on Jan  2, 2026, at 22:44.

FAILED: Pseudopurpurin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Pseudopurpurin.arc'
Processing: Alizarin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Alizarin.mop" ended normally on Jan  2, 2026, at 22:44.

FAILED: Alizarin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Alizarin.arc'
Processing: Quinizarin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Quinizarin.mop" ended normally on Jan  2, 2026, at 22:44.

FAILED: Quinizarin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Quinizarin.arc'
Processing: Emodin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Emodin.mop" ended normally on Jan  2, 2026, at 22:44.

FAILED: Emodin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Emodin.arc'
Processing: Chrysazine


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Chrysazine.mop" ended normally on Jan  2, 2026, at 22:44.

FAILED: Chrysazine → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Chrysazine.arc'
Processing: Rhein


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Rhein.mop" ended normally on Jan  2, 2026, at 22:44.

FAILED: Rhein → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Rhein.arc'
Processing: Chrysophanol


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Chrysophanol.mop" ended normally on Jan  2, 2026, at 22:44.

FAILED: Chrysophanol → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Chrysophanol.arc'
Processing: Physcion


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Physcion.mop" ended normally on Jan  2, 2026, at 22:44.

FAILED: Physcion → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Physcion.arc'
Processing: Aloe-emodin


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Aloe-emodin.mop" ended normally on Jan  2, 2026, at 22:44.

FAILED: Aloe-emodin → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Aloe-emodin.arc'
Processing: 1,5-Dihydroxyanthraquinone


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/1,5-Dihydroxyanthraquinone.mop" ended normally on Jan  2, 2026, at 22:44.

FAILED: 1,5-Dihydroxyanthraquinone → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/1,5-Dihydroxyanthraquinone.arc'
Processing: 2,6-Dihydroxyanthraquinone


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/2,6-Dihydroxyanthraquinone.mop" ended normally on Jan  2, 2026, at 22:44.

FAILED: 2,6-Dihydroxyanthraquinone → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/2,6-Dihydroxyanthraquinone.arc'
Processing: Ruberythric acid


  return compounds[0].isomeric_smiles
1 molecule converted
          Fri Jan  2 22:44:42 2026  Job: '/home/omid/Documents/Data/mopac_inputs/Ruberythric_acid' started successfully




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Ruberythric_acid.mop" ended normally on Jan  2, 2026, at 22:45.

FAILED: Ruberythric acid → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Ruberythric_acid.arc'
Processing: Alizarin-2-glucoside
FAILED: Alizarin-2-glucoside → SMILES not found
Processing: Anthraquinone


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Anthraquinone.mop" ended normally on Jan  2, 2026, at 22:45.

FAILED: Anthraquinone → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Anthraquinone.arc'
Processing: Juglone


  return compounds[0].isomeric_smiles
1 molecule converted




          MOPAC Job: "/home/omid/Documents/Data/mopac_inputs/Juglone.mop" ended normally on Jan  2, 2026, at 22:45.

FAILED: Juglone → [Errno 2] No such file or directory: '/home/omid/Documents/Data/mopac_outputs/Juglone.arc'
Processing: Skikonin
FAILED: Skikonin → SMILES not found
Processing: Acetylskikonin
FAILED: Acetylskikonin → SMILES not found


In [11]:
# -----------------------------
# SAVE RESULTS
# -----------------------------
out_df = pd.DataFrame(results, columns=[
    "Compound", "Heat_of_Formation", "HOMO", "LUMO", "Dipole_Moment"
])

out_file = f"{BASE_DIR}/phenolics_PM7_descriptors.csv"
out_df.to_csv(out_file, index=False)

print(f"\nDONE ✅ Results saved to:\n{out_file}")


DONE ✅ Results saved to:
/home/omid/Documents/Data/phenolics_PM7_descriptors.csv


In [12]:
import shutil
shutil.which("mopac")


'/opt/mopac/bin/mopac'

In [14]:
import re

def parse_mopac_arc(arc_file):
    hof = homo = lumo = dipole = None

    with open(arc_file, "r") as f:
        for line in f:
            if "HEAT OF FORMATION" in line:
                hof = float(re.search(r"=\s+([-0-9.]+)", line).group(1))

            elif line.strip().startswith("DIPOLE"):
                dipole = float(re.search(r"=\s+([-0-9.]+)", line).group(1))

            elif "HOMO LUMO ENERGIES" in line:
                vals = re.findall(r"[-]?\d+\.\d+", line)
                homo, lumo = float(vals[0]), float(vals[1])

    return hof, homo, lumo, dipole


In [15]:
arc = "/home/omid/Documents/Data/mopac_inputs/1,5-Dihydroxyanthraquinone.arc"
print(parse_mopac_arc(arc))


(-107.08294, -9.392, -1.392, 2.37304)


In [17]:
rows = []

for compound in compounds:
    safe = compound.replace(" ", "_").replace(",", "")
    arc_file = f"{MOP_DIR}/{safe}.arc"

    if not os.path.exists(arc_file):
        continue

    hof, homo, lumo, dipole = parse_mopac_arc(arc_file)

    rows.append({
        "Compound": compound,
        "Heat_of_Formation": hof,
        "HOMO": homo,
        "LUMO": lumo,
        "Dipole_Moment": dipole
    })

import pandas as pd
df_desc = pd.DataFrame(rows)
df_desc.to_csv(
    "/home/omid/Documents/Data/phenolics_PM7_descriptors.csv",
    index=False
)

df_desc.head()


Unnamed: 0,Compound,Heat_of_Formation,HOMO,LUMO,Dipole_Moment
0,Caffeic acid,-72.08402,-8.061,-2.444,11.94241
1,Chlorogenic acid,-335.71654,-8.455,-1.265,4.67145
2,o-Coumaric acid,-4.3906,-8.569,-1.417,8.34206
3,m-Coumaric acid,-83.27963,-9.218,-0.956,5.71151
4,p-Coumaric acid,-20.22144,-8.678,-1.992,5.12332


In [18]:
import re

def parse_mopac_arc_extended(arc_file):
    hof = final_hof = homo = lumo = dipole = cosmo_area = core_core = None
    n_hydroxyl = 0
    atoms = []

    with open(arc_file, "r") as f:
        lines = f.readlines()

    # Read line by line
    for i, line in enumerate(lines):
        if "HEAT OF FORMATION" in line:
            hof = float(re.search(r"=\s+([-0-9.]+)", line).group(1))
        
        elif "HOMO LUMO ENERGIES" in line:
            vals = re.findall(r"[-]?\d+\.\d+", line)
            homo, lumo = float(vals[0]), float(vals[1])
        
        elif line.strip().startswith("DIPOLE"):
            dipole = float(re.search(r"=\s+([-0-9.]+)", line).group(1))
        
        elif "COSMO AREA" in line:
            cosmo_area = float(re.search(r"=\s+([-0-9.]+)", line).group(1))
        
        elif "CORE-CORE REPULSION" in line:
            core_core = float(re.search(r"=\s+([-0-9.]+)", line).group(1))
        
        # final heat of formation (sometimes same as HO formation, but keep separate)
        elif "FINAL GEOMETRY OBTAINED" in line:
            # next lines contain geometry
            geom_start = i+1
            for geom_line in lines[geom_start:]:
                if geom_line.strip() == "":
                    break
                # Count hydroxyl: O bonded to H
                if geom_line.startswith("O"):
                    atoms.append(geom_line.strip())
            # Simple heuristic: count O with H nearby
            n_hydroxyl = sum(1 for a in atoms if "+1" in a)

    return hof, final_hof, homo, lumo, dipole, cosmo_area, core_core, n_hydroxyl


In [27]:
import os
import pandas as pd

MOP_DIR = "/home/omid/Documents/Data/mopac_inputs"

rows = []
compounds = [line.strip() for line in open("/home/omid/Documents/Data/phenolics.csv")]

for compound in compounds:
    safe = compound.replace(" ", "_").replace(",", "")
    arc_file = f"{MOP_DIR}/{safe}.arc"
    if not os.path.exists(arc_file):
        print(f"Missing ARC: {compound}")
        continue
    
    hof, final_hof, homo, lumo, dipole, cosmo_area, core_core, n_hydroxyl = parse_mopac_arc_extended(arc_file)
    
    rows.append({
        "Compound": compound,
        "Heat_of_Formation": hof,
        "Final_Heat_of_Formation": final_hof,
        "HOMO": homo,
        "LUMO": lumo,
        "Dipole_Moment": dipole,
        "Cosmo_Area": cosmo_area,
        "Core_Core_Repulsion": core_core,
        "Num_Hydroxyl": n_hydroxyl
    })

df_desc = pd.DataFrame(rows)
df_desc.to_csv("/home/omid/Documents/Data/phenolics_PM7_descriptors_extended.csv", index=False)
df_desc.head()


Missing ARC: Compound
Missing ARC: "2,4-Hydroxybenzoic acid"
Missing ARC: (−)-Epigallocatechin gallate (EGCG)
Missing ARC: (−)-Epicatechin gallate (ECG)
Missing ARC: (−)-Epigallocatechin (EGC)
Missing ARC: (−)-Epicatechin (EC)
Missing ARC: Quercetin-3-rutinoside (rutin)
Missing ARC: Quercetin-3-rhamnoside (quercitrin)
Missing ARC: Kaempferol-3-glucoside (astragalin)
Missing ARC: Flavonol (ck)
Missing ARC: trans-chalcone (ck)
Missing ARC: Apigenin-8-glucoside (vitexin)
Missing ARC: Apigenin-7-glucoside (apigetrin)
Missing ARC: Baicalein-7-glucuronide (baicalin)
Missing ARC: Flavone (ck)
Missing ARC: Naringenin-7-rutinoside (naringin)
Missing ARC: Hesperetin-7-rutinoside (hesperidin)
Missing ARC: Flavanone (ck)
Missing ARC: Genistein-7-glucoside (genistin)
Missing ARC: Daidzein-7-glucoside (daidzin)
Missing ARC: Isoflavone (ck)
Missing ARC: Catechin 3-O-gallate (monomer)
Missing ARC: Procyanidin B-1 (dimer)
Missing ARC: Procyanidin B-2 digallate (dimer)
Missing ARC: Procyanidin C-1 (trim

Unnamed: 0,Compound,Heat_of_Formation,Final_Heat_of_Formation,HOMO,LUMO,Dipole_Moment,Cosmo_Area,Core_Core_Repulsion,Num_Hydroxyl
0,Caffeic acid,-72.08402,,-8.061,-2.444,11.94241,200.05,,0
1,Chlorogenic acid,-335.71654,,-8.455,-1.265,4.67145,349.78,,0
2,o-Coumaric acid,-4.3906,,-8.569,-1.417,8.34206,190.42,,0
3,m-Coumaric acid,-83.27963,,-9.218,-0.956,5.71151,186.06,,0
4,p-Coumaric acid,-20.22144,,-8.678,-1.992,5.12332,190.89,,0


In [37]:
import re
import math

def parse_mopac_arc_corrected(arc_file):
    hof = homo = lumo = dipole = cosmo_area = core_core = None
    atoms = []

    with open(arc_file, "r") as f:
        lines = f.readlines()

    in_geometry = False
    geom_lines = []

    for line in lines:
        # Heat of formation
        if "HEAT OF FORMATION" in line:
            hof = float(re.search(r"=\s+([-0-9.]+)", line).group(1))
        # HOMO / LUMO
        elif "HOMO LUMO ENERGIES" in line:
            vals = re.findall(r"[-]?\d+\.\d+", line)
            homo, lumo = float(vals[0]), float(vals[1])
        # Dipole
        elif line.strip().startswith("DIPOLE"):
            dipole = float(re.search(r"=\s+([-0-9.]+)", line).group(1))
        # Cosmo area
        elif "COSMO AREA" in line:
            cosmo_area = float(re.search(r"=\s+([-0-9.]+)", line).group(1))
        # Core-Core repulsion
        elif "CORE-CORE REPULSION" in line:
            core_core = float(re.search(r"=\s+([-0-9.]+)", line).group(1))
        # Start of final geometry
        elif "FINAL GEOMETRY OBTAINED" in line:
            in_geometry = True
            continue
        # Stop reading geometry at blank line
        elif in_geometry and line.strip() == "":
            in_geometry = False
        # Collect geometry lines
        if in_geometry:
            geom_lines.append(line.strip())

    # Process geometry to count hydroxyls
    # Hydroxyl = O atom bonded to H ~1 Å away
    atoms_coords = []
    for gl in geom_lines:
        if gl == "":
            continue
        parts = gl.split()
        if limport os
import re
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors

# ===== SETTINGS =====
data_dir = "/home/omid/Documents/Data"  # Your main data folder
mopac_out_dir = os.path.join(data_dir, "mopac_outputs")  # folder with .out files
csv_input = os.path.join(data_dir, "phenolics.csv")
csv_output = os.path.join(data_dir, "phenolics_PM7_descriptors_extended.csv")

# ===== HELPER FUNCTIONS =====
def parse_mopac_out(file_path):
    """Parse a MOPAC .out file and return a dictionary of descriptors"""
    data = {}
    try:
        with open(file_path, 'r') as f:
            text = f.read()

        # Heat of formation
        m = re.search(r'HEAT OF FORMATION\s*=\s*([-\d.]+)', text)
        data['Heat_of_Formation'] = float(m.group(1)) if m else None

        # HOMO and LUMO
        m = re.search(r'HOMO LUMO ENERGIES \(EV\)\s*=\s*([-\d.]+)\s+([-\d.]+)', text)
        if m:
            data['HOMO'] = float(m.group(1))
            data['LUMO'] = float(m.group(2))
            data['HOMO-LUMO_Gap'] = data['LUMO'] - data['HOMO']
            # Hardness & Softness
            data['Hardness'] = data['HOMO-LUMO_Gap'] / 2
            data['Softness'] = 1 / data['Hardness'] if data['Hardness'] != 0 else None
            # Chemical potential & Electrophilicity
            mu = (data['HOMO'] + data['LUMO']) / 2
            eta = data['Hardness']
            data['Electrophilicity'] = (mu**2)/(2*eta) if eta else None
        else:
            data['HOMO'] = data['LUMO'] = data['HOMO-LUMO_Gap'] = None
            data['Hardness'] = data['Softness'] = data['Electrophilicity'] = None

        # Dipole Moment
        m = re.search(r'DIPOLE\s*=\s*([-\d.]+)', text)
        data['Dipole_Moment'] = float(m.group(1)) if m else None

        # COSMO area and volume
        m = re.search(r'COSMO AREA\s*=\s*([-\d.]+)', text)
        data['COSMO_Area'] = float(m.group(1)) if m else None
        m = re.search(r'COSMO VOLUME\s*=\s*([-\d.]+)', text)
        data['COSMO_Volume'] = float(m.group(1)) if m else None

        # Core-Core Repulsion
        m = re.search(r'TOTAL CORE-CORE REPULSION\s*=\s*([-\d.]+)', text)
        data['Core_Core_Repulsion'] = float(m.group(1)) if m else None

        # Ionization potential
        m = re.search(r'IONIZATION POTENTIAL\s*=\s*([-\d.]+)', text)
        data['Ionization_Potential'] = float(m.group(1)) if m else None

        # Electron affinity (if printed)
        m = re.search(r'ELECTRON AFFINITY\s*=\s*([-\d.]+)', text)
        data['Electron_Affinity'] = float(m.group(1)) if m else None

        # Molecular weight
        m = re.search(r'MOLECULAR WEIGHT\s*=\s*([-\d.]+)', text)
        data['Mol_Weight'] = float(m.group(1)) if m else None

    except FileNotFoundError:
        print(f"[WARNING] Missing ARC file for: {file_path}")
        return None

    return data

def compute_rdkit_descriptors(smiles):
    """Compute molecular descriptors using RDKit"""
    data = {}
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return {}

        # Hydroxy groups
        data['Num_Hydroxy'] = len(mol.GetSubstructMatches(Chem.MolFromSmarts('[OX2H]')))
        # Carbonyl groups
        data['Num_Carbonyl'] = len(mol.GetSubstructMatches(Chem.MolFromSmarts('[CX3]=[OX1]')))
        # Aromatic rings
        data['Num_Aromatic_Rings'] = rdMolDescriptors.CalcNumAromaticRings(mol)
        # Rotatable bonds
        data['Num_Rotatable_Bonds'] = Descriptors.NumRotatableBonds(mol)
        # TPSA
        data['TPSA'] = rdMolDescriptors.CalcTPSA(mol)
        # LogP
        data['LogP'] = Descriptors.MolLogP(mol)
        # H-bond donors & acceptors
        data['H_Bond_Donors'] = rdMolDescriptors.CalcNumHBD(mol)
        data['H_Bond_Acceptors'] = rdMolDescriptors.CalcNumHBA(mol)
    except:
        return {}
    return data

# ===== LOAD CSV =====
df = pd.read_csv(csv_input)

# Add new columns
descriptor_cols = ['Heat_of_Formation','HOMO','LUMO','Dipole_Moment','COSMO_Area',
                   'COSMO_Volume','Core_Core_Repulsion','HOMO-LUMO_Gap','Ionization_Potential',
                   'Electron_Affinity','Hardness','Softness','Electrophilicity','Mol_Weight',
                   'Num_Hydroxy','Num_Carbonyl','Num_Aromatic_Rings','Num_Rotatable_Bonds',
                   'TPSA','LogP','H_Bond_Donors','H_Bond_Acceptors']
for col in descriptor_cols:
    df[col] = None

# ===== PROCESS EACH COMPOUND =====
for idx, row in df.iterrows():
    compound = row['Compound']
    # Construct expected .out file
    out_file = os.path.join(mopac_out_dir, f"{compound}.out")
    mopac_data = parse_mopac_out(out_file)
    if mopac_data:
        for k, v in mopac_data.items():
            df.at[idx, k] = v

    # Compute SMILES-based descriptors if available
    # Optional: you can store SMILES in CSV or define them here
    smiles = row.get('SMILES', None)
    if smiles:
        rdkit_data = compute_rdkit_descriptors(smiles)
        for k, v in rdkit_data.items():
            df.at[idx, k] = v

# ===== SAVE CSV =====
df.to_csv(csv_output, index=False)
print(f"Extended descriptors saved to {csv_output}")
en(parts) >= 4:
            element = parts[0]
            x, y, z = map(float, parts[1:4])
            atoms_coords.append((element, x, y, z))

    n_hydroxyl = 0
    for i, (elem1, x1, y1, z1) in enumerate(atoms_coords):
        if elem1 != "O":
            continue
        # Check for H within ~1.2 Å
        for j, (elem2, x2, y2, z2) in enumerate(atoms_coords):
            if elem2 != "H":
                continue
            dist = math.sqrt((x1-x2)**2 + (y1-y2)**2 + (z1-z2)**2)
            if dist < 1.2:
                n_hydroxyl += 1
                break  # Only count once per O

    return hof, homo, lumo, dipole, cosmo_area, core_core, n_hydroxyl


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 187)

In [38]:
import os
import pandas as pd

MOP_DIR = "/home/omid/Documents/Data/mopac_inputs"

df_input = pd.read_csv("/home/omid/Documents/Data/phenolics.csv")
rows = []

for compound in df_input['Compound']:
    safe = compound.replace(" ", "_").replace(",", "")
    arc_file = os.path.join(MOP_DIR, f"{safe}.arc")
    if not os.path.exists(arc_file):
        print(f"Missing ARC: {compound}")
        continue
    
    hof, homo, lumo, dipole, cosmo_area, core_core, n_hydroxyl = parse_mopac_arc_corrected(arc_file)
    rows.append({
        "Compound": compound,
        "Heat_of_Formation": hof,
        "HOMO": homo,
        "LUMO": lumo,
        "Dipole_Moment": dipole,
        "Cosmo_Area": cosmo_area,
        "Core_Core_Repulsion": core_core,
        "Num_Hydroxyl": n_hydroxyl
    })

df_desc = pd.DataFrame(rows)
df_desc.to_csv("/home/omid/Documents/Data/phenolics_PM7_descriptors_extended.csv", index=False)
df_desc.head()


Missing ARC: 2,4-Hydroxybenzoic acid
Missing ARC: (−)-Epigallocatechin gallate (EGCG)
Missing ARC: (−)-Epicatechin gallate (ECG)
Missing ARC: (−)-Epigallocatechin (EGC)
Missing ARC: (−)-Epicatechin (EC)
Missing ARC: Quercetin-3-rutinoside (rutin)
Missing ARC: Quercetin-3-rhamnoside (quercitrin)
Missing ARC: Kaempferol-3-glucoside (astragalin)
Missing ARC: Flavonol (ck)
Missing ARC: trans-chalcone (ck)
Missing ARC: Apigenin-8-glucoside (vitexin)
Missing ARC: Apigenin-7-glucoside (apigetrin)
Missing ARC: Baicalein-7-glucuronide (baicalin)
Missing ARC: Flavone (ck)
Missing ARC: Naringenin-7-rutinoside (naringin)
Missing ARC: Hesperetin-7-rutinoside (hesperidin)
Missing ARC: Flavanone (ck)
Missing ARC: Genistein-7-glucoside (genistin)
Missing ARC: Daidzein-7-glucoside (daidzin)
Missing ARC: Isoflavone (ck)
Missing ARC: Catechin 3-O-gallate (monomer)
Missing ARC: Procyanidin B-1 (dimer)
Missing ARC: Procyanidin B-2 digallate (dimer)
Missing ARC: Procyanidin C-1 (trimer)
Missing ARC: Chinese

Unnamed: 0,Compound,Heat_of_Formation,HOMO,LUMO,Dipole_Moment,Cosmo_Area,Core_Core_Repulsion,Num_Hydroxyl
0,Caffeic acid,-72.08402,-8.061,-2.444,11.94241,200.05,,3
1,Chlorogenic acid,-335.71654,-8.455,-1.265,4.67145,349.78,,6
2,o-Coumaric acid,-4.3906,-8.569,-1.417,8.34206,190.42,,2
3,m-Coumaric acid,-83.27963,-9.218,-0.956,5.71151,186.06,,2
4,p-Coumaric acid,-20.22144,-8.678,-1.992,5.12332,190.89,,2


# The Topological Feature Extraction 

In [1]:
import pandas as pd
import numpy as np
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem.EState import EState_VSA

In [2]:
INPUT_CSV = "phenolics_PM7_descriptors.csv"   # must contain 'Compound'
OUTPUT_CSV = "phenolics_full_descriptors.csv"

In [10]:
import pandas as pd
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen, rdMolDescriptors

df = pd.read_csv("phenolics_PM7_descriptors_extended.csv")

smiles_list = []

for name in df["Compound"]:
    try:
        compounds = pcp.get_compounds(name, "name")
        smiles = compounds[0].canonical_smiles if compounds else None
    except:
        smiles = None
    smiles_list.append(smiles)

df["SMILES"] = smiles_list
df.to_csv("phenolics_with_smiles.csv", index=False)


  smiles = compounds[0].canonical_smiles if compounds else None


In [11]:
from rdkit.Chem import Lipinski, Descriptors

results = []

for smi in df["SMILES"]:
    mol = Chem.MolFromSmiles(smi) if pd.notna(smi) else None
    if mol is None:
        results.append([None]*5)
        continue

    num_hydroxyl = sum(
        1 for atom in mol.GetAtoms()
        if atom.GetSymbol() == "O" and atom.GetTotalNumHs() > 0
    )

    results.append([
        num_hydroxyl,
        rdMolDescriptors.CalcNumAromaticRings(mol),
        rdMolDescriptors.CalcTPSA(mol),
        Crippen.MolLogP(mol),
        Descriptors.MolWt(mol),
    ])

desc_df = pd.DataFrame(results, columns=[
    "Num_Hydroxyl",
    "Num_AromaticRings",
    "TPSA",
    "LogP",
    "MolWt"
])

final_df = pd.concat([df, desc_df], axis=1)
final_df.to_csv("phenolics_descriptors.csv", index=False)


In [14]:
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors, Crippen, Lipinski

def topo_descriptors(mol):
    return {
        "MolWt": Descriptors.MolWt(mol),
        "HeavyAtomCount": mol.GetNumHeavyAtoms(),
        "NumAromaticRings": rdMolDescriptors.CalcNumAromaticRings(mol),
        "NumRotatableBonds": Lipinski.NumRotatableBonds(mol),
        "TPSA": rdMolDescriptors.CalcTPSA(mol),
        "MolLogP": Crippen.MolLogP(mol),
        "FractionCSP3": rdMolDescriptors.CalcFractionCSP3(mol),
        "Chi0": Descriptors.Chi0(mol),
        "Chi1": Descriptors.Chi1(mol),
        "Chi0v": Descriptors.Chi0v(mol),
        "Chi1v": Descriptors.Chi1v(mol),
        "Kappa1": Descriptors.Kappa1(mol),
        "Kappa2": Descriptors.Kappa2(mol),
        "Kappa3": Descriptors.Kappa3(mol),
        "BalabanJ": Descriptors.BalabanJ(mol),
        "BertzCT": Descriptors.BertzCT(mol),
        "HallKierAlpha": Descriptors.HallKierAlpha(mol),
    }


In [15]:
desc_rows = []

for smi in df["SMILES"]:
    mol = Chem.MolFromSmiles(smi)
    desc_rows.append(topo_descriptors(mol) if mol else None)

topo_df = pd.DataFrame(desc_rows)
final_df = pd.concat([df, topo_df], axis=1)
final_df.to_csv("phenolics_full_descriptors.csv", index=False)  

In [16]:
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors, Crippen, Lipinski

def topo_descriptors(mol):
    ring_info = mol.GetRingInfo()

    return {
        # Size & complexity
        "MolWt": Descriptors.MolWt(mol),
        "HeavyAtomCount": mol.GetNumHeavyAtoms(),
        "RingCount": ring_info.NumRings(),

        # Ring topology
        "NumAromaticRings": rdMolDescriptors.CalcNumAromaticRings(mol),
        "NumAliphaticRings": rdMolDescriptors.CalcNumAliphaticRings(mol),

        # Flexibility & polarity
        "NumRotatableBonds": Lipinski.NumRotatableBonds(mol),
        "TPSA": rdMolDescriptors.CalcTPSA(mol),

        # Lipophilicity
        "MolLogP": Crippen.MolLogP(mol),

        # Shape
        "FractionCSP3": rdMolDescriptors.CalcFractionCSP3(mol),

        # Connectivity
        "Chi0": Descriptors.Chi0(mol),
        "Chi1": Descriptors.Chi1(mol),
        "Chi0v": Descriptors.Chi0v(mol),
        "Chi1v": Descriptors.Chi1v(mol),

        # Shape indices
        "Kappa1": Descriptors.Kappa1(mol),
        "Kappa2": Descriptors.Kappa2(mol),
        "Kappa3": Descriptors.Kappa3(mol),

        # Graph complexity
        "BalabanJ": Descriptors.BalabanJ(mol),
        "BertzCT": Descriptors.BertzCT(mol),
        "HallKierAlpha": Descriptors.HallKierAlpha(mol),
    }


In [17]:
desc_rows = []

for smi in df["SMILES"]:
    mol = Chem.MolFromSmiles(smi)
    if mol:
        desc_rows.append(topo_descriptors(mol))
    else:
        desc_rows.append({})

topo_df = pd.DataFrame(desc_rows)
df_final = pd.concat([df, topo_df], axis=1)
df_final.to_csv("phenolics_full_descriptors.csv", index=False)