# extract information from `.out` files



In [2]:
# set project root
import os
import sys
import rootutils

# Find the project root by searching from the current directory for a
# `.env` indicator file, then let rootutils configure the session around it.
root_dir = rootutils.find_root(search_from='.', indicator='.env')
rootutils.set_root(
    path=root_dir,
    project_root_env_var=True,  # export the root as the PROJECT_ROOT env var
    dotenv=True,  # load variables from the dotenv file
    cwd=True,  # make the root the working directory
    pythonpath=False,  # leave PYTHONPATH untouched
)

# Read the root back from the environment variable set above and make sure
# the working directory really is the project root.
PROJECT_ROOT = os.environ["PROJECT_ROOT"]
os.chdir(PROJECT_ROOT)
PROJECT_ROOT

'/home/louzekun/orca_cal/qm7_ccsd_t_new'

In [3]:
import re
import csv
import pandas
from tqdm import tqdm


I use GitHub Copilot here: I simply paste an example of the output into the cell and let Copilot do the rest.


In [None]:
"""
infomation to extract:
raw:
```
Number of atoms                             ...      9
Number of basis functions                   ...    192
Number of shells                            ...     84
Maximum angular momentum                    ...      3
Triples Correction (T)                     ...     -0.013072999
Final correlation energy                   ...     -0.414534966
E(CCSD)                                    ...    -79.660833025
E(CCSD(T))                                 ...    -79.673906023
FINAL SINGLE POINT ENERGY      -111.238840552281
TOTAL RUN TIME: 0 days 0 hours 1 minutes 8 seconds 617 msec
```
"""
# Shared pattern fragments.  Most lines look like `<label>   ...   <value>`,
# so a pattern is the (escaped) label, the dotted separator, and one capture
# group for the value.
_DOTS = r"\s+\.\.\.\s+"  # whitespace, a literal "...", whitespace
_INT = r"(\d+)"  # unsigned integer value
_FLOAT = r"(-?\d+\.\d+)"  # optionally negative decimal value

# Maps each field name to the regex whose first group captures its value.
# The insertion order of this dict is the order in which fields are stored.
out_regex = {
    "Number of atoms": r"Number of atoms" + _DOTS + _INT,
    "Number of basis functions": r"Number of basis functions" + _DOTS + _INT,
    "Number of shells": r"Number of shells" + _DOTS + _INT,
    "Triples Correction (T)": r"Triples Correction \(T\)" + _DOTS + _FLOAT,
    "Final correlation energy": r"Final correlation energy" + _DOTS + _FLOAT,
    "E(CCSD)": r"E\(CCSD\)" + _DOTS + _FLOAT,
    # "E(CCSD(T))" is deliberately not extracted: per the original note it is
    # the same value as the final single point energy.
    "Maximum angular momentum": r"Maximum angular momentum" + _DOTS + _INT,
    # The last two lines have no "..." separator, so they are spelled out.
    "FINAL SINGLE POINT ENERGY": r"FINAL SINGLE POINT ENERGY\s+" + _FLOAT,
    "TOTAL RUN TIME": r"TOTAL RUN TIME: (\d+ days \d+ hours \d+ minutes \d+ seconds \d+ msec)",
}


In [None]:
out_dpath = os.path.join(PROJECT_ROOT, "out")

# Collect the path of the single `.out` file of every finished job.
# Each entry of `out/` is expected to be a per-job directory; a job counts as
# finished when its directory contains an entry named "finished".
out_each_dpaths = [os.path.join(out_dpath, d) for d in os.listdir(out_dpath)]
out_fpaths = []
for out_each_dpath in tqdm(out_each_dpaths):
    # A stray regular file directly under `out/` would make os.listdir()
    # raise NotADirectoryError, so anything that is not a directory is skipped.
    if not os.path.isdir(out_each_dpath):
        continue
    # List the directory once and reuse the listing for both checks below.
    entries = os.listdir(out_each_dpath)
    if "finished" not in entries:
        continue  # not finished
    # the out file is the one entry whose name ends with .out
    out_file = [f for f in entries if f.endswith(".out")]
    assert len(out_file) == 1, "expected exactly one .out file in {}, found {}".format(
        out_each_dpath, len(out_file)
    )
    out_file = out_file[0]
    out_fpaths.append(os.path.join(out_each_dpath, out_file))

# notebook display: number of files found and a small sample of the paths
len(out_fpaths), out_fpaths[0:3]

In [None]:
def extract_info(out_file):
    """Parse one `.out` file into a flat dict of extracted fields.

    Each pattern in the module-level ``out_regex`` is searched for in the
    file's text; the first capture group of the first match is stored, as a
    string, under the pattern's key.

    Args:
        out_file: path of the `.out` file to read.

    Returns:
        A dict holding ``"name"`` (the file name with its last four
        characters, i.e. ``.out``, removed), one string entry per key of
        ``out_regex``, and ``"runtime"`` (the total run time converted to
        seconds).  If any pattern has no match, a message is printed and
        ``None`` is returned instead.
    """
    with open(out_file, "r") as fh:
        text = fh.read()

    # file name without the trailing ".out"
    info = {"name": os.path.basename(out_file)[:-4]}

    for field, pattern in out_regex.items():
        found = re.search(pattern, text)
        if found is None:
            print("Error: {} not found in {}".format(field, out_file))
            return None
        info[field] = found.group(1)

    # Split "D days H hours M minutes S seconds MS msec" into its five
    # numbers and weight each one by the number of seconds per unit.
    parts = re.search(
        r"(\d+) days (\d+) hours (\d+) minutes (\d+) seconds (\d+) msec",
        info["TOTAL RUN TIME"],
    )
    seconds_per_unit = (24*3600, 3600, 60, 1, 1e-3)
    info["runtime"] = sum(
        int(amount) * weight
        for amount, weight in zip(parts.groups(), seconds_per_unit)
    )
    return info

def write_csv(out_files, csv_file):
    """Extract the fields of several `.out` files and write them to a CSV file.

    The first row is a header: ``"out_file"`` followed by the keys of the
    module-level ``out_regex``.  Every following row holds the path of one
    file and its extracted values in the same key order.

    Args:
        out_files: iterable of `.out` file paths to process.
        csv_file: path of the CSV file to create (an existing file is
            overwritten).

    Files for which ``extract_info`` returns ``None`` are skipped.
    """
    columns = list(out_regex.keys())
    # newline="" hands line-ending control to the csv module, as its
    # documentation requires; without it an extra "\r" is written on platforms
    # that translate "\n", which shows up as blank rows.
    with open(csv_file, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["out_file"] + columns)
        for out_file in out_files:
            info = extract_info(out_file)
            if info is None:
                continue
            writer.writerow([out_file] + [info[key] for key in columns])



# Parse every collected file; files for which extract_info() returns None
# are left out.
out_infos = []
for out_fpath in tqdm(out_fpaths):
    info = extract_info(out_fpath)
    if info is not None:
        out_infos.append(info)

# One row per file.  The raw "TOTAL RUN TIME" text is dropped (the numeric
# "runtime" column remains) and the rows are ordered by name.
df = (
    pandas.DataFrame(out_infos)
    .drop(columns=["TOTAL RUN TIME"])
    .sort_values(by=["name"])
)
# lower-case every column name
df.columns = list(map(str.lower, df.columns))

df.head()


In [None]:

# Persist the table twice inside <PROJECT_ROOT>/notebooks: as CSV (without
# the index column) and as a pickle.
notebooks_dpath = os.path.join(PROJECT_ROOT, "notebooks")
csv_fpath = os.path.join(notebooks_dpath, "orca_out.csv")
pkl_fpath = os.path.join(notebooks_dpath, "orca_out.pkl")
df.to_csv(csv_fpath, index=False)
df.to_pickle(pkl_fpath)
