# Processing the output into one datafile

The function below was used to produce a `.json` file for each of the systems we have studied. It can take a bit of time to run, as it needs to parse the `OUTCAR` files of all the calculations. Directly loading the data from a `.json` file is faster, which is why that approach is used in [the parallel_analysis.ipynb notebook](parallel_analysis.ipynb).

In [1]:
import os, json
import numpy as np

from monty.re import regrep
from monty.json import MontyEncoder
from pymatgen import Structure
from pymatgen.io.vasp.inputs import Incar
from ipywidgets import interact, fixed, FloatSlider, Text

# Root directory under which the per-cluster benchmark directories
# ("leibniz", "breniac") processed by the cells below are looked up.
data_dir = "/mnt/data/mbercx/"

def process_parallel(data_dir, output_file=None, verbose=False):
    """
    Gather the electronic-loop timings of a set of VASP parallelisation
    benchmarks and write them to a single ``.json`` file.

    The calculations are looked up in
    ``<data_dir>/<nodes>nodes/<kpar>kpar/<npar>npar/``. For every calculation
    the ``INCAR`` is read to obtain ``NELMDL``, and the ``OUTCAR`` is scanned
    for the "LOOP" real times and the total number of cores. The first
    ``abs(NELMDL)`` loop entries are discarded and the remaining ones are
    averaged. Calculations that do not have more than ``abs(NELMDL)`` loop
    entries are skipped, and a missing ``OUTCAR`` is reported and skipped.

    The number of k-points, the number of bands and the structure (from the
    ``POSCAR``) are taken from the first calculation that has an ``OUTCAR``.

    Args:
        data_dir (str): Directory that contains the ``*nodes`` directories.
        output_file (str): Path of the ``.json`` file to write. When None,
            the file is written to the ``data`` directory, with a name built
            from two components of ``data_dir``, the composition, the number
            of bands and the number of k-points.
        verbose (bool): Print a message for each calculation that is skipped
            because it has too few loop entries.

    Returns:
        dict: The data written to the file, with keys "structure", "nbands",
            "n_kpoints" and "timing_list". Each entry of "timing_list" is a
            dict with keys "nodes", "kpar", "ncore", "npar" and "timing".

    Raises:
        ValueError: If no calculation with an ``OUTCAR`` file was found.
    """
    cores_pattern = r"\s+running\son\s+(\S+)\stotal\scores"
    nkp_pattern = r"k-points\s+NKPTS\s=\s+([0-9]+)\s+.*"
    nbands_pattern = r".*NBANDS=\s+([0-9]+)"
    loop_pattern = r"\s+LOOP:\s+cpu\stime.+:\sreal\stime(.+)"

    timing_list = []
    n_kpoints = 0
    nbands = 0
    structure = None

    # NOTE: str.strip() removes a *set of characters* from both ends, not a
    # suffix. That is fine as long as the rest of the name consists of digits
    # only, i.e. for directory names such as "16nodes", "2kpar" or "7npar".
    for nodes_dir in os.listdir(data_dir):
        nodes = int(nodes_dir.strip("nodes"))

        for kpar_dir in os.listdir(os.path.join(data_dir, nodes_dir)):
            kpar = int(kpar_dir.strip("kpar"))

            for npar_dir in os.listdir(
                os.path.join(data_dir, nodes_dir, kpar_dir)
            ):
                npar = int(npar_dir.strip("npar"))
                calc_dir = os.path.join(
                    data_dir, nodes_dir, kpar_dir, npar_dir
                )

                nelmdl = np.abs(
                    Incar.from_file(
                        os.path.join(calc_dir, "INCAR")
                    ).get("NELMDL", 5)
                )

                # Everything needed from the OUTCAR is collected in a single
                # pass over the file. The k-point/band patterns are only
                # added while the system metadata is still unknown.
                need_metadata = n_kpoints == 0
                patterns = {"loop": loop_pattern, "cores": cores_pattern}
                if need_metadata:
                    patterns["nkp"] = nkp_pattern
                    patterns["nbands"] = nbands_pattern

                # Only the OUTCAR read is guarded, so that a missing OUTCAR
                # is reported and skipped for *every* calculation, including
                # the first one that is visited.
                try:
                    matches = regrep(
                        os.path.join(calc_dir, "OUTCAR"), patterns
                    )
                except FileNotFoundError:
                    print(
                        "No OUTCAR file found for : {}nodes {}kpar {}npar"
                        .format(nodes, kpar, npar)
                    )
                    continue

                if need_metadata:
                    n_kpoints = int(matches["nkp"][0][0][0])
                    nbands = int(matches["nbands"][0][0][0])
                    structure = Structure.from_file(
                        os.path.join(calc_dir, "POSCAR")
                    )

                loop_timing = matches["loop"]

                if len(loop_timing) > nelmdl:
                    # Skip the first abs(NELMDL) loop entries and average
                    # the real time of the remaining ones.
                    average_loop = np.mean(
                        [float(e[0][0]) for e in loop_timing][nelmdl:]
                    )
                    total_cores = int(matches["cores"][0][0][0])
                    ncore = total_cores // kpar // npar

                    timing_list.append(
                        {"nodes": nodes, "kpar": kpar, "ncore": ncore,
                         "npar": npar, "timing": average_loop}
                    )
                elif verbose:
                    print(
                        "{} {} {} only has {} timesteps."
                        .format(nodes, npar, kpar, len(loop_timing))
                    )

    if structure is None:
        raise ValueError(
            "No calculation with an OUTCAR file was found in "
            + str(data_dir)
        )

    data = {
        "structure": structure.as_dict(),
        "nbands": nbands,
        "n_kpoints": n_kpoints,
        "timing_list": timing_list
    }

    if output_file is None:
        # The default name uses the 5th and 6th "/"-separated components of
        # data_dir. NOTE(review): this assumes a path that is at least that
        # deep (e.g. "/mnt/data/<user>/<cluster>/<system>/..."); a shorter
        # path raises an IndexError - confirm this is the intended layout.
        path_parts = data_dir.split("/")
        output_file = path_parts[4]
        output_file += "_" + path_parts[5]
        output_file += "_" + str(structure.composition).replace(" ", "")
        output_file += "_B" + str(nbands)
        output_file += "_K" + str(n_kpoints)
        output_file += ".json"
        output_file = os.path.join("data", output_file)

    with open(output_file, "w") as handle:
        json.dump(data, handle, cls=MontyEncoder)

    return data

The cells below each process one set of data into a corresponding json file and save it to the `data` directory.

**Note: The notebook does not have access to any of the `data_dir`'s used below when running on Binder. It is simply added to the repository for completeness.**

### Leibniz

In [26]:
# Run process_parallel on every second-level directory below "leibniz".
leibniz_root = os.path.join(data_dir, "leibniz")
for d1 in os.listdir(leibniz_root):
    d1_path = os.path.join(leibniz_root, d1)
    for d2 in os.listdir(d1_path):
        process_parallel(os.path.join(d1_path, d2))

### Breniac

In [2]:
# Run process_parallel on every second-level directory below "breniac".
breniac_root = os.path.join(data_dir, "breniac")
for d1 in os.listdir(breniac_root):
    d1_path = os.path.join(breniac_root, d1)
    for d2 in os.listdir(d1_path):
        process_parallel(os.path.join(d1_path, d2))

No OUTCAR file found for : 16nodes 2kpar 7npar


## Appendix A: Sanity check

In [26]:
def check_timing(data_dir, timing):
    """
    Recompute one timing entry from the raw output files and compare it with
    the stored values.

    The calculation is looked up in
    ``<data_dir>/<nodes>nodes/<kpar>kpar/<npar>npar/``, using the "nodes",
    "kpar" and "npar" values of ``timing``.

    Args:
        data_dir (str): Directory that contains the ``*nodes`` directories.
        timing (dict): Entry with keys "nodes", "kpar", "npar", "ncore" and
            "timing".

    Returns:
        tuple: ``(time_check, cores_check)``. ``time_check`` is truthy when
            the mean loop time recomputed from the ``OUTCAR`` (skipping the
            first ``abs(NELMDL)`` entries) is within 1e-4 of
            ``timing["timing"]``. ``cores_check`` is True when the total
            number of cores in the ``OUTCAR`` equals ncore * npar * kpar.
    """
    loop_pattern = r"\s+LOOP:\s+cpu\stime.+:\sreal\stime(.+)"
    cores_pattern = r"\s+running\son\s+(\S+)\stotal\scores"

    calc_dir = os.path.join(
        data_dir,
        str(timing["nodes"]) + "nodes",
        str(timing["kpar"]) + "kpar",
        str(timing["npar"]) + "npar",
    )
    outcar_file = os.path.join(calc_dir, "OUTCAR")

    incar = Incar.from_file(os.path.join(calc_dir, "INCAR"))
    nelmdl = abs(incar.get("NELMDL", -5))

    # Both patterns are collected in a single pass over the OUTCAR.
    matches = regrep(
        outcar_file, {"loop": loop_pattern, "cores": cores_pattern}
    )
    loop_times = np.array(
        [float(entry[0][0]) for entry in matches["loop"][nelmdl:]]
    )
    cores = int(matches["cores"][0][0][0])

    # Compare the *absolute* deviation: a signed difference would let any
    # recomputed mean that is smaller than the stored timing pass the check,
    # no matter how far off it is.
    time_check = abs(np.mean(loop_times) - timing["timing"]) < 1e-4
    cores_check = cores == timing["ncore"] * timing["npar"] * timing["kpar"]

    return time_check, cores_check

In [27]:
# Print every timing entry for which at least one sanity check fails.
# NOTE(review): `data` is not assigned at top level in the cells shown here;
# presumably it holds the result of an earlier process_parallel call - confirm.
for timing in data["timing_list"]:
    time_ok, cores_ok = check_timing(data_dir=data_dir, timing=timing)
    if not (time_ok and cores_ok):
        print(timing)