In [24]:
import subprocess
import os
import shlex
import logging
import shutil
import re
from base64 import b64decode, b64encode
import json

from copy import deepcopy

from lxml.etree import tostring
from lxml.html import soupparser

# Module-level logger used by the helpers below (run_command logs through it).
log = logging.getLogger(__name__)
# DEBUG level so that the log.debug() messages emitted by run_command are shown.
logging.basicConfig(level=logging.DEBUG)

In [25]:
def run_command(command):
    """Run an external command, log its output and return the completed process.

    Args:
        command: Either a shell-like string (split with ``shlex.split``; no
            shell is involved) or an already-split sequence of arguments.

    Returns:
        The ``subprocess.CompletedProcess`` with ``stdout``/``stderr`` captured
        as bytes.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. The captured output is logged before the error propagates.
    """
    log.debug(f"Command: {command}")
    args = shlex.split(command) if isinstance(command, str) else list(command)

    def log_streams(level, stdout, stderr):
        # errors="replace" keeps logging from failing on non-UTF-8 output.
        if stdout:
            log.log(level, f"Command stdout: {stdout.decode('utf-8', errors='replace')}")
        if stderr:
            log.log(level, f"Command stderr: {stderr.decode('utf-8', errors='replace')}")

    try:
        result = subprocess.run(args, capture_output=True, check=True)
    except subprocess.CalledProcessError as error:
        # check=True raises before the regular logging below would run, so the
        # output of a failing command has to be logged here.
        log_streams(logging.ERROR, error.stdout, error.stderr)
        raise
    log_streams(logging.DEBUG, result.stdout, result.stderr)
    return result

def generate_help_files(work_dir, def_files, versions, database_dir):
    """Generate helpdoc XML/HTML files for one or more Quantum ESPRESSO versions.

    Sparse-clones the q-e repository into ``work_dir`` (which is deleted first
    if it exists), checks out each requested release tag, runs
    ``dev-tools/helpdoc`` on every ``.def`` file and moves the ``.xml`` and
    ``.html`` files found next to each ``.def`` file into
    ``<database_dir>/qe-<version>/``.

    Args:
        work_dir: Scratch directory, resolved against the current working
            directory. Removed and recreated on every call.
        def_files: Repository-relative paths of the ``.def`` files to process.
        versions: A single version string or an iterable of version strings.
        database_dir: Output directory, resolved against the current working
            directory.

    Raises:
        subprocess.CalledProcessError: Propagated from ``run_command`` when a
            git/helpdoc command fails. The original working directory is
            restored in that case as well.
    """
    # Minimal files needed for helpdoc to work
    helpdoc_files = ['dev-tools/helpdoc', 'dev-tools/helpdoc.d', 'dev-tools/helpdoc.schema', 'dev-tools/input_xx.xsl', 'GUI/Guib/lib']
    # Release tags that carry a suffix after the version number.
    tag_suffixes = {"6.3": "MaX", "6.5": "MaX", "6.7": "MaX-Release"}

    if isinstance(versions, str):
        versions = [versions]
    def_files = list(def_files)

    root = os.getcwd()
    work_dir = os.path.join(root, work_dir)
    # Resolved once: re-joining inside the version loop would nest every
    # version directory inside the previous one.
    database_root = os.path.join(root, database_dir)
    qe_dir = os.path.join(work_dir, 'q-e')
    devtools_dir = os.path.join(qe_dir, 'dev-tools')

    # TODO: this is temporary
    if os.path.exists(work_dir):
        shutil.rmtree(work_dir)
    os.makedirs(work_dir, exist_ok=True)

    try:
        # Set up the minimal helpdoc environment
        os.chdir(work_dir)
        run_command("git clone --filter=blob:none --sparse https://gitlab.com/QEF/q-e.git")
        os.chdir(qe_dir)
        run_command("git fetch --all --tags")
        run_command(" ".join(["git sparse-checkout add --skip-checks"] + helpdoc_files + def_files))

        xsl_source = os.path.join(devtools_dir, 'input_xx.xsl')
        helpdoc = os.path.join(devtools_dir, 'helpdoc')

        for v in versions:
            tag = v + tag_suffixes.get(v, "")
            run_command(f"git checkout tags/qe-{tag} -b qe-{tag} --force")

            version_dir = os.path.join(database_root, 'qe-' + v)
            os.makedirs(version_dir, exist_ok=True)

            for def_file in def_files:
                def_path = os.path.join(qe_dir, def_file)
                def_dir = os.path.dirname(def_path)
                xsl_link = os.path.join(def_dir, 'input_xx.xsl')
                # Paths are quoted because run_command splits the string with shlex.
                run_command(f"ln -sf {shlex.quote(xsl_source)} {shlex.quote(xsl_link)}")
                run_command(f"{shlex.quote(helpdoc)} --version {v} {shlex.quote(def_path)}")

                # Move the generated files to the database directory.
                # An explicit destination file name is needed to overwrite existing files.
                stem = os.path.splitext(def_path)[0]
                for generated in (stem + '.html', stem + '.xml'):
                    shutil.move(generated, os.path.join(version_dir, os.path.basename(generated)))
    finally:
        # Always return to the starting directory, even when a command fails.
        os.chdir(root)


In [26]:
import xml.etree.ElementTree as ET
from io import StringIO

def parse_vargroup(vg, parent):
    """Expand a ``<vargroup>`` element into one record per contained ``<var>``.

    All variables of the group share the group's ``type`` attribute and its
    ``<info>`` text.

    Args:
        vg: The ``<vargroup>`` element; must carry a ``type`` attribute.
        parent: Name stored under the ``'parent'`` key of every record.

    Returns:
        A list of dicts with the keys ``name``, ``parent``, ``type``, ``info``,
        ``dimension``, ``default`` and ``options``.

    Raises:
        KeyError: If the group has no ``type`` attribute or a ``<var>`` has no
            ``name`` attribute.
    """
    group_type = vg.attrib['type']

    # A missing or empty <info> becomes '' (as parse_var does), so string
    # clean-up downstream never receives None.
    info_element = vg.find('info')
    info = ''
    if info_element is not None and info_element.text:
        info = info_element.text

    return [
        {
            'name': var.attrib['name'],
            'parent': parent,
            'type': group_type,
            'info': info,
            'dimension': 1,
            'default': ' ',
            'options': {},
        }
        for var in vg.findall('var')
    ]

def parse_var(v, parent):
    """Convert a ``<var>``-like element into a plain record.

    Args:
        v: The element to convert; must carry a ``name`` attribute. Callers in
            this file pass ``var``, ``multidimension`` and ``dimension``
            elements.
        parent: Name stored under the ``'parent'`` key.

    Returns:
        A dict with the keys ``name``, ``parent``, ``type``, ``dimension``,
        ``info``, ``default`` and ``options``. ``type`` falls back to
        ``"UNKNOWN"``; ``dimension`` is the raw ``end`` attribute (a string)
        or the integer ``1`` when absent; ``info`` and ``default`` are always
        strings; ``options`` maps each ``<opt>``'s ``val`` attribute to its
        text.

    Raises:
        KeyError: If ``name`` (or an option's ``val``) is missing.
    """
    opts = v.find('options')

    # Prefer the <info> inside <options>; fall back to one directly under the
    # variable. Explicit "is None" checks are used because childless elements
    # are falsy.
    info_element = opts.find('info') if opts is not None else None
    if info_element is None:
        info_element = v.find('info')
    # .text is None for empty elements such as <info/>; normalise to ''.
    info = (info_element.text or '') if info_element is not None else ''

    options = {}
    if opts is not None:
        options = {opt.attrib['val']: opt.text for opt in opts.findall('opt')}

    default_element = v.find('default')
    default = (default_element.text or '') if default_element is not None else ''

    return {
        'name': v.attrib['name'],
        'parent': parent,
        'type': v.attrib.get('type', "UNKNOWN"),
        'dimension': v.attrib.get('end', 1),
        'info': info,
        'default': default,
        'options': options,
    }

def parse_group(g, parent):
    """Collect the variable records found in a group element.

    Direct children are dispatched on their tag: ``var``, ``multidimension``
    and ``dimension`` go through ``parse_var``, ``vargroup`` through
    ``parse_vargroup``, and nested ``group`` elements are walked recursively.
    Children with any other tag are ignored.

    Args:
        g: Element whose children are inspected.
        parent: Name passed on unchanged to every parser.

    Returns:
        A flat list of records in document order.
    """
    collected = []
    for element in g:
        tag = element.tag
        if tag == 'group':
            collected += parse_group(element, parent)
        elif tag == 'vargroup':
            collected += parse_vargroup(element, parent)
        elif tag in ('var', 'multidimension', 'dimension'):
            collected.append(parse_var(element, parent))
    return collected

# Maps a variable's type name, lower-cased by tidy_vars before the lookup, to
# the type label stored in the cleaned records. "unknown" covers records for
# which parse_var found no type attribute and stored "UNKNOWN".
type_map = {
    "character": "str",
    "real": "float",
    "integer": "int",
    "logical": "bool",
    "unknown": "unknown"
}

def tidy_dict(d):
    """Return a copy of ``d`` with string keys and values cleaned by ``tidy_str``.

    Non-string keys are kept as they are. Values are cleaned only when they
    are non-empty strings; everything else (``None``, ``''``, numbers, ...)
    is copied unchanged.
    """
    cleaned = {}
    for key, value in d.items():
        new_key = tidy_str(key) if isinstance(key, str) else key
        new_value = tidy_str(value) if (value and isinstance(value, str)) else value
        cleaned[new_key] = new_value
    return cleaned

def tidy_str(s):
    """Flatten ``s`` to a single line and drop one pair of enclosing single quotes.

    Newlines are replaced by spaces and surrounding whitespace is stripped.
    If the result both starts and ends with ``'``, the first and last
    characters are removed.
    """
    flattened = s.replace('\n', ' ').strip()
    is_quoted = flattened.startswith("'") and flattened.endswith("'")
    return flattened[1:-1] if is_quoted else flattened

def tidy_vars(vars):
    """Normalise raw variable records into the form stored in the database.

    For every record the parent is lower-cased, the type is translated through
    ``type_map``, and ``options``/``default``/``info`` are cleaned with
    ``tidy_dict``/``tidy_str``. A few variables get hand-written descriptions
    (and, for ``ibrav``, hand-written options), and ``bool`` variables without
    options get ``True``/``False`` options with empty descriptions.

    Args:
        vars: Iterable of dicts with the keys ``name``, ``parent``, ``type``,
            ``dimension``, ``options``, ``default`` and ``info``.

    Returns:
        A new list of cleaned record dicts, in input order.

    Raises:
        KeyError: If a record's lower-cased type is not a key of ``type_map``.
    """
    # Hand-written descriptions that replace the extracted info text.
    info_overrides = {
        'A': 'a in ANGSTROM',
        'B': 'b in ANGSTROM',
        'C': 'c in ANGSTROM',
        'cosAB': 'cos angle between a and b (gamma)',
        'cosAC': 'cos angle  between a and c (beta)',
        'cosBC': 'cos angle between b and c (alpha)',
        'ibrav': 'Bravais lattice choice',
    }
    ibrav_options = {
        0: "Lattice in CELL_PARAMETERS",
        1: "Cubic P (sc) lattice",
        2: "Cubic F (fcc) lattice",
        3: "Cubic I (bcc) lattice",
        -3: "Cubic I (bcc) lattice",
        4: "Hexagonal and Trigonal P lattice",
        5: "Trigonal Rhombohedral lattice, 3-fold axis c",
        -5: "Trigonal Rhombohedral lattice, 3-fold axis <111>",
        6: "Tetragonal P (st) lattice",
        7: "Tetragonal I (bct) lattice",
        8: "Orthorhombic P lattice",
        9: "Orthorhombic base-centered(bco) lattice",
        -9: "Orthorhombic base-centered(bco) lattice",
        91: "Orthorhombic one-face base-centered A-type lattice",
        10: "Orthorhombic face-centered lattice",
        11: "Orthorhombic body-centered lattice",
        12: "Monoclinic P, unique axis c lattice",
        -12: "Monoclinic P, unique axis b lattice",
        13: "Monoclinic base-centered lattice",
        -13: "Monoclinic base-centered lattice",
        14: "Triclinic lattice",
    }

    clean_vars = []
    for v in vars:
        name = v["name"]
        var_type = type_map[v["type"].lower()]
        options = tidy_dict(v["options"])
        # Records may carry None for missing text; treat it as empty.
        default = tidy_str(v["default"] or "")
        info = tidy_str(v["info"] or "")

        # Special cases
        if name in info_overrides:
            info = info_overrides[name]
        if name == 'ibrav':
            # Copy so that records never share one mutable options dict.
            options = dict(ibrav_options)
        if var_type == "bool" and options == {}:
            options = {
                True: "",
                False: "",
            }

        clean_vars.append({
            "name": name,
            "parent": v["parent"].lower(),
            "type": var_type,
            "dimension": v["dimension"],
            "options": options,
            "default": default,
            "info": info,
        })

    return clean_vars

def extract_vars(xml_filename):
    """Read an XML file and return cleaned records for its namelist variables.

    Before parsing, ``<ref>``/``<b>`` tags are removed from the raw text and
    every ``<a href="URL">TEXT</a>`` is rewritten to ``TEXT (URL)``. Only
    top-level ``namelist`` elements are walked (cards are not parsed); each
    variable is tagged with the namelist's ``name`` attribute as its parent,
    and the collected records are passed through ``tidy_vars``.

    Args:
        xml_filename: Path of the XML file to read.

    Returns:
        The list returned by ``tidy_vars``.
    """
    link_pattern = re.compile(r'<a href="(.*?)">\s*(.*?)\s*</a>')
    with open(xml_filename, 'r') as handle:
        markup = handle.read()

    # Strip inline markup, then flatten hyperlinks into plain text.
    for inline_tag in ("<ref>", "</ref>", "<b>", "</b>"):
        markup = markup.replace(inline_tag, "")
    markup = link_pattern.sub(r'\2 (\1)', markup)
    document_root = ET.parse(StringIO(markup)).getroot()

    collected = []
    for section in document_root:
        if section.tag != 'namelist':
            continue
        # A namelist's children are dispatched exactly like a group's.
        collected.extend(parse_group(section, section.attrib['name']))

    return tidy_vars(collected)

In [27]:
def _find_variable_table(anchor):
    """Return the table belonging to ``anchor``, or ``None`` if there is none."""
    # This accounts for most cases: the table follows the anchor as a sibling.
    for sibling in anchor.itersiblings():
        if sibling.tag == 'table':
            return sibling
    # This accounts for stuff in groups: the anchor sits inside the table.
    for ancestor in anchor.iterancestors():
        if ancestor.tag == 'table':
            return ancestor
    return None


# Generates a map from name -> {idm, html}
def gen_idm_map(soup):
    """Map each linked variable name to its ``idm`` id and its HTML table.

    Args:
        soup: Parsed HTML tree exposing ``xpath`` (an lxml element in this file).

    Returns:
        A dict ``name -> {"idm": <href without '#'>, "html": <table or None>}``.
        ``"html"`` is ``None`` when no ``<a name="...">`` anchor with an
        associated table exists for the name.
    """
    idm_map = {}
    # Find all links with href = "#idm*", their text is the name
    for link in soup.xpath('//a[starts-with(@href, "#idm")]'):
        if not link.text:
            # No leading text to derive a name from.
            continue
        # The split accounts for some array edge cases in old documentation
        name = link.text.split('(')[0].strip()
        if name.startswith('&'):
            name = name[1:]
        idm_map[name] = {"idm": link.attrib['href'][1:], "html": None}

    # Find all a tags with name="name" and locate their table
    for name, entry in idm_map.items():
        for anchor in soup.xpath(f'//a[@name="{name}"]'):
            table = _find_variable_table(anchor)
            # A later anchor without a table must not discard an earlier hit.
            if table is not None:
                entry["html"] = table

    return idm_map

In [28]:
def wipe_style(html):
    """Remove the presentational attributes from ``html`` in place.

    Deletes ``style``, ``align``, ``valign`` and ``width`` from
    ``html.attrib`` when present; all other attributes are left untouched.
    """
    attributes = html.attrib
    for presentational in ("style", "align", "valign", "width"):
        if presentational in attributes:
            del attributes[presentational]


def generate_webpage(html):
    """Restyle a documentation table and wrap it in a standalone HTML page.

    The table is modified in place: inline presentation attributes are removed
    and replaced by CSS classes, ``#``-links are pointed at ``<target>.html``,
    and ``<pre>`` blocks are flattened to single-line ``<p>`` elements with
    ``.TRUE.``/``.FALSE.`` wrapped in ``<tt>``. Only the table and its
    descendants are touched.

    Args:
        html: The ``<table>`` element to convert.

    Returns:
        The page as a unicode string, or ``None`` when ``html`` is not a
        ``<table>`` element.
    """
    if getattr(html, "tag", None) != "table":
        return None

    html.classes.add("tag-table")
    wipe_style(html)

    # XPaths start with "." so they stay inside this table; a leading "//"
    # is evaluated from the document root and would match the whole page.
    for header in html.xpath(".//th"):
        header.classes.add("header-cell")
        wipe_style(header)

    for cell in html.xpath(".//td"):
        if "style" in cell.attrib:
            # Parse "prop: value; ..." into a dict. Split on the first colon
            # only and skip fragments without one, so values such as URLs
            # cannot break the parsing.
            style = {}
            for declaration in cell.attrib["style"].split(";"):
                prop, separator, value = declaration.partition(":")
                if separator:
                    style[prop.strip()] = value.strip()
            if "background" in style and "text-align" in style:
                bgcol = style["background"]
                align = style["text-align"]
                if bgcol == "#ffffc3" and align == "left":
                    cell.classes.add("type-cell")
                elif bgcol == "#ffffc3" and align == "right":
                    cell.classes.add("datalabel-cell")
                elif bgcol == "#fff3d9" and align == "left":
                    cell.classes.add("data-cell")
        elif "colspan" in cell.attrib and cell.attrib["colspan"] == "2":
            cell.classes.add("description-cell")
        wipe_style(cell)

    # Rewrite in-page links before the <pre> blocks are serialised below, so
    # the re-parsed copies carry the new targets.
    for a in html.xpath('.//a[starts-with(@href, "#")]'):
        a.classes.add("tag-link")
        # TODO: change where href points to
        # Should be like database_dir + webpage_dir + a.attribs['href'][1:]
        a.attrib['href'] = a.attrib['href'][1:] + '.html'

    for pre in html.xpath(".//pre"):
        # TODO: for ibrav, create a clean string and use that instead of the
        # cleaning procedure. Hash the string in the documentation and, if it
        # doesn't match, use the unformatted one from the documentation.
        pre.tag = "p"
        wipe_style(pre)
        string = tostring(pre, encoding="unicode")
        string = tidy_str(string)
        string = re.sub(r"\.TRUE\.", r"<tt>.TRUE.</tt>", string)
        string = re.sub(r"\.FALSE\.", r"<tt>.FALSE.</tt>", string)
        new_element = soupparser.fromstring(string)
        pre.getparent().replace(pre, new_element)

    webpage_template = """
        <!DOCTYPE html>
        <html>
        <head>
            <link rel="stylesheet" type="text/css" href="../qe-tag.css">
        </head>
        <body>
        </body>
        </html>
        """
    webpage = soupparser.fromstring(webpage_template)
    body = webpage.xpath("//body")[0]
    # NOTE(review): appending presumably re-parents the table out of the
    # source document rather than copying it - confirm whether tables shared
    # by several variables need a copy here.
    body.append(html)

    return tostring(webpage, encoding="unicode", pretty_print=True)


def add_html_info(vars, html_filename):
    """Attach the ``idm`` id and a base64-encoded HTML page to each record.

    The HTML file is parsed and indexed with ``gen_idm_map``. Every record
    whose ``name`` appears in that index gets an ``"idm"`` key and an
    ``"html"`` key holding the page produced by ``generate_webpage``, UTF-8
    encoded and then base64 encoded, or ``""`` when no page could be built.
    Records without an entry are left unchanged and a warning is printed.

    Args:
        vars: List of record dicts with a ``"name"`` key; modified in place.
        html_filename: Path of the HTML file to read.

    Returns:
        The same ``vars`` list.
    """
    with open(html_filename, "r") as f:
        soup = soupparser.fromstring(f.read())

    idm_map = gen_idm_map(soup)
    for v in vars:
        name = v["name"]
        entry = idm_map.get(name)
        if entry is None:
            print(f"WARNING: No HTML info for {name}")
            continue

        v["idm"] = entry["idm"]
        table = entry["html"]
        # The entry may hold an element, None, or a placeholder string when no
        # table was found; only a real element can be rendered.
        webpage = None
        if table is not None and not isinstance(table, str):
            # Screws up formatting for ibrav
            webpage = generate_webpage(table)
        if webpage:
            v["html"] = b64encode(webpage.encode("utf-8")).decode("utf-8")
        else:
            v["html"] = ""
    return vars

In [29]:
def generate_database(version, database_dir):
    """Build ``database.json`` for each version from its ``INPUT_PW`` files.

    For every version, ``<database_dir>/qe-<version>/INPUT_PW.xml`` is parsed
    with ``extract_vars``, enriched from ``INPUT_PW.html`` with
    ``add_html_info``, grouped as ``{parent: {name: record}}`` and written to
    ``<database_dir>/qe-<version>/database.json``.

    Args:
        version: A single version string or an iterable of version strings.
        database_dir: Directory containing the ``qe-<version>`` folders.
    """
    if isinstance(version, str):
        version = [version]
    for ver in version:
        version_dir = os.path.join(database_dir, 'qe-' + ver)
        vars = extract_vars(os.path.join(version_dir, "INPUT_PW.xml"))
        vars = add_html_info(vars, os.path.join(version_dir, "INPUT_PW.html"))

        # Group by parent in first-seen order. Going through a set would make
        # the key order of the JSON file vary between runs.
        vars_dict = {}
        for v in vars:
            vars_dict.setdefault(v["parent"], {})[v["name"]] = v

        with open(os.path.join(version_dir, "database.json"), "w") as f:
            json.dump(vars_dict, f, indent=4)

In [30]:
# Run configuration.
# Scratch directory handed to generate_help_files (it deletes and recreates it).
work_dir = 'work'
# .def files passed to helpdoc; note that generate_database only reads the
# INPUT_PW.xml/INPUT_PW.html outputs.
def_files = ['PW/Doc/INPUT_PW.def', 'PP/Doc/INPUT_PROJWFC.def']
versions = ['6.3', '6.8', '7.0', '7.2']
# Directory holding one qe-<version> sub-directory per version.
database_dir = 'database/'
# Set to True to regenerate the helpdoc files first (clones the q-e repository);
# with False the database is built from files already present in database_dir.
gen_docs = False

if gen_docs:
    generate_help_files(work_dir, def_files, versions, database_dir)
generate_database(versions, database_dir)

