In [113]:
import subprocess
import os
import shlex
import logging
import shutil
import re

from bs4 import BeautifulSoup

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)

WORK_DIR = 'work'
HELPDOC_FILES = ['dev-tools/helpdoc', 'dev-tools/helpdoc.d', 'dev-tools/helpdoc.schema', 'dev-tools/input_xx.xsl', 'GUI/Guib/lib']
DEFS_TO_PARSE = ['PW/Doc/INPUT_PW.def', 'PP/Doc/INPUT_PROJWFC.def']
VERSION = ['6.3', '6.8', '7.0', '7.2']
DATABSE_DIR = 'database/espresso'
gen_docs = False

In [114]:
def run_command(command):
    log.debug(f"Command: {command}")
    command = shlex.split(command)
    result = subprocess.run(command, capture_output=True, check=True)
    if result.stdout:
        log.debug(f"Command stdout: {result.stdout.decode('utf-8')}")
    if result.stderr:
        log.debug(f"Command stderr: {result.stderr.decode('utf-8')}")
    return result

def generate_help_files(WORK_DIR, HELPDOC_FILES, DEFS_TO_PARSE, VERSION, DATABSE_DIR):
    # Create work directory and go there
    root = os.getcwd()
    work_dir = os.path.join(root, WORK_DIR)
    # TODO: this is temporary
    if os.path.exists(work_dir):
        shutil.rmtree(work_dir)
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)
    os.chdir(work_dir)

    # Commands to set up minimal helpdoc environment
    qe_dir = os.path.join(work_dir, 'q-e')
    cmd_clone = "git clone --filter=blob:none --sparse https://gitlab.com/QEF/q-e.git"
    run_command(cmd_clone)
    os.chdir(qe_dir)
    cmd_fetch_tags = "git fetch --all --tags"
    run_command(cmd_fetch_tags)

    cmd_checkout_files = ["git sparse-checkout add"]
    cmd_checkout_files = " ".join(cmd_checkout_files + HELPDOC_FILES + DEFS_TO_PARSE)
    run_command(cmd_checkout_files)

    # Commands for picking the right version
    devtools_dir = os.path.join(qe_dir, 'dev-tools')
    for v in VERSION:
        tag = v
        tag += "MaX" if v in ("6.3", "6.5") else ""
        tag += "MaX-Release" if v == "6.7" else ""
        cmd_checkout_tag = f"git checkout tags/qe-{tag} -b qe-{tag}"
        run_command(cmd_checkout_tag)
        database_dir = os.path.join(root, DATABSE_DIR, v)
        if not os.path.exists(database_dir):
            os.makedirs(database_dir)

        files = [os.path.join(qe_dir, def_file) for def_file in DEFS_TO_PARSE]
        for def_file in files:
            dir = os.path.dirname(def_file)
            cmd_link_xsl = f"ln -sf {devtools_dir}/input_xx.xsl {dir}/input_xx.xsl"
            run_command(cmd_link_xsl)
            cmd_helpdoc = f"{devtools_dir}/helpdoc --version {v} {def_file}"
            run_command(cmd_helpdoc)

            # Copy the generated files to the database directory using os module
            xml_file = os.path.splitext(def_file)[0] + '.xml'
            html_file = os.path.splitext(def_file)[0] + '.html'
            # Explicit destination is needed to overwrite existing files
            shutil.move(html_file, os.path.join(database_dir, os.path.basename(html_file)))
            shutil.move(xml_file, os.path.join(database_dir, os.path.basename(xml_file)))

    os.chdir(root)

if gen_docs:
    generate_help_files(WORK_DIR, HELPDOC_FILES, DEFS_TO_PARSE, VERSION, DATABSE_DIR)

In [115]:
#import xmltodict
#import re
#
#def parse_vargroup(vargroup):
#    vars = []
#    info = vargroup['info']
#    type = vargroup['@type']
#    for v in vargroup['var']:
#        name = v['@name']
#        vars.append({
#            '@name': name,
#            '@type': type,
#            'info': info,
#        })
#    return vars 
#
#def find_all_vars(namelist):
#    vars = []
#    if 'var' in namelist.keys():
#        vars.extend(namelist['var'])
#    if 'multidimension' in namelist.keys():
#        vars.extend(namelist['multidimension'])
#    if 'dimension' in namelist.keys():
#        vars.extend(namelist['dimension'])
#    if 'vargroup' in namelist.keys():
#        vargroups = namelist['vargroup']
#        if isinstance(vargroups, dict):
#            vargroups = [vargroups]
#        for vg in vargroups:
#            vars.extend(parse_vargroup(vg))
#    if 'group' in namelist.keys():
#        groups = namelist['group']
#        if isinstance(groups, dict):
#            groups = [groups]
#        for g in groups:
#            vars.extend(find_all_vars(g))
#    return vars
#
#pattern = re.compile(r'<a href="(.*?)">\s*(.*?)\s*</a>')
#with open(os.path.join(DATABSE_DIR, '7.2', 'INPUT_PW.xml'), 'r') as f:
#    xmltext = f.read()
#    xmltext = xmltext.replace("<ref>", "")
#    xmltext = xmltext.replace("</ref>", "")
#    xmltext = pattern.sub(r'\2 (\1)', xmltext)
#    doc = xmltodict.parse(xmltext)
#
#names = []
#vars = []
#for namelist in doc['input_description']['namelist']:
#    names.append(namelist['@name'])
#    vars.extend(find_all_vars(namelist))
#
#print(len(vars))

#nls = doc['input_description']['namelist']
#nls[1]['group'][0]
#for nl in nls:
#    if 'group' in nl.keys():
#        print(f'Found group in {nl["@name"]} with length {len(nl["group"])}')
#        if isinstance(nl['group'], dict):
#            nl['group'] = [nl['group']]
#        for g_i, g in enumerate(nl['group']):
#            if 'vargroup' in g.keys():
#                print(f'Found vargroup in {nl["@name"]} group {g_i} with length {len(g["vargroup"])}')
#                if isinstance(g['vargroup'], dict):
#                    g['vargroup'] = [g['vargroup']]
#                for vg_i, vg in enumerate(g['vargroup']):
#                    print(f'{nl["@name"]} group {g_i} vargroup {vg_i} has length {len(vg["var"])}')
#        #if isinstance(nl['group'], dict):
#        #    nl['group'] = [nl['group']]
#        #for g in nl['group']:
#        #    if 'vargroup' in g.keys():
#        #        if isinstance(g['vargroup'], dict):
#        #            g['vargroup'] = [g['vargroup']]
#        #        for vg in g['vargroup']:
#        #            print(vg)
#        #print(nl['group'])
#    if 'vargroup' in nl.keys():
#        print(f'Found vargroup in {nl["@name"]} with length {len(nl["vargroup"])}')
#        if isinstance(nl['vargroup'], dict):
#            nl['vargroup'] = [nl['vargroup']]
#        for vg_i, vg in enumerate(nl['vargroup']):
#            print(f'{nl["@name"]} vargroup {vg_i} has length {len(vg["var"])}')
#        #if isinstance(nl['vargroup'], dict):
#        #    nl['vargroup'] = [nl['vargroup']]
#        #for vg in nl['vargroup']:
#        #    print(vg)
#


In [116]:
import xml.etree.ElementTree as ET
from io import StringIO

def parse_vargroup(vg, parent):
    vars = []
    type = vg.attrib['type']
    info = vg.find('info')
    if info is not None:
        info = info.text
    
    for v in vg.findall('var'):
        v_dict = {
            'name': v.attrib['name'],
            'parent': parent,
            'type': type,
            'info': info,
            'dimension': 1,
            'default': ' ',
            'options': {},
        }
        vars.append(v_dict)

    return vars

def parse_var(v, parent):
    opts = v.find('options')

    # Deal with Info
    info = None
    if opts is not None:
        info = opts.find('info')
    else:
        info = v.find('info')
    if info is not None:
        info = info.text
    else:
        info = ''
    
    options = {}
    if opts is not None:
        for o in opts.findall('opt'):
            options.update({o.attrib['val']: o.text})
    
    default = v.find('default')
    if default is not None:
        default = default.text
    else:
        default = ''

    v_dict = {
        'name': v.attrib['name'],
        'parent': parent,
        'type': v.attrib['type'],
        'dimension': v.attrib.get('end', 1),
        'info': info,
        'default': default,
        'options': options,
    }

    return v_dict

def parse_group(g, parent):
    vars = []
    for e in g:
        if e.tag in ('var', 'multidimension', 'dimension'):
            vars.append(parse_var(e, parent))
        elif e.tag == 'vargroup':
            vars.extend(parse_vargroup(e, parent))
        elif e.tag == 'group':
            vars.extend(parse_group(e, parent)) 
    return vars

type_map = {
    "character": str,
    "real": float,
    "integer": int,
    "logical": bool,
}

def tidy_dict(d):
    tidy_d = {}
    for k, v in d.items():
        if isinstance(k, str):
            k = tidy_str(k)
        if v and isinstance(v, str):
            v = tidy_str(v)
        tidy_d[k] = v
    return tidy_d

def tidy_str(s):
    s = s.replace('\n', ' ').strip()
    # If str is already a string, e.g., str = "'hello'", clean it up
    if s and s[0] == "'" and s[-1] == "'":
        s = s[1:-1]
    return s

def tidy_vars(vars):
    clean_vars = []
    for v in vars:
        name = v["name"]
        parent = v["parent"]
        type = type_map[v["type"].lower()]
        dimension = v["dimension"]
        options = tidy_dict(v["options"])
        default = tidy_str(v["default"])
        info = tidy_str(v["info"])

        # Special cases
        if name == 'A':
            info = 'a in ANGSTROM'
        elif name == 'B':
            info = 'b in ANGSTROM'
        elif name == 'C':
            info = 'c in ANGSTROM'
        elif name == 'cosAB':
            info = 'cos angle between a and b (gamma)'
        elif name == 'cosAB':
            info = 'cos angle  between a and c (beta)'
        elif name == 'cosBC':
            info = 'cos angle between b and c (alpha)'
        elif name == 'ibrav':
            info = 'Bravais lattice choice'
            options = {
                0: "Lattice in CELL_PARAMETERS",
                1: "Cubic P (sc) lattice",
                2: "Cubic F (fcc) lattice",
                3: "Cubic I (bcc) lattice",
                -3: "Cubic I (bcc) lattice",
                4: "Hexagonal and Trigonal P lattice",
                5: "Trigonal Rhombohedral lattice, 3-fold axis c",
                -5: "Trigonal Rhombohedral lattice, 3-fold axis <111>",
                6: "Tetragonal P (st) lattice",
                7: "Tetragonal I (bct) lattice",
                8: "Orthorhombic P lattice",
                9: "Orthorhombic base-centered(bco) lattice",
                -9: "Orthorhombic base-centered(bco) lattice",
                91: "Orthorhombic one-face base-centered A-type lattice",
                10: "Orthorhombic face-centered lattice",
                11: "Orthorhombic body-centered lattice",
                12: "Monoclinic P, unique axis c lattice",
                -12: "Monoclinic P, unique axis b lattice",
                13: "Monoclinic base-centered lattice",
                -13: "Monoclinic base-centered lattice",
                14: "Triclinic lattice",
            }
        if type == bool and options == {}:
            options = {
                True: "",
                False: "",
            }
        
        clean_vars.append({
            "name": name,
            "parent": parent,
            "type": type,
            "dimension": dimension,
            "options": options,
            "default": default,
            "info": info,
        })
    
    return clean_vars

def extract_vars(xml_filename):
    pattern = re.compile(r'<a href="(.*?)">\s*(.*?)\s*</a>')
    with open(xml_filename, 'r') as f:
        xmltext = f.read()
        xmltext = xmltext.replace("<ref>", "")
        xmltext = xmltext.replace("</ref>", "")
        xmltext = xmltext.replace("<b>", "")
        xmltext = xmltext.replace("</b>", "")
        xmltext = pattern.sub(r'\2 (\1)', xmltext)
        root = ET.parse(StringIO(xmltext)).getroot()

    vars = []
    #cards = []
    for child in root:
        if child.tag == 'namelist':
            namelist_name = child.attrib['name']
            for e in child:
                if e.tag in ('var', 'multidimension', 'dimension'):
                    vars.append(parse_var(e, namelist_name))
                elif e.tag == 'vargroup':
                    vars.extend(parse_vargroup(e, namelist_name))
                elif e.tag == 'group':
                    vars.extend(parse_group(e, namelist_name))
        #elif child.tag == 'card':
        #    cards.append(child)
        
    vars = tidy_vars(vars)

    return vars

In [117]:
xml_filename = os.path.join(DATABSE_DIR, '7.2', 'INPUT_PW.xml')
vars = extract_vars(xml_filename)

In [178]:
from lxml.html import soupparser
# Generates a map from name -> {idm, html}
def gen_idm_map(soup):
    idm_map = {}
    # Find all links with href = "#idm*", their text is the name
    links = soup.xpath('//a[starts-with(@href, "#idm")]')
    for a in links:
        name = a.text
        if name.startswith('&'):
            name = name[1:]
        idm = a.attrib['href'][1:]
        idm_map.update({name: {"idm": idm, "html": ""}})
    # Find all th tags with text=name*, their grandparent is the table
    for name, idm_dict in idm_map.items():
        tags = soup.xpath(f'//th[starts-with(text(), "{name}")]')
        for th in tags:
            table = th.getparent().getparent()
            idm_dict["html"] = table

    return idm_map

In [179]:

#def clean_html(html):
#    cleaner = Cleaner(style=True, align=True, valign=True)
#    if html is not None:
#        for child in html.find_all(True):
#            # Remove all attributes
#            if 'style' in child.attrs:
#                del child[attr]
#    else:
#        html = ""
#    return html

def add_html_info(vars, html_filename):
    with open(html_filename, "r") as f:
        #soup = BeautifulSoup(f, "lxml")
        soup = soupparser.fromstring(f.read())

    idm_map = gen_idm_map(soup)
    link_base = "file:///" + os.path.join(os.getcwd(), html_filename) + "#"
    for v in vars:
        name = v["name"]
        if name in idm_map:
            v["idm"] = idm_map[name]["idm"]
            html = idm_map[name]["html"] if idm_map[name]["html"] else None
            #html = clean_html(html)
            v["html"] = html
            v["link"] = link_base + idm_map[name]["idm"]
        else:
            print(f"WARNING: No HTML info for {name}")
    return vars


html_filename = os.path.join(DATABSE_DIR, "7.2", "INPUT_PW.html")
vars = add_html_info(vars, html_filename)

  html = idm_map[name]["html"] if idm_map[name]["html"] else None


In [120]:
# Save the html of variable 20 to a file for testing
with open("dirty.html", 'w') as f:
    f.write(vars[20]['html'].prettify())

In [209]:
# Import deepcopy for me
from copy import deepcopy
html = deepcopy(vars[20]['html'])

if html.name == 'table':
    html.classes.add('tag-table')
    to_delete = ['style', 'align', 'valign', 'width', 'colspan']
    for attr in to_delete:
        if attr in html.attrib:
            del html.attrib[attr]
    for child in html.find_all(True):
        if child.name == 'th':
            child.attrs['class'] = 'header-cell'
        elif child.name == 'td':
            if 'style' in child.attrs:
                style = child['style']
                if 'background-color' in style and 'text-align' in style:
                    bgcol = style['background-color']
                    align = style['text-align']
                    if bgcol == '#ffffc3' and align == 'left':
                        child.attrs['class'] = 'type-cell'
                    elif bgcol == '#ffffc3' and align == 'right':
                        child.attrs['class'] = 'data-cell'
                    elif bgcol == '#fff3d9' and align == 'left':
                        child.attrs['class'] = 'datalabel-cell'
                elif 'colspan' in child.attrs and child['colspan'] == '2':
                    child.add_class('description-cell')
        elif child.name == 'pre':
            #child.string = tidy_str(child.text)
            #print('found pre')
            # Find if it has a <a> tag with href = "#*", if so, replace with <tt>
            for a in child.find_all('a', href=re.compile("^#")):
                a.name = 'tt'
                del a.attrs['href'] 
            # Go through the text and replace all instances of .TRUE. with <tt>.TRUE.</tt> and similarly with false
            child.string = re.sub(r'\.TRUE\.', r'<tt>.TRUE.</tt>', child.text)
        for attr in to_delete:
            if attr in child.attrs:
                del child[attr]

print(html.prettify())

AttributeError: 'HtmlElement' object has no attribute 'name'

In [204]:
html = vars[0]['html']

In [208]:
html.attrib['style']

'border-color:   #b5b500; border-style: solid; border-width: 2; margin-bottom: 10; table-layout: auto; background-color: #FFFFFF;'

In [169]:
with open(html_filename, "r") as f:
    soup = soupparser.fromstring(f)
    links = soup.xpath('//a[starts-with(@href, "#idm")]')
    for a in links:
        continue

In [171]:
link = a

'v_val(I,J)'

In [173]:
link.attrib['href']

'#idm166337907184'

In [176]:
# Find parent of link
link.getparent()

<Element i at 0x115c02530>

In [210]:
html.tag

'table'