In [76]:
#import wikitextparser as wtp
import requests

from codex.database.vasp import generate_database, refresh_database, tidy_wikicode
USER_AGENT = "dft-codex/0.0.0 (Omar A. Ashour, ashour@berkeley.edu)"
API_URL = "https://www.vasp.at/wiki/api.php"
WIKI_URL = "https://www.vasp.at/wiki/index.php"
REST_API_URL = "https://www.vasp.at/wiki/rest.php/v1/"

import logging 
import re
logging.basicConfig(level=logging.INFO)
import html
from copy import deepcopy

from lxml.etree import tostring
from lxml.html import parse, document_fromstring, fromstring

In [209]:
def get_html_render(title):
    """Fetch the rendered HTML of a single wiki page.

    Sends a GET request to ``WIKI_URL`` with ``action=render`` for the page
    called ``title`` and returns the body of the response as text.
    """
    response = requests.get(
        WIKI_URL,
        headers={"User-Agent": USER_AGENT},
        params={"title": title, "action": "render"},
    )
    return response.text

import os
import time
from itertools import islice

# Only a handful of pages are fetched while developing; raise this to fetch more.
MAX_PAGES = 5
# Pause (in seconds) after each request so the wiki server is not hammered.
REQUEST_DELAY_S = 0.5

# NOTE: this cell needs `database` to exist already (it is built by the
# generate/refresh cell of this notebook).
html_dict = {}
for tag in islice(database['INCAR'].keys(), MAX_PAGES):
    print(tag)
    page_html = get_html_render(tag)
    time.sleep(REQUEST_DELAY_S)
    html_dict[tag] = page_html

import json
# Cache the raw pages on disk so the clean-up step can be re-run without refetching.
with open('vasp-html-raw.json', 'w') as f:
    json.dump(html_dict, f)
#with open('temp.html', 'w') as f:
#    f.write(page_html)

PREC
ICHARG
ISTART
INIWAV
NSW


In [202]:
# Get rid of footer

def style_tag_values(a):
    """Wrap the value that follows a tag link in a ``tag-value`` span.

    Takes an anchor element, which could be something like (in HTML):
    <a href="https://www.vasp.at/wiki/index.php/ALGO" title="ALGO">ALGO</a>=Normal bla bla bla
    And changes it to
    <a href="https://www.vasp.at/wiki/index.php/ALGO" title="ALGO">ALGO</a>=<span class="tag-value">Normal</span> bla bla bla
    by adding a new span element after it in the tree and changing its tail.

    The span is built through the element API instead of by parsing an HTML
    string, so characters such as ``&`` or ``<`` in the tail are kept as text
    and no extra wrapper element is introduced.

    Parameters
    ----------
    a : element
        The anchor element; it is modified in place together with its parent.

    Returns
    -------
    None
        Nothing is changed when the anchor has no tail, no parent, or its tail
        does not look like ``= value``.
    """
    tail = a.tail
    parent = a.getparent()
    if not tail or parent is None:
        return

    match = re.match(r"\s*=\s*([^\s]+)(.*)", tail, re.S)
    if not match:
        return
    value, rest = match.groups()

    # TODO: get rid of style
    span = a.makeelement("span", {"class": "tag-value", "style": "color: red;"})
    span.text = value
    # Whatever followed the value stays in the document as the span's tail.
    span.tail = rest or None

    a.tail = "="
    parent.insert(parent.index(a) + 1, span)

# Lower-case heading prefix -> Bootstrap alert class used for the wiki's note boxes.
_ALERT_CLASSES = (
    ("important", "alert-primary"),
    ("warning", "alert-warning"),
    ("mind", "alert-info"),
    ("tip", "alert-success"),
    ("deprecated", "alert-danger"),
)


def _strip_header_and_footer(page_html):
    """Drop everything after the last ``<hr />`` and everything up to the first one left."""
    # Get rid of footer
    body = page_html.rsplit("<hr />", 1)[0]
    # Get rid of header; split from the left so a rule inside the body is kept
    return body.split("<hr />", 1)[-1]


def _style_tag_links(root, url_base):
    """Mark self links and all-caps links as ``tag-link`` and style a following ``=value``."""
    for a in root.xpath("//a"):
        # These are self links, so we want to make them monospace
        if a.attrib.get("class") == "mw-selflink selflink":
            a.classes.remove("mw-selflink")
            a.classes.remove("selflink")
            a.classes.add("tag-link")
            if a.text:  # a self link that starts with a child element has no text
                a.attrib["href"] = url_base + a.text
        elif a.attrib.get("href"):
            page = a.attrib["href"].split("/")[-1]
            # This is our best guess for pages that link to files or other tags
            if page.upper() != page:
                continue
            a.classes.add("tag-link")
        else:
            continue
        # Now we check if the tail has '= something' in it
        if a.tail and a.tail.startswith("="):
            style_tag_values(a)


def _alert_class(heading):
    """Return the alert class matching the start of ``heading`` (``alert-secondary`` if none)."""
    lowered = heading.lower()
    for prefix, css_class in _ALERT_CLASSES:
        if lowered.startswith(prefix):
            return css_class
    return "alert-secondary"


def _table_to_alert(table):
    """Replace a note-box table by an alert ``div``; return False if it is not a note box."""
    parent = table.getparent()
    spans = table.xpath(".//b//span")
    if parent is None or not spans:
        # Styled table without a bold heading span: not a note box, leave it alone.
        return False

    heading_span = spans[0]
    heading = heading_span.text or ""
    heading_element = heading_span.getparent()
    td = heading_element.getparent()
    # Everything in the cell except the heading becomes the body of the alert.
    body_children = [child for child in td if child is not heading_element]

    alert = table.makeelement(
        "div", {"class": f"alert {_alert_class(heading)}", "role": "alert"}
    )
    title = table.makeelement("h4", {"class": "alert-heading"})
    title.text = heading
    # The text right after the heading is the first part of the message.
    title.tail = heading_element.tail
    alert.append(title)
    alert.extend(body_children)
    # replace() removes the old element together with its tail, so carry it over.
    alert.tail = table.tail
    parent.replace(table, alert)
    return True


def clean_html(page_html):
    """Turn the rendered HTML of a wiki page into the markup used by this project.

    Steps:
      1. cut away the page header and footer (delimited by ``<hr />``);
      2. give self links and all-caps links the ``tag-link`` class and wrap a
         directly following ``=value`` in a ``tag-value`` span;
      3. convert styled tables that carry a bold heading span
         (warning/mind/important/... boxes) into Bootstrap ``alert`` divs;
         not very robust if VASP change their template but not many options;
      4. give every unstyled table the Bootstrap table classes.

    Parameters
    ----------
    page_html : str
        HTML as returned by the wiki's ``action=render``.

    Returns
    -------
    str
        The cleaned fragment, serialized as HTML (not XML, so empty elements
        are not collapsed into self-closing tags).
    """
    URL_BASE = "https://www.vasp.at/wiki/index.php/"
    root = fromstring(_strip_header_and_footer(page_html))

    _style_tag_links(root, URL_BASE)

    for table in root.xpath("//table"):
        # This is a hack to find the warning/mind/important etc tables
        if table.attrib.get("style"):
            _table_to_alert(table)
        else:
            for css_class in ("table", "table-striped", "table-hover"):
                table.classes.add(css_class)
            table.classes.discard("wikitable")

    return tostring(root, method="html").decode("utf-8")

In [210]:
import os

OUTPUT_DIR = "temp_vasp_html"
# Create the output folder up front; open(..., "w") does not create directories.
os.makedirs(OUTPUT_DIR, exist_ok=True)

with open("vasp-html-raw.json", "r") as f:
    html_dict = json.load(f)
# One cleaned HTML file per cached page, named after its tag.
for tag, page_html in html_dict.items():
    new_page_html = clean_html(page_html)
    with open(os.path.join(OUTPUT_DIR, f"{tag}.html"), "w") as f:
        f.write(new_page_html)

# HTML conversion (mwcomposerfromhell)

In [2]:
# Start from an empty database: a populated one is refreshed, an empty one is built from scratch.
database = {}
if database:
    logging.info("Database exists. Checking for updates.")
    database = refresh_database(database)
else:
    logging.info("Database doesn't exist. Creating it now.")
    database = generate_database()
    logging.info("Database created.")

INFO:root:Database doesn't exist. Creating it now.
INFO:root:Database created.


In [44]:
def tidy_wikicode(wikicode, templates=True, formatting=True, strip=True, math=True, unescape=True, footer=False, header=False):
    """Simplify a piece of wiki markup (or mixed wikicode/HTML) with regex rewrites.

    Parameters
    ----------
    wikicode : object
        The markup; it is converted with ``str()`` first.
    templates : bool
        Rewrite ``{{TAG|X}}``/``{{TAGDEF|X}}`` to ``<tag-ref>X</tag-ref>``,
        ``{{FILE|X}}`` to ``<file-ref>X</file-ref>``, ``{{=}}`` (with the
        whitespace around it) to ``=`` and drop ``{{sc|...}}``.
    formatting : bool
        Rewrite ``'''x'''`` to ``<b>x</b>`` and ``''x''`` to ``<i>x</i>``.
    strip : bool
        Strip leading/trailing whitespace (done before header/footer removal).
    math : bool
        Replace simple ``<math>`` numbers (``10^{n}``, ``a \\times 10^{n}``,
        decimals, integers) by plain text such as ``1E-4``.
    unescape : bool
        Decode HTML entities.
    footer : bool
        Drop everything after the last ``----`` and the last ``<hr />``.
    header : bool
        Drop everything up to the first ``----`` and the first ``<hr />``.

    Returns
    -------
    str
        The tidied markup.
    """
    # Imported locally: another cell of this notebook rebinds the global name
    # `html` to a string, which would break a global `html.unescape` lookup.
    from html import unescape as _html_unescape

    wikicode = str(wikicode)

    if templates:
        pattern_tag = r"\{\{TAG\|(\w+)\}\}"
        pattern_file = r"\{\{FILE\|(\w+)\}\}"
        pattern_tagdef = r"\{\{TAGDEF\|(\w+)\}\}"
        pattern_sc = r"\{\{sc\|(.*?)\}\}"
        # TODO: some edge cases have a third | in the NB template, should discard
        # pattern_nb = r"\{\{NB\|\s*(\w+)\s*\|(.*?)\}\}"

        wikicode = re.sub(r"\s*\{\{=\}\}\s*", "=", wikicode)
        wikicode = re.sub(pattern_tag, r"<tag-ref>\1</tag-ref>", wikicode)
        wikicode = re.sub(pattern_file, r"<file-ref>\1</file-ref>", wikicode)
        wikicode = re.sub(pattern_tagdef, r"<tag-ref>\1</tag-ref>", wikicode)
        wikicode = re.sub(pattern_sc, "", wikicode)
        #wikicode = re.sub(pattern_nb, r"<div><\1>\2</\1></div>", wikicode)

    if formatting:
        pattern_bold = r"'''(.*?)'''"
        pattern_italics = r"''(.*?)''"

        # Bold first: its delimiter contains the italics delimiter.
        wikicode = re.sub(pattern_bold, r"<b>\1</b>", wikicode)
        wikicode = re.sub(pattern_italics, r"<i>\1</i>", wikicode)

    if math:
        # TODO: next two can be combined
        wikicode = re.sub(r"\<math\>\s*10\^\{([+-]*\d+)\}\s*\<\/math\>", r"1E\1", wikicode)
        wikicode = re.sub(
            r"\<math\>\s*(\d+)\s*\\times\s*10\^\{([+-]*\d+)\}\s*\<\/math\>", r"\1E\2", wikicode
        )
        # TODO: can be combined
        # The dot is escaped so only real decimals match (not e.g. "2^3").
        wikicode = re.sub(r"\<math\>\s*(\d+\.\d+)\s*<\/math\>", r"\1", wikicode)
        wikicode = re.sub(r"\<math\>\s*(\d+)\s*<\/math\>", r"\1", wikicode)

    if unescape:
        wikicode = _html_unescape(wikicode)

    if strip:
        wikicode = wikicode.strip()

    if footer:
        # Works for the wikicode itself
        wikicode = wikicode.rsplit('----', 1)[0]
        # Works for the mixed wikicode/HTML
        # TODO: not robust
        wikicode = wikicode.rsplit('<hr />', 1)[0]
    if header:
        # Split from the left: the header ends at the FIRST rule. (Splitting
        # from the right would return the footer when footer=False.)
        # Works for the wikicode itself
        wikicode = wikicode.split('----', 1)[-1]
        # Works for the mixed wikicode/HTML
        # TODO: not robust
        wikicode = wikicode.split('<hr />', 1)[-1]

    return wikicode

# Workflow
We want to parse 5 specific pages + every INCAR tag.

#### Option 1: Database exists
1. pull *only* the time stamp of the 5 pages + every INCAR tag.
2. compare time stamps to those in the database
3. update list of pages to parse, only need those that have been updated
4. parse the list of pages

#### Option 2: Database doesn't exist
1. pull the default list of pages 


In [192]:
# Reset to an empty database; with an empty database the `if not database:` check
# in the create/refresh cell below takes the "create it now" branch.
database = {}

In [193]:
fudge_dates = True
# Change the date on (up to) 20 distinct random entries in the database to 1980
import random
if fudge_dates and database:
    titles = list(database['INCAR'].keys())
    # sample() draws without replacement, so the same entry is never picked twice;
    # min() keeps it from raising when there are fewer than 20 entries.
    for title in random.sample(titles, min(20, len(titles))):
        database['INCAR'][title]["last_revised"] = "1980-01-01T00:00:00Z"
        #print(f"Changed ['INCAR']['{title}'] to 1980-01-01T00:00:00Z")

In [194]:
# A populated database is only refreshed; an empty one is generated from scratch.
if database:
    logging.info("Database exists. Checking for updates.")
    database = refresh_database(database)
else:
    logging.info("Database doesn't exist. Creating it now.")
    database = generate_database()
    logging.info("Database created.")

INFO:root:Database doesn't exist. Creating it now.
INFO:root:Database created.


# Defaults, Options and Data Type

These can be obtained from `TAGDEF` and maybe `DEF` templates, depending on your luck...

The format is:

```
{{TAGDEF | TAGNAME | DATATYPE | DEFAULT}}
```

where DATATYPE is either:
```
1. integer
2. real
3. logical (also written as .TRUE. {{!}} .FALSE.)
4. (something) array (only seen real from a cursory look)
5. A list of options (e.g., 1 {{!}} 2-10 {{!}} [integer] > 0)
```

DEFAULT isn't always there, and if it is, could be a string with some more explanation or even 'not set' or '<math>10^{-4}</math>' (with the HTML tags).

the DEF template has the form:
```
{{DEF | TAGNAME | VALUE | CONDITION | VALUE | CONDITION | ....}}
```
where CONDITION is a string describing other tags (usually) and VALUE is an integer or string or whatever

lam

Example:
```
{{TAGDEF|PREC|Low {{!}} Medium {{!}} High {{!}} Normal {{!}} Single {{!}} Accurate}}
```

(no default, list of values)

```
{{DEF|ISTART|1|if a {{FILE|WAVECAR}} file exists|0|else}}
```


In [3]:
# Print a short summary per INCAR entry: the type is skipped when it is
# "Unknown", options/default are skipped when they are empty.
for tag, value in database['INCAR'].items():
    print(f'{tag}')
    if not value['type'] == "Unknown":
        print(f"*** type: {value['type']}")
    print(f"*** description: {value['description']}")
    for field in ('options', 'default'):
        if value[field]:
            print(f"*** {field}: {value[field]}")
    print("-----------------------------------")

PREC
*** type: string
*** description: <tag-ref>PREC</tag-ref> specifies the "precision"-mode.
*** options: {'Low': '...[parsing not implemented]', 'Medium': '...[parsing not implemented]', 'High': '...[parsing not implemented]', 'Normal': '...[parsing not implemented]', 'Single': '...[parsing not implemented]', 'Accurate': '...[parsing not implemented]'}
*** default: Medium (for VASP.4.X) or Normal (for VASP.5.X)
-----------------------------------
ICHARG
*** type: integer
*** description: <tag-ref>ICHARG</tag-ref> determines how VASP constructs the <i>initial</i> charge density.
*** options: {'0': '...[parsing not implemented]', '1': '...[parsing not implemented]', '2': '...[parsing not implemented]', '4': '...[parsing not implemented]'}
*** default: 2 (if <tag-ref>ISTART</tag-ref>=0) or 0 (else)
-----------------------------------
ISTART
*** type: integer
*** description: <tag-ref>ISTART</tag-ref> determines whether or not to read the <file-ref>WAVECAR</file-ref> file.
*** options: 

# Old Debugging Crap

In [62]:
import mwparserfromhell as mwp
from mwcomposerfromhell import (
    ArticleResolver,
    compose,
    Namespace,
    WikicodeToHtmlComposer,
)

def _get_composer(templates):
    """Return a wikicode-to-HTML composer whose "Template" namespace is ``templates``."""
    article_resolver = ArticleResolver()
    template_namespace = Namespace(templates)
    article_resolver.add_namespace("Template", template_namespace)
    composer = WikicodeToHtmlComposer(resolver=article_resolver)
    return composer

#info = database['INCAR']['ISMEAR']['info']
#templates = {"TAG": mwp.parse('<tt>{{{1}}}</tt>'), "FILE": mwp.parse('<tt>{{{1}}}</tt>')}

#composer = _get_composer(templates)
#print(composer.compose(info))
#print('----------')

def get_html(title):
    """Fetch parsed page HTML through the Action API (``action=parse``).

    ``title`` is a single page title or a list of titles. One request is sent
    per title; the ``['parse']['text']`` entry of each JSON answer is collected
    and the resulting list is returned (also for a single title).
    """
    titles = title if isinstance(title, list) else [title]
    return [
        requests.get(
            API_URL,
            headers={"User-Agent": USER_AGENT},
            params={
                "page": page_title,
                "action": "parse",
                "format": "json",
                "formatversion": "2",
            },
        ).json()['parse']['text']
        for page_title in titles
    ]

# Not available on VASP wiki :(
def get_html_rest(title):
    """Request page HTML through the REST API (``page/<title>/html``).

    ``title`` is a single page title or a list of titles; one GET request is
    sent per title.

    Returns
    -------
    The response object of the LAST request only (earlier responses are
    discarded, as before), or ``None`` when an empty list is passed.
    """
    titles = title if isinstance(title, list) else [title]
    if not titles:
        # Nothing to request; previously this hit an unbound `req`.
        return None

    headers = {"User-Agent": USER_AGENT}
    req = None
    for t in titles:
        url = REST_API_URL + "page/" + t + "/html"
        req = requests.get(url, headers=headers)

    return req

def get_html_render(title):
    """Fetch rendered page HTML (``action=render``) for one or several titles.

    ``title`` is a single page title or a list of titles. One request is sent
    per title and the response texts are returned as a list, also for a single
    title. NOTE: this redefines the single-title ``get_html_render`` from the
    top of the notebook, which returns a plain string.
    """
    titles = title if isinstance(title, list) else [title]
    return [
        requests.get(
            WIKI_URL,
            headers={"User-Agent": USER_AGENT},
            params={"title": page_title, "action": "render"},
        ).text
        for page_title in titles
    ]

def get_html_parasoid(text):
    """Convert wikitext to HTML with Wikipedia's wikitext-to-HTML REST endpoint.

    ``text`` is a single wikitext string or a list of them; each one is POSTed
    separately.

    Returns
    -------
    list of str
        The response text for every input, in input order. (A leftover debug
        ``return req`` used to hand back the first raw response and made the
        collection below unreachable.)
    """
    PARSOID_URL = "https://en.wikipedia.org/api/rest_v1/transform/wikitext/to/html/"
    pages = []

    texts = text if isinstance(text, list) else [text]
    for t in texts:
        headers = {"User-Agent": USER_AGENT}
        req = requests.post(PARSOID_URL, headers=headers, data={"wikitext": t})#, timeout=(None, 3))
        pages.append(req.text)

    return pages

def get_html_rvparse(title):
    """Fetch revision content for one or several titles in a single query.

    ``title`` is a single title or a list of titles. Lists longer than 50 are
    processed in chunks of 50 (presumably the API's per-request title limit —
    TODO confirm); shorter lists are joined with ``|`` into one request.

    Returns
    -------
    list of str
        The ``content`` of the first revision of every returned page.

    Raises
    ------
    ValueError
        If the answer has no ``["query"]["pages"]`` entry.
    """
    CHUNK_SIZE = 50

    if isinstance(title, list):
        if len(title) > CHUNK_SIZE:
            pages = []
            for start in range(0, len(title), CHUNK_SIZE):
                # Recurse per chunk (this used to call lxml's `parse` by mistake).
                pages.extend(get_html_rvparse(title[start : start + CHUNK_SIZE]))
            return pages
        title = "|".join(title)

    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "titles": title,
        "format": "json",
        "formatversion": "2",
        "rvparse": "max",
    }
    #if get_text:
    #    params["rvslots"] = "main"
    headers = {"User-Agent": USER_AGENT}

    req = requests.get(API_URL, headers=headers, params=params).json()
    try:
        req_pages = req["query"]["pages"]
    except (KeyError, TypeError) as err:
        raise ValueError(
            f"Something went wrong..., this is the response I got.\n. {req}"
        ) from err

    pages = []
    for page in req_pages:
        page_title = page["title"]
        if len(page["revisions"]) > 1:
            print(f"WARNING: More than one revision found for {page_title}. Using the first one.")
        pages.append(page["revisions"][0]["content"])

    return pages

#pages = get_html_render(["ICHARG", "SIGMA"])
#print(pages[1])
#with open("test.html", "w") as f:
#    f.write(pages[1])
#text = [database['ICHARG']['text'].string, database['SIGMA']['text'].string]
#pages = get_html_parsoid(text)
#with open("test.html", "w") as f:
#    f.write(pages[0])

# Top-level keys of the database (other cells of this notebook index it with 'INCAR').
files = list(database.keys())
#n_pages = len(files) # 50
#files = files[:n_pages]
#text = [database[files[i]]['text'].string for i in range(n_pages)]
##pages = get_html(files)
#pages = get_html_parasoid(files)
#with open("test.html", "w") as f:
#    f.write(pages[0])
#req = get_html_rvparse(["ICHARG", "SIGMA"])

In [63]:
# Named `algo_html` rather than `html` so the stdlib `html` module imported at
# the top of the notebook is not shadowed by a string.
algo_html = get_html_render(['ALGO'])[0]
with open('temp.html', 'w') as f:
    f.write(algo_html)
#print(algo_html)

# HTML conversion

In [None]:
import mwparserfromhell as mwp
import mwcomposerfromhell as mwc
from mwcomposerfromhell import (
    ArticleResolver,
    compose,
    Namespace,
    WikicodeToHtmlComposer,
)

import html

def _get_composer(templates):
    """Create a ``WikicodeToHtmlComposer`` that looks up templates in ``templates``.

    Same helper as the ``_get_composer`` defined earlier in this notebook.
    """
    resolver = ArticleResolver()
    resolver.add_namespace(
        "Template",
        Namespace(templates),
    )
    html_composer = WikicodeToHtmlComposer(resolver=resolver)
    return html_composer


import json
import pickle
#with open('test-vasp.json', 'r') as f:
#    database = json.load(f)

#with open('test-vasp.pickle', 'rb') as f:
#    database = pickle.load(f)

# Snapshot the current database to disk with the highest pickle protocol.
with open('test-vasp.pickle', 'wb') as f:
    pickle.dump(database, f, protocol=pickle.HIGHEST_PROTOCOL)

# Convert the stored 'info' wikicode of the ISIF tag to HTML with mwcomposerfromhell.
wikicode = database['INCAR']['ISIF']['info']
#templates = {"TAG": mwp.parse('<tag-ref>{{{1}}}</tag-ref>'), "FILE": mwp.parse('<file-ref>{{{1}}}</file-ref>')}
#templates = {}
#composer = _get_composer(templates)
page_html = mwc.compose(wikicode)
# Post-process the composed HTML: only the template and bold/italic rewrites and
# the footer removal are enabled; strip, math and unescape are switched off.
page_html = tidy_wikicode(page_html, templates=True, formatting=True, strip=False, math=False, unescape=False, footer=True)
#pattern = re.compile(r'<p>&lt;tag-ref&gt;(.*?)&lt;/tag-ref&gt;</p>')
#page_html = pattern.sub(r'<tt>\1</tt>', page_html)
#pattern = re.compile(r'<p>&lt;file-ref&gt;(.*?)&lt;/file-ref&gt;</p>')
#page_html = pattern.sub(r'<tt>\1</tt>', page_html)

#html = get_html_render(['ISMEAR'])[0]
# Write the result to a scratch file and also show it in the cell output.
with open('temp2.html', 'w') as f:
    f.write(page_html)
print(page_html)

In [None]:
#for section in wikicode.get_sections():
#    headings = section.filter_headings(matches=lambda heading: 'related tags' in heading.title.lower())
#    print(headings)
#    print('------------------*********--------------')

# Show the IBRION 'info' text with everything after the last '----' rule cut off.
wikicode = database['INCAR']['IBRION']['info']
print(wikicode.rsplit('----', 1)[0])

# Benchmarking
Action API: 0.6s/page

Render: 0.6s/page

Rest API: 0.6s/page (can't actually get HTML so...)

Wikipedia Parsoid: ~0.3s/page but shitty formatting (sort of)

In [None]:
#req = get_html("INCAR")
#myjson = req.json()
# html_string = myjson['parse']['text']
#with open("test.html", "w") as f:
#    f.write(html_string)

# STRETCH GOAL
The formatting of these things is utter shit, so it might be really difficult to do the "comment that explains each value" thing...

Look at the difference between ISMEAR and ICHARG. They're in lists, with possible indented list subitems, descriptions on same line or not, typos meaning there's no '=' after the value....

Focus on the HTML webpage for now

In [None]:
# Collect the list items of the ICHARG page that start with "*{{TAG|ICHARG}}";
# whatever follows that prefix (stripped) is recorded as one option.
tag = "ICHARG"
text = database[tag]["text"]
sections = text.get_sections()
#for section in sections:
#    if section.title is not None:
#        # Strip title of whitespace (leading and trailing)
#        if section.title.strip() == "Tag options":
#            lists = section.get_lists()
wikilink = "*{{TAG|%s}}" % tag
lists = text.get_lists()
options = [
    item.split(wikilink)[1].strip()
    for lll in lists
    for item in lll.fullitems
    if item.startswith(wikilink)
]

print(options)
#options_dict = {}
#for option in options:
#    setting = wtp.parse(option.split(":")[0]).plain_text()
#    explanation = wtp.parse(option.split(":")[1]).plain_text()
#    options_dict.update({setting: explanation})

#options_dict

# PAW stuff

In [None]:
# Scan sections 1-2 of the PAW page: bold text that is not a number is
# recorded as a recommended pseudopotential.
tag = "Available PAW potentials"
text = database[tag]["text"]
sections = text.get_sections()[1:3]
bolds = []
for section in sections:
    print(section.title)
    for b in section.get_bolds():
        bold_text = b.plain_text()
        try:
            float(bold_text)
        except ValueError:
            # Only "not a number" is expected here; a bare except would also
            # swallow KeyboardInterrupt and real errors.
            print("Found recommended pseudo: " + bold_text)
            # TODO: doesn't work for the GW ones....
            bolds.append(bold_text)

#bolds