In [1]:
#import wikitextparser as wtp
import requests

from codex.database.vasp import generate_database, refresh_database
# Client identification; the request helpers in this notebook send it as the User-Agent header.
USER_AGENT = "dft-codex/0.0.0 (Omar A. Ashour, ashour@berkeley.edu)"
# Action API endpoint of the VASP wiki (used below with action=parse and action=query).
API_URL = "https://www.vasp.at/wiki/api.php"
# index.php entry point of the VASP wiki (used below with action=render).
WIKI_URL = "https://www.vasp.at/wiki/index.php"
# Base URL of the wiki's REST API; get_html_rest appends "page/<title>/html" to it.
REST_API_URL = "https://www.vasp.at/wiki/rest.php/v1/"

# Show INFO-level log records so the logging.info() calls in later cells are visible.
import logging 
logging.basicConfig(level=logging.INFO)

# Workflow
We want to parse 5 specific pages + every INCAR tag.

#### Option 1: Database exists
1. Pull *only* the timestamps of the 5 pages + every INCAR tag.
2. Compare these timestamps to the ones stored in the database.
3. Build the list of pages to parse; only pages that have been updated need to be re-parsed.
4. Parse that list of pages.

#### Option 2: Database doesn't exist
1. Pull the default list of pages (the 5 specific pages + every INCAR tag).


In [2]:
# Start with an empty database; the create-or-refresh cell below calls
# generate_database() when this is still empty.
database = {}

In [None]:
# Debug aid: back-date the "last_revised" stamp of up to 20 INCAR entries to 1980,
# presumably so that the refresh step treats them as out of date (TODO confirm
# against refresh_database).
import random

fudge_dates = True
if fudge_dates and database:
    # Build the title list once instead of on every iteration.
    incar_titles = list(database['INCAR'].keys())
    # Sample WITHOUT replacement: repeated random.choice() calls can pick the
    # same title twice, which back-dates fewer than 20 distinct entries.
    # min() keeps this valid when the database holds fewer than 20 tags.
    for title in random.sample(incar_titles, min(20, len(incar_titles))):
        database['INCAR'][title]["last_revised"] = "1980-01-01T00:00:00Z"
        #print(f"Changed ['INCAR']['{title}'] to 1980-01-01T00:00:00Z")

In [6]:
# A non-empty database is handed to refresh_database(); an empty one is
# replaced by the result of generate_database().
if database:
    logging.info("Database exists. Checking for updates.")
    database = refresh_database(database)
else:
    logging.info("Database doesn't exist. Creating it now.")
    database = generate_database()
    logging.info("Database created.")

INFO:root:Database exists. Checking for updates.
INFO:root:Updating the following pages:LMAXPAW, IMIX, NGX, NGYF, EVENONLY, Profiling, PFLAT, NGYROMAG, LMONO, LBERRY, SMEARINGS, VDW R0, QSPIRAL, ML NMDINT, ML W1, ML NATOM COUPLED, LUSENCCL, ML OUTBLOCK, ALDAX
INFO:root:Database is up to date.


# Converting to HTML

In [25]:
import mwcomposerfromhell as mwc
# Take the stored 'info' entry of the ISMEAR tag, pass it through
# mwcomposerfromhell's compose(), and print the resulting string.
# NOTE(review): 'info' is presumably a parsed mwparserfromhell object (later
# cells call filter_templates() on it) -- confirm.
wikicode = database['INCAR']['ISMEAR']['info']
html_string = mwc.compose(wikicode)
print(html_string)

<p>{{TAGDEF|ISMEAR|-5 {{!}} -4 {{!}} -3 {{!}} -2 {{!}} -1 {{!}} 0 {{!}} [integer]>0 |1}}
</p><p>Description: {{TAG|ISMEAR}} determines how the partial occupancies <i>f</i><sub>n<b>k</b></sub> are set for each orbital. {{TAG|SIGMA}} determines the width of the smearing in eV.
<hr />
</p><h2> Tag options </h2>
<ul><li>{{TAG|ISMEAR}}=<i>N</i> (<i>N</i>&gt;0): method of Methfessel-Paxton order <i>N</i>.
</li></ul><dl><dd><b>Mind</b>: For the Methfessel-Paxton scheme the partial occupancies can be negative, as well as larger than 1. This can yield erroneous results for insulators.
</dd></dl><ul><li>{{TAG|ISMEAR}}=0: Gaussian smearing.
</li><li>{{TAG|ISMEAR}}=&minus;1: Fermi smearing.
</li><li>{{TAG|ISMEAR}}=&minus;2: partial occupancies are read in from the {{FILE|WAVECAR}} or {{FILE|INCAR}} file, and kept fixed throughout run.
</li></ul><dl><dd>To set the occupancies, specify
  {{TAG|FERWE}} = f(1) f(2) f(3) ... f({{TAG|NBANDS}}&times;N<sub><b>k</b></sub>)
</dd><dd>and for spin-polarized c

# Defaults, Options and Data Type

These can be obtained from `TAGDEF` and maybe `DEF` templates, depending on your luck...

The format is:

```
{{TAGDEF | TAGNAME | DATATYPE | DEFAULT}}
```

where DATATYPE is either:
```
1. integer
2. real
3. logical (also written as .TRUE. {{!}} .FALSE.)
4. (something) array (only seen real from a cursory look)
5. A list of options (e.g., 1 {{!}} 2-10 {{!}} [integer] > 0)
```

DEFAULT isn't always present; when it is, it may be a plain value, a string with some further explanation, the literal 'not set', or markup such as '<math>10^{-4}</math>' (with the HTML tags).

the DEF template has the form:
```
{{DEF | TAGNAME | VALUE | CONDITION | VALUE | CONDITION | ....}}
```
where CONDITION is a string that (usually) refers to other tags, and VALUE is the value that applies under that condition (an integer, a string, etc.).

Example:
```
{{TAGDEF|PREC|Low {{!}} Medium {{!}} High {{!}} Normal {{!}} Single {{!}} Accurate}}
```

(no default, list of values)

```
{{DEF|ISTART|1|if a {{FILE|WAVECAR}} file exists|0|else}}
```


In [73]:
# For every INCAR tag, gather the TAGDEF templates (into `options`) and the
# DEF templates (into `defaults`) found in its 'info' entry, then print them.
def _is_tagdef(template):
    """Return whether the template's name matches "TAGDEF"."""
    return template.name.matches("TAGDEF")


def _is_def(template):
    """Return whether the template's name matches "DEF".

    Intended to select the DEF template only, not other names such as TAGDEF.
    """
    return template.name.matches("DEF")


options = {}
defaults = {}
for name, tag in database['INCAR'].items():
    wikicode = tag['info']
    options[name] = wikicode.filter_templates(matches=_is_tagdef)
    defaults[name] = wikicode.filter_templates(matches=_is_def)

    # Options: print the whole list of matches, or None when there are none.
    if not options[name]:
        print(f"*****{name}: None\n")
    else:
        print(f"*****{name}: {options[name]}\n")

    # Defaults: print only the first match, or None when there are none.
    if not defaults[name]:
        print(f"+++++{name}: None\n")
    else:
        print(f"+++++{name}: {defaults[name][0]}\n")

*****PREC: ['{{TAGDEF|PREC|Low {{!}} Medium {{!}} High {{!}} Normal {{!}} Single {{!}} Accurate}}']

+++++PREC: {{DEF|PREC|Medium|for VASP.4.X|Normal|for VASP.5.X}}

*****ICHARG: ['{{TAGDEF|ICHARG|0 {{!}} 1 {{!}} 2 {{!}} 4}}']

+++++ICHARG: {{DEF|ICHARG|2|if {{TAG|ISTART}}{{=}}0|0|else}}

*****ISTART: ['{{TAGDEF|ISTART|0 {{!}} 1 {{!}} 2 {{!}} 3}}']

+++++ISTART: {{DEF|ISTART|1|if a {{FILE|WAVECAR}} file exists|0|else}}

*****INIWAV: ['{{TAGDEF|INIWAV|0 {{!}} 1}}']

+++++INIWAV: {{DEF|INIWAV|1|}}

*****NSW: ['{{TAGDEF|NSW|[integer]|0}}']

+++++NSW: None

*****NELM: ['{{TAGDEF|NELM|[integer]|60}}']

+++++NELM: None

*****EDIFF: ['{{TAGDEF|EDIFF|[real]|<math>10^{-4}</math>}}']

+++++EDIFF: None

*****ALGO: ['{{TAGDEF|ALGO|Normal {{!}} VeryFast {{!}} Fast {{!}} Conjugate {{!}} All {{!}} Damped {{!}} Subrot {{!}} Eigenval {{!}} Exact {{!}} None {{!}} Nothing {{!}} CHI {{!}} G0W0 {{!}} GW0 {{!}} GW {{!}} scGW0 {{!}} scGW {{!}} G0W0R {{!}} GW0R {{!}} GWR {{!}} scGW0R {{!}} scGWR {{!}} ACFDT {

In [70]:
# Spot check: display the name of the template at (hard-coded) index 6 among
# all templates found in the PREC entry.
wikicode = database['INCAR']['PREC']['info']
wikicode.filter_templates()[6].name

'DEF'

# Old Debugging Crap

In [9]:
def get_html(title):
    """Fetch page text through the Action API using ``action=parse``.

    Args:
        title: A page title, or a list of page titles.

    Returns:
        A list with one entry per requested title, each taken from
        ``["parse"]["text"]`` of the JSON response.
    """
    titles = title if isinstance(title, list) else [title]

    def _fetch(page_title):
        # One GET request per page.
        query = {
            "page": page_title,
            "action": "parse",
            "format": "json",
            "formatversion": "2",
        }
        response = requests.get(
            API_URL, headers={"User-Agent": USER_AGENT}, params=query
        )
        return response.json()['parse']['text']

    return [_fetch(page_title) for page_title in titles]

# Not available on VASP wiki :(
def get_html_rest(title):
    """Request page HTML from the REST API (``page/<title>/html``).

    Args:
        title: A page title, or a list of page titles.

    Returns:
        The ``requests.Response`` of the last title requested, or ``None``
        when ``title`` is an empty list. Only the last response is returned;
        that shape is kept so existing callers see the same result.
    """
    from urllib.parse import quote

    titles = title if isinstance(title, list) else [title]
    if not titles:
        # Nothing to request; previously this hit an unbound local variable.
        return None

    headers = {"User-Agent": USER_AGENT}
    req = None
    for t in titles:
        # The title is a single path segment, so escape everything that is
        # not URL-safe (including "/") before splicing it into the URL.
        url = REST_API_URL + "page/" + quote(t, safe="") + "/html"
        req = requests.get(url, headers=headers)

    return req

def get_html_render(title):
    """Fetch pages from ``WIKI_URL`` with ``action=render``.

    Args:
        title: A page title, or a list of page titles.

    Returns:
        A list holding the response text for each requested title, in order.
    """
    titles = title if isinstance(title, list) else [title]
    return [
        requests.get(
            WIKI_URL,
            headers={"User-Agent": USER_AGENT},
            params={"title": page_title, "action": "render"},
        ).text
        for page_title in titles
    ]

def get_html_parasoid(text):
    """Convert wikitext to HTML by POSTing it to Wikipedia's transform endpoint.

    Args:
        text: A wikitext string, or a list of wikitext strings.

    Returns:
        A list with the response text for each input, in input order.
    """
    PARSOID_URL = "https://en.wikipedia.org/api/rest_v1/transform/wikitext/to/html/"

    texts = text if isinstance(text, list) else [text]

    pages = []
    for t in texts:
        req = requests.post(
            PARSOID_URL,
            headers={"User-Agent": USER_AGENT},
            data={"wikitext": t},
        )#, timeout=(None, 3))
        # Collect every result. An early `return req` used to sit here; it
        # left after the first item and handed back the raw response object
        # instead of the list of HTML strings.
        pages.append(req.text)

    return pages

def get_html_rvparse(title):
    """Fetch page content via ``action=query`` / ``prop=revisions`` with ``rvparse`` set.

    Args:
        title: A page title, or a list of page titles. Lists longer than 50
            titles are split into chunks of 50 that are requested separately.

    Returns:
        A list with the ``content`` of the first revision of each page in the
        response. Pages that come back without revisions are skipped with a
        printed warning. An empty list of titles yields an empty list.

    Raises:
        ValueError: If the response has no ``["query"]["pages"]`` entry.
    """
    if isinstance(title, list):
        if not title:
            # Nothing to ask for; avoid sending a query with no titles.
            return []
        if len(title) > 50:
            # Recurse on each chunk of at most 50 titles (this used to call
            # an undefined name, `parse`).
            pages = []
            for start in range(0, len(title), 50):
                pages.extend(get_html_rvparse(title[start : start + 50]))
            return pages
        title = "|".join(title)

    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "titles": title,
        "format": "json",
        "formatversion": "2",
        "rvparse": "max",
    }
    headers = {"User-Agent": USER_AGENT}

    req = requests.get(API_URL, headers=headers, params=params).json()
    try:
        req_pages = req["query"]["pages"]
    except (KeyError, TypeError) as err:
        # Only a missing or malformed "query"/"pages" entry is translated;
        # anything else propagates unchanged.
        raise ValueError(f"Something went wrong..., this is the response I got.\n. {req}") from err

    # A leftover `return req` used to sit here, making the loop below unreachable.
    pages = []
    for page in req_pages:
        page_title = page["title"]
        revisions = page.get("revisions")
        if not revisions:
            print(f"WARNING: No revisions found for {page_title}. Skipping it.")
            continue
        if len(revisions) > 1:
            print(f"WARNING: More than one revision found for {page_title}. Using the first one.")
        pages.append(revisions[0]["content"])

    return pages

#pages = get_html_render(["ICHARG", "SIGMA"])
#print(pages[1])
#with open("test.html", "w") as f:
#    f.write(pages[1])
#text = [database['ICHARG']['text'].string, database['SIGMA']['text'].string]
#pages = get_html_parsoid(text)
#with open("test.html", "w") as f:
#    f.write(pages[0])

# Top-level keys of the database; only the commented-out experiments around
# this line refer to `files`.
files = list(database.keys())
#n_pages = len(files) # 50
#files = files[:n_pages]
#text = [database[files[i]]['text'].string for i in range(n_pages)]
##pages = get_html(files)
#pages = get_html_parasoid(files)
#with open("test.html", "w") as f:
#    f.write(pages[0])
#req = get_html_rvparse(["ICHARG", "SIGMA"])

In [7]:
import mwcomposerfromhell as mwc
import mwparserfromhell as mwp


# Benchmarking
Action API: 0.6s/page

Render: 0.6s/page

Rest API: 0.6s/page (can't actually get HTML so...)

Wikipedia Parsoid: ~0.3s/page, but the formatting is poor (sort of)

In [8]:
#req = get_html("INCAR")
#myjson = req.json()
# html_string = myjson['parse']['text']
#with open("test.html", "w") as f:
#    f.write(html_string)

# STRETCH GOAL
The formatting of these pages is very inconsistent, so it might be really difficult to do the "comment that explains each value" thing...

Look at the difference between ISMEAR and ICHARG. They're in lists, with possible indented list subitems, descriptions on same line or not, typos meaning there's no '=' after the value....

Focus on the HTML webpage for now

In [9]:
# Experiment: collect the list items of a tag page that start with
# "*{{TAG|<tag>}}" and keep the text that follows that prefix.
#
# NOTE(review): the recorded output of this cell is KeyError: 'ICHARG'. Other
# cells read tags as database['INCAR'][<tag>]['info'], so the lookup below
# appears to predate that layout -- confirm before reusing this cell.
# NOTE(review): get_lists()/fullitems are presumably wikitextparser APIs (see
# the commented-out `wtp` usage at the bottom) -- TODO confirm.
tag = "ICHARG"
text = database[tag]["text"]
# `sections` is only referenced by the commented-out loop below.
sections = text.get_sections()
options = []
#for section in sections:
#    if section.title is not None:
#        # Strip title of whitespace (leading and trailing)
#        if section.title.strip() == "Tag options":
#            lists = section.get_lists()
# Prefix that marks a list item describing one setting of this tag.
wikilink="*{{TAG|"+tag+"}}"
lists = text.get_lists()
for lll in lists:
    for item in lll.fullitems:
        if item.startswith(wikilink):
            # Keep what follows the prefix, with surrounding whitespace removed.
            options.append(item.split(wikilink)[1].strip())

print(options)
# Unfinished idea: split each option at ":" into a setting and its explanation.
#options_dict = {}
#for option in options:
#    setting = wtp.parse(option.split(":")[0]).plain_text()
#    explanation = wtp.parse(option.split(":")[1]).plain_text()
#    options_dict.update({setting: explanation})

#options_dict

KeyError: 'ICHARG'

In [None]:
# Scan sections 1 and 2 of the "Available PAW potentials" page for bold text.
# Bold entries that do not parse as a number are reported and collected in
# `bolds` as recommended pseudopotential names.
tag = "Available PAW potentials"
text = database[tag]["text"]
sections = text.get_sections()[1:3]
bolds = []
for section in sections:
    print(section.title)
    for b in section.get_bolds():
        label = b.plain_text()
        try:
            float(label)
        except ValueError:
            # Only a failed number conversion marks a name; a bare `except:`
            # would also swallow unrelated errors and KeyboardInterrupt.
            print("Found recommended pseudo: " + label)
            # TODO: doesn't work for the GW ones....
            bolds.append(label)

#bolds

 Recommended potentials for DFT calculations 
Found recommended pseudo: Sr_sv
Found recommended pseudo: Y_sv
Found recommended pseudo: Zr_sv
Found recommended pseudo: Nb_sv
Found recommended pseudo: Mo_sv
Found recommended pseudo: Tc_pv
Found recommended pseudo: Ru_pv
Found recommended pseudo: Rh_pv
Found recommended pseudo: Pd
Found recommended pseudo: Ag
Found recommended pseudo: Cd
Found recommended pseudo: In_d
Found recommended pseudo: Sn_d
Found recommended pseudo: Sb
Found recommended pseudo: Te
Found recommended pseudo: I
Found recommended pseudo: Xe
Found recommended pseudo: Cs_sv
Found recommended pseudo: Ba_sv
Found recommended pseudo: La
Found recommended pseudo: Ce
Found recommended pseudo: Pr_3
Found recommended pseudo: Nd_3
Found recommended pseudo: Pm_3
Found recommended pseudo: Sm_3
Found recommended pseudo: Eu_2
Found recommended pseudo: Gd_3
Found recommended pseudo: Tb_3
Found recommended pseudo: Dy_3
Found recommended pseudo: Ho_3
Found recommended pseudo: Er_3
Fou