In [1]:
#import wikitextparser as wtp
import requests

from codex.database.vasp import generate_database, refresh_database, get_types_options_defaults
USER_AGENT = "dft-codex/0.0.0 (Omar A. Ashour, ashour@berkeley.edu)"
API_URL = "https://www.vasp.at/wiki/api.php"
WIKI_URL = "https://www.vasp.at/wiki/index.php"
REST_API_URL = "https://www.vasp.at/wiki/rest.php/v1/"

import logging 
import re
logging.basicConfig(level=logging.INFO)
import html
from copy import deepcopy

# Workflow
We want to parse 5 specific pages + every INCAR tag.

#### Option 1: Database exists
1. pull *only* the timestamps of the 5 pages + every INCAR tag.
2. compare those timestamps to the ones stored in the database
3. update list of pages to parse, only need those that have been updated
4. parse the list of pages

#### Option 2: Database doesn't exist
1. pull the default list of pages 


In [2]:
# Start from an empty database; the `if not database` check in a later cell
# then takes the "create" branch.
database = {}

In [3]:
# Debug switch: when enabled (and a database is already loaded), back-date some
# INCAR entries so that they look out of date.
fudge_dates = True
# Change the date on 20 random entries in the database to 1980
import random
if fudge_dates and database:
    incar_titles = list(database['INCAR'])
    # random.sample picks *distinct* titles (repeated random.choice could pick
    # the same entry twice and touch fewer than 20); the count is capped so
    # that a table with fewer than 20 entries (or none) does not raise.
    for title in random.sample(incar_titles, min(20, len(incar_titles))):
        database['INCAR'][title]["last_revised"] = "1980-01-01T00:00:00Z"
        #print(f"Changed ['INCAR']['{title}'] to 1980-01-01T00:00:00Z")

In [4]:
# Build the database on first use; otherwise hand the existing one to
# refresh_database and keep whatever it returns.
if database:
    logging.info("Database exists. Checking for updates.")
    database = refresh_database(database)
else:
    logging.info("Database doesn't exist. Creating it now.")
    database = generate_database()
    logging.info("Database created.")

INFO:root:Database doesn't exist. Creating it now.
INFO:root:Database created.


Made it to tidy_options! for PREC
Made it to tidy_options! for ICHARG
Made it to tidy_options! for ISTART
Made it to tidy_options! for INIWAV
Made it to tidy_options! for NSW
Made it to tidy_options! for NELM
Made it to tidy_options! for EDIFF
Made it to tidy_options! for ALGO
Made it to tidy_options! for LMAXMIX
Made it to tidy_options! for LVTOT
Made it to tidy_options! for NELMDL
Made it to tidy_options! for NELMIN
Made it to tidy_options! for IALGO
Made it to tidy_options! for NBANDS
Made it to tidy_options! for NELECT
Made it to tidy_options! for LREAL
Made it to tidy_options! for ROPT
Made it to tidy_options! for AEXX
Made it to tidy_options! for AGGAX
Made it to tidy_options! for AGGAC
Made it to tidy_options! for ALDAC
Made it to tidy_options! for LHFCALC
Made it to tidy_options! for TIME
Made it to tidy_options! for LMAXFOCK
Made it to tidy_options! for LMAXFOCKAE
Made it to tidy_options! for HFSCREEN
Made it to tidy_options! for EDIFFG
Made it to tidy_options! for GGA
Made it

# Converting to HTML

In [5]:
import mwcomposerfromhell as mwc
# Compose an HTML string from the value stored under 'info' for the ISMEAR tag.
# NOTE(review): the ISMEAR entry displayed further down in this notebook shows
# no 'info' key — confirm this lookup still matches the database schema.
wikicode = database['INCAR']['ISMEAR']['info']
html_string = mwc.compose(wikicode)
#print(html_string)

# Defaults, Options and Data Type

These can be obtained from `TAGDEF` and maybe `DEF` templates, depending on your luck...

The format is:

```
{{TAGDEF | TAGNAME | DATATYPE | DEFAULT}}
```

where DATATYPE is either:
```
1. integer
2. real
3. logical (also written as .TRUE. {{!}} .FALSE.)
4. (something) array (only seen real from a cursory look)
5. A list of options (e.g., 1 {{!}} 2-10 {{!}} [integer] > 0)
```

DEFAULT isn't always present. When it is, it may be a plain value, a string with some extra explanation, 'not set', or markup such as '<math>10^{-4}</math>' (including the HTML tags).

the DEF template has the form:
```
{{DEF | TAGNAME | VALUE | CONDITION | VALUE | CONDITION | ....}}
```
where CONDITION is a string that (usually) describes other tags, and VALUE is an integer, a string, or any other value.

lam

Example:
```
{{TAGDEF|PREC|Low {{!}} Medium {{!}} High {{!}} Normal {{!}} Single {{!}} Accurate}}
```

(no default, list of values)

```
{{DEF|ISTART|1|if a {{FILE|WAVECAR}} file exists|0|else}}
```


In [5]:
# Print a short report for every INCAR tag: its name, then the type (unless it
# is "Unknown"), the options and the default (each only when non-empty),
# followed by a separator line.
for tag, value in database['INCAR'].items():
    print(tag)# {tagdef_templates[tag]} <---> {def_templates[tag]}')
    for field in ("type", "options", "default"):
        field_value = value[field]
        if field == "type":
            wanted = field_value != "Unknown"
        else:
            wanted = bool(field_value)
        if wanted:
            print(f"*** {field}: {field_value}")
    print("-----------------------------------")

PREC
*** type: string
*** options: {'Low': '...[parsing not implemented]', 'Medium': '...[parsing not implemented]', 'High': '...[parsing not implemented]', 'Normal': '...[parsing not implemented]', 'Single': '...[parsing not implemented]', 'Accurate': '...[parsing not implemented]'}
*** default: Medium (for VASP.4.X) or Normal (for VASP.5.X)
-----------------------------------
ICHARG
*** type: integer
*** options: {'0': '...[parsing not implemented]', '1': '...[parsing not implemented]', '2': '...[parsing not implemented]', '4': '...[parsing not implemented]'}
*** default: 2 (if <ref>ISTART</ref>=0) or 0 (else)
-----------------------------------
ISTART
*** type: integer
*** options: {'0': '...[parsing not implemented]', '1': '...[parsing not implemented]', '2': '...[parsing not implemented]', '3': '...[parsing not implemented]'}
*** default: 1 (if a <ref>WAVECAR</ref> file exists) or 0 (else)
-----------------------------------
INIWAV
*** type: integer
*** options: {'0': '...[parsing

In [9]:
# Display the stored entry for the ISMEAR tag.
database['INCAR']['ISMEAR']

{'title': 'ISMEAR',
 'type': 'string',
 'dimension': 1,
 'options': {'-5': '...[parsing not implemented]',
  '-4': '...[parsing not implemented]',
  '-3': '...[parsing not implemented]',
  '-2': '...[parsing not implemented]',
  '-1': '...[parsing not implemented]',
  '0': '...[parsing not implemented]',
  '[integer]>0': '...[parsing not implemented]'},
 'default': '1',
 'html': None,
 'id': 183,
 'last_revised': '2023-04-28T07:06:26Z'}

# Old Debugging Crap

In [27]:
import mwparserfromhell as mwp
from mwcomposerfromhell import (
    ArticleResolver,
    compose,
    Namespace,
    WikicodeToHtmlComposer,
)

def _get_composer(templates):
    """Build a WikicodeToHtmlComposer whose resolver knows the given templates.

    Args:
        templates: Value handed to ``Namespace`` and registered under the
            "Template" namespace of a fresh ``ArticleResolver``.

    Returns:
        A ``WikicodeToHtmlComposer`` constructed with that resolver.
    """
    template_resolver = ArticleResolver()
    template_namespace = Namespace(templates)
    template_resolver.add_namespace("Template", template_namespace)
    return WikicodeToHtmlComposer(resolver=template_resolver)

#info = database['INCAR']['ISMEAR']['info']
#templates = {"TAG": mwp.parse('<tt>{{{1}}}</tt>'), "FILE": mwp.parse('<tt>{{{1}}}</tt>')}

#composer = _get_composer(templates)
#print(composer.compose(info))
#print('----------')

def get_html(title):
    """Fetch rendered page HTML through the Action API (``action=parse``).

    One GET request is sent to ``API_URL`` per title, and the
    ``['parse']['text']`` field of each JSON reply is collected.

    Args:
        title: A single page title, or a list of page titles.

    Returns:
        A list with one entry per requested title, in request order.
    """
    titles = title if isinstance(title, list) else [title]

    def fetch(page_title):
        # One request per page; only the rendered text of the reply is kept.
        response = requests.get(
            API_URL,
            headers={"User-Agent": USER_AGENT},
            params={
                "page": page_title,
                "action": "parse",
                "format": "json",
                "formatversion": "2",
            },
        )#.json()
        return response.json()['parse']['text']

    return [fetch(page_title) for page_title in titles]

# Not available on VASP wiki :(
def get_html_rest(title):
    """Request page HTML from the REST API route ``page/<title>/html``.

    The raw response object is returned (not its text) so that the reply can
    be inspected by the caller.

    Args:
        title: A single page title, or a list of page titles. Every title is
            requested, in order.

    Returns:
        The response of the *last* request made, or ``None`` when ``title`` is
        an empty list (this case used to fail with ``UnboundLocalError``).
    """
    from urllib.parse import quote

    titles = title if isinstance(title, list) else [title]
    if not titles:
        # Nothing to request.
        return None

    headers = {"User-Agent": USER_AGENT}
    req = None
    for t in titles:
        # Percent-encode the title so that characters such as "/", "?" or "#"
        # stay inside the title path segment instead of altering the URL.
        url = REST_API_URL + "page/" + quote(t, safe="") + "/html"
        req = requests.get(url, headers=headers)

    return req

def get_html_render(title):
    """Fetch page HTML from ``WIKI_URL`` using ``action=render``.

    Args:
        title: A single page title, or a list of page titles.

    Returns:
        A list with the response text of each request, in request order.
    """
    titles = title if isinstance(title, list) else [title]
    # One GET per title; only the body text of each response is kept.
    return [
        requests.get(
            WIKI_URL,
            headers={"User-Agent": USER_AGENT},
            params={"title": page_title, "action": "render"},
        ).text
        for page_title in titles
    ]

def get_html_parasoid(text):
    """Convert wikitext to HTML by POSTing it to Wikipedia's transform endpoint.

    A leftover ``return req`` inside the loop used to hand back the raw
    response of the first item only and left the ``pages`` list unused; every
    item is now processed and the HTML text of each reply is collected.

    Args:
        text: A single wikitext string, or a list of wikitext strings.

    Returns:
        A list with the response text for each input, in input order.
    """
    PARSOID_URL = "https://en.wikipedia.org/api/rest_v1/transform/wikitext/to/html/"
    headers = {"User-Agent": USER_AGENT}

    wikitexts = text if isinstance(text, list) else [text]
    pages = []
    for t in wikitexts:
        data = {
            "wikitext": t,
        }
        req = requests.post(PARSOID_URL, headers=headers, data=data)#, timeout=(None, 3))
        pages.append(req.text)

    return pages

def get_html_rvparse(title):
    """Fetch parsed revision content of pages via ``action=query&prop=revisions``.

    Lists longer than 50 titles are split into batches of 50 that are
    requested one after another; shorter lists are sent as a single
    ``|``-joined ``titles`` value.

    Args:
        title: A single page title, or a list of page titles.

    Returns:
        A list holding the ``content`` of the first revision of every page in
        the API response(s); an empty list when ``title`` is an empty list.

    Raises:
        ValueError: If a response does not contain ``["query"]["pages"]``.
    """
    if isinstance(title, list):
        if len(title) > 50:
            pages = []
            for start in range(0, len(title), 50):
                # Recurse on each batch (the original called an undefined `parse`).
                pages.extend(get_html_rvparse(title[start : start + 50]))
            return pages
        if not title:
            # No titles: skip the request instead of querying with an empty value.
            return []
        title = "|".join(title)

    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "titles": title,
        "format": "json",
        "formatversion": "2",
        "rvparse": "max",
    }
    #if get_text:
    #    params["rvslots"] = "main"
    headers = {"User-Agent": USER_AGENT}

    req = requests.get(API_URL, headers=headers, params=params).json()
    try:
        req_pages = req["query"]["pages"]
    except (KeyError, TypeError) as err:
        # Only a malformed response is translated; anything else propagates.
        raise ValueError(f"Something went wrong..., this is the response I got.\n. {req}") from err

    pages = []
    for page in req_pages:
        page_title = page["title"]
        if len(page["revisions"]) > 1:
            print(f"WARNING: More than one revision found for {page_title}. Using the first one.")
        pages.append(page["revisions"][0]["content"])

    return pages

#pages = get_html_render(["ICHARG", "SIGMA"])
#print(pages[1])
#with open("test.html", "w") as f:
#    f.write(pages[1])
#text = [database['ICHARG']['text'].string, database['SIGMA']['text'].string]
#pages = get_html_parsoid(text)
#with open("test.html", "w") as f:
#    f.write(pages[0])

# Top-level keys of the database, as a list.
files = list(database)
#n_pages = len(files) # 50
#files = files[:n_pages]
#text = [database[files[i]]['text'].string for i in range(n_pages)]
##pages = get_html(files)
#pages = get_html_parasoid(files)
#with open("test.html", "w") as f:
#    f.write(pages[0])
#req = get_html_rvparse(["ICHARG", "SIGMA"])

In [32]:
# Save the rendered ISMEAR page to disk for inspection.
# The result is bound to `ismear_html` rather than `html`, so the standard
# library `html` module imported in the first cell is not shadowed.
ismear_html = get_html_render(['ISMEAR'])[0]
# Write as UTF-8 explicitly: the platform default encoding may not be able to
# represent every character of the page.
with open('ismear.html', 'w', encoding='utf-8') as f:
    f.write(ismear_html)

# Benchmarking
Action API: 0.6s/page

Render: 0.6s/page

Rest API: 0.6s/page (can't actually get HTML so...)

Wikipedia Parsoid: ~0.3s/page but shitty formatting (sort of)

In [8]:
#req = get_html("INCAR")
#myjson = req.json()
# html_string = myjson['parse']['text']
#with open("test.html", "w") as f:
#    f.write(html_string)

# STRETCH GOAL
The formatting of these things is utter shit, so it might be really difficult to extract, for each option value, the text that explains it...

Look at the difference between ISMEAR and ICHARG. They're in lists, with possible indented list subitems, descriptions on same line or not, typos meaning there's no '=' after the value....

Focus on the HTML webpage for now

In [9]:
# Scratch attempt at collecting the option entries of a single tag from the
# list items of its page text.
# NOTE(review): the recorded output of this cell is `KeyError: 'ICHARG'`; other
# cells index the database as database['INCAR'][<tag>], so this lookup (and the
# "text" key) looks stale — confirm against the current database schema.
tag = "ICHARG"
text = database[tag]["text"]
sections = text.get_sections()
options = []
#for section in sections:
#    if section.title is not None:
#        # Strip title of whitespace (leading and trailing)
#        if section.title.strip() == "Tag options":
#            lists = section.get_lists()
# Prefix used to pick out list items: only items starting with it are kept.
wikilink="*{{TAG|"+tag+"}}"
lists = text.get_lists()
for lll in lists:
    for item in lll.fullitems:
        if item.startswith(wikilink):
            # Keep what follows the prefix, trimmed of surrounding whitespace.
            options.append(item.split(wikilink)[1].strip())

print(options)
#options_dict = {}
#for option in options:
#    setting = wtp.parse(option.split(":")[0]).plain_text()
#    explanation = wtp.parse(option.split(":")[1]).plain_text()
#    options_dict.update({setting: explanation})

#options_dict

KeyError: 'ICHARG'

# PAW stuff

In [None]:
# Collect the bold entries of the "Available PAW potentials" page that are not
# numbers; those are reported as the recommended pseudopotentials.
tag = "Available PAW potentials"
text = database[tag]["text"]
# Only the second and third sections of the page are scanned.
sections = text.get_sections()[1:3]
bolds = []
for section in sections:
    print(section.title)
    for b in section.get_bolds():
        label = b.plain_text()
        try:
            float(label)
        except ValueError:
            # Not a number, so treat the bold text as a pseudopotential name.
            # (Narrowed from a bare `except:` so that unrelated errors and
            # KeyboardInterrupt are no longer swallowed.)
            print("Found recommended pseudo: " + label)
            # TODO: doesn't work for the GW ones....
            bolds.append(label)

#bolds

 Recommended potentials for DFT calculations 
Found recommended pseudo: Sr_sv
Found recommended pseudo: Y_sv
Found recommended pseudo: Zr_sv
Found recommended pseudo: Nb_sv
Found recommended pseudo: Mo_sv
Found recommended pseudo: Tc_pv
Found recommended pseudo: Ru_pv
Found recommended pseudo: Rh_pv
Found recommended pseudo: Pd
Found recommended pseudo: Ag
Found recommended pseudo: Cd
Found recommended pseudo: In_d
Found recommended pseudo: Sn_d
Found recommended pseudo: Sb
Found recommended pseudo: Te
Found recommended pseudo: I
Found recommended pseudo: Xe
Found recommended pseudo: Cs_sv
Found recommended pseudo: Ba_sv
Found recommended pseudo: La
Found recommended pseudo: Ce
Found recommended pseudo: Pr_3
Found recommended pseudo: Nd_3
Found recommended pseudo: Pm_3
Found recommended pseudo: Sm_3
Found recommended pseudo: Eu_2
Found recommended pseudo: Gd_3
Found recommended pseudo: Tb_3
Found recommended pseudo: Dy_3
Found recommended pseudo: Ho_3
Found recommended pseudo: Er_3
Fou