In [19]:
#import wikitextparser as wtp
import requests

from codex.vasp_database import pull_incar_tags, parse
# Value sent as the User-Agent header by the request helpers in this notebook.
USER_AGENT = "dft-codex/0.0.0 (Omar A. Ashour, ashour@berkeley.edu)"
# Action API endpoint of the VASP wiki (used below with action=parse and action=query).
API_URL = "https://www.vasp.at/wiki/api.php"
# index.php entry point of the VASP wiki (used below with action=render).
WIKI_URL = "https://www.vasp.at/wiki/index.php"
# Base URL of the wiki's REST API; get_html_rest appends "page/<title>/html" to it.
REST_API_URL = "https://www.vasp.at/wiki/rest.php/v1/"

# Workflow
We want to parse 5 specific pages + every INCAR tag.

#### Option 1: Database exists
1. Pull *only* the timestamps of the 5 pages + every INCAR tag.
2. Compare the timestamps to those stored in the database.
3. Build the list of pages to parse; only pages that are new or have been revised need to be included.
4. Parse the pages in that list and store them in the database.

#### Option 2: Database doesn't exist
1. Pull and parse the full text of the default list of pages (the 5 pages + every INCAR tag) and build the database from them.


In [20]:
# Start with an empty in-memory database, keyed by page title once populated.
# An empty dict is falsy, so the `if not database:` check further down takes
# the "create" branch.
database = {}

In [21]:
fudge_dates = True
# Testing aid: backdate up to 20 *distinct*, randomly chosen database entries
# to 1980 so that their "last_revised" value no longer matches the wiki's.
import random
if fudge_dates and database:
    # random.sample draws without replacement, so no title is picked twice;
    # the count is capped so databases with fewer than 20 entries still work.
    n_fudged = min(20, len(database))
    for title in random.sample(list(database), n_fudged):
        database[title]["last_revised"] = "1980-01-01T00:00:00Z"
        print(f"Changed {title} to 1980-01-01T00:00:00Z")

In [22]:
# Pages that are always parsed, in addition to every INCAR tag.
page_titles = [
    "available PAW potentials",
    "POTCAR",
    "KPOINTS",
    "INCAR",
    "POSCAR",
]

if database:
    # Incremental path: fetch the pages without their text, then re-parse only
    # the pages that are new or whose "last_revised" value differs.
    print("Database exists. Checking for updates.")
    pages = parse(page_titles, get_text=False)
    pages.extend(pull_incar_tags(parse_text=False))
    date_only_database = {page["title"]: page for page in pages}
    pages_to_update = [
        title
        for title, page in date_only_database.items()
        if title not in database
        or page["last_revised"] != database[title]["last_revised"]
    ]
    if not pages_to_update:
        print("Database is up to date.")
    else:
        print("Updating the following pages:" + ", ".join(pages_to_update))
        updated_pages = parse(pages_to_update, get_text=True)
        database.update({page["title"]: page for page in updated_pages})
else:
    # Cold start: parse everything with its text and key the result by title.
    print("Database doesn't exist. Creating it now.")
    pages = parse(page_titles, get_text=True)
    pages.extend(pull_incar_tags(parse_text=True))
    database = {page["title"]: page for page in pages}
    print("Database created.")

Database doesn't exist. Creating it now.
Database created.


In [93]:
def get_html(title):
    """Fetch rendered HTML for wiki pages through the Action API (action=parse).

    One GET request is sent to ``API_URL`` per title.

    Args:
        title: A single page title, or a list of page titles.

    Returns:
        A list holding the ``["parse"]["text"]`` field of each JSON response,
        in the order the titles were given.
    """
    titles = title if isinstance(title, list) else [title]
    headers = {"User-Agent": USER_AGENT}

    html_pages = []
    for page_title in titles:
        query = {
            "page": page_title,
            "action": "parse",
            "format": "json",
            "formatversion": "2",
        }
        response = requests.get(API_URL, headers=headers, params=query)
        html_pages.append(response.json()["parse"]["text"])
    return html_pages

# Not available on VASP wiki :(
def get_html_rest(title):
    """Request page HTML from the wiki's REST API (``page/<title>/html``).

    The notebook's own notes say this endpoint is not available on the VASP
    wiki, so the response is not expected to contain usable HTML.

    Args:
        title: A single page title, or a list of page titles. One GET request
            is sent per title.

    Returns:
        The ``requests`` response object of the *last* request only (earlier
        responses are discarded, as before), or ``None`` when ``title`` is an
        empty list.
    """
    # Local import keeps this helper self-contained.
    from urllib.parse import quote

    headers = {"User-Agent": USER_AGENT}
    titles = title if isinstance(title, list) else [title]

    # Pre-initialise so an empty list returns None instead of raising
    # UnboundLocalError on the return statement.
    req = None
    for t in titles:
        # The title is a single URL path segment: percent-encode it so that
        # characters such as "/", "?" or "#" cannot change the request path.
        url = REST_API_URL + "page/" + quote(t, safe="") + "/html"
        req = requests.get(url, headers=headers)

    return req

def get_html_render(title):
    """Fetch rendered HTML for wiki pages via ``index.php?action=render``.

    One GET request is sent to ``WIKI_URL`` per title.

    Args:
        title: A single page title, or a list of page titles.

    Returns:
        A list with the body text of each response, in request order.
    """
    titles = title if isinstance(title, list) else [title]
    return [
        requests.get(
            WIKI_URL,
            headers={"User-Agent": USER_AGENT},
            params={"title": page_title, "action": "render"},
        ).text
        for page_title in titles
    ]

def get_html_parasoid(text):
    """Convert wikitext to HTML using en.wikipedia.org's wikitext-to-HTML endpoint.

    Each wikitext string is POSTed separately as the ``wikitext`` form field.

    Args:
        text: A single wikitext string, or a list of wikitext strings.

    Returns:
        A list with the body text of each response, in request order.
    """
    PARSOID_URL = "https://en.wikipedia.org/api/rest_v1/transform/wikitext/to/html/"
    wikitexts = text if isinstance(text, list) else [text]

    rendered = []
    for wikitext in wikitexts:
        response = requests.post(
            PARSOID_URL,
            headers={"User-Agent": USER_AGENT},
            data={"wikitext": wikitext},
        )
        rendered.append(response.text)
    return rendered

def get_html_rvparse(title, raw=True):
    """Fetch parsed revision content for wiki pages via ``action=query``.

    The request asks for ``prop=revisions`` with ``rvprop=content`` and
    ``rvparse`` set, so each revision's ``content`` field holds HTML.

    Args:
        title: A single page title, or a list of titles. A list of at most 50
            titles is joined with ``|`` into one request; a longer list is
            split into batches of 50 that are requested one after another.
        raw: When True (the default, which keeps the behaviour the notebook
            relies on when it inspects ``req['query']``) the decoded JSON
            response is returned unchanged. When False only the HTML strings
            are returned.

    Returns:
        With ``raw=True``: the decoded JSON response (a dict) for a single
        request, or a list of such dicts when the titles were batched.
        With ``raw=False``: a flat list of HTML strings, one per page.

    Raises:
        ValueError: If the response has no ``["query"]["pages"]`` entry.
    """
    batch_size = 50  # presumably the API's per-request title limit -- TODO confirm

    if isinstance(title, list):
        if len(title) > batch_size:
            pages = []
            for start in range(0, len(title), batch_size):
                # Recurse into this function for each batch; the original
                # called the unrelated parse() helper here by mistake.
                batch = get_html_rvparse(title[start : start + batch_size], raw=raw)
                if raw:
                    pages.append(batch)
                else:
                    pages.extend(batch)
            return pages
        title = "|".join(title)

    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "titles": title,
        "format": "json",
        "formatversion": "2",
        "rvparse": "max",
    }
    headers = {"User-Agent": USER_AGENT}

    req = requests.get(API_URL, headers=headers, params=params).json()
    try:
        req_pages = req["query"]["pages"]
    except (KeyError, TypeError) as err:
        # Only a missing or ill-shaped payload is translated; other exceptions
        # (e.g. KeyboardInterrupt) are no longer swallowed by a bare except.
        raise ValueError(f"Something went wrong..., this is the response I got.\n. {req}") from err

    if raw:
        return req

    pages = []
    for page in req_pages:
        page_title = page["title"]
        if len(page["revisions"]) > 1:
            print(f"WARNING: More than one revision found for {page_title}. Using the first one.")
        rev = page["revisions"][0]
        pages.append(rev["content"])

    return pages

#pages = get_html_render(["ICHARG", "SIGMA"])
#print(pages[1])
#with open("test.html", "w") as f:
#    f.write(pages[1])
#text = [database['ICHARG']['text'].string, database['SIGMA']['text'].string]
#pages = get_html_parasoid(text)
#with open("test.html", "w") as f:
#    f.write(pages[0])

# Titles of every page currently held in the database.
files = list(database)
# Earlier benchmarking experiments, kept commented out for reference:
#n_pages = len(files) # 50
#files = files[:n_pages]
#text = [database[files[i]]['text'].string for i in range(n_pages)]
##pages = get_html(files)
#pages = get_html_parasoid(files)
#with open("test.html", "w") as f:
#    f.write(pages[0])

# Current experiment: query the parsed revisions of two INCAR tag pages.
req = get_html_rvparse(["ICHARG", "SIGMA"])

In [94]:
# Display the "query" part of the JSON response obtained from
# get_html_rvparse in the previous cell.
req['query']

{'pages': [{'pageid': 10,
   'ns': 0,
   'title': 'ICHARG',
   'revisions': [{'content': '<div class="mw-parser-output"><p><a class="mw-selflink selflink">ICHARG</a>&#160;= 0 | 1 | 2 | 4&#160;\n</p>\n<table>\n<tbody><tr>\n<td>Default: <b>ICHARG</b>\n</td>\n<td>= 2\n</td>\n<td>if <a href="/wiki/index.php/ISTART" title="ISTART">ISTART</a>=0\n</td></tr>\n<tr>\n<td>\n</td>\n<td>= 0\n</td>\n<td>else\n</td></tr>\n\n\n\n\n</tbody></table>\n<p>Description: <a class="mw-selflink selflink">ICHARG</a> determines how VASP constructs the <i>initial</i> charge density.\n</p>\n<hr />\n<ul><li><a class="mw-selflink selflink">ICHARG</a>=0</li></ul>\n<dl><dd>Calculate the charge density from initial wave functions.</dd>\n<dd>If <a href="/wiki/index.php/ISTART" title="ISTART">ISTART</a> is <i>internally reset</i> due to an invalid <a href="/wiki/index.php/WAVECAR" title="WAVECAR">WAVECAR</a> file, <a class="mw-selflink selflink">ICHARG</a> will be set to <a class="mw-selflink selflink">ICHARG</a>=2.</dd>

# Benchmarking
Action API: 0.6s/page

Render: 0.6s/page

REST API: 0.6s/page (can't actually get HTML from it, so this number is not useful)

Wikipedia Parsoid: ~0.3s/page, but the resulting formatting is poor (sort of)

In [24]:
#req = get_html("INCAR")
#myjson = req.json()
# html_string = myjson['parse']['text']
#with open("test.html", "w") as f:
#    f.write(html_string)

# STRETCH GOAL
The formatting of these pages is very inconsistent, so it might be really difficult to do the "comment explaining each value" feature...

Look at the difference between ISMEAR and ICHARG. They're in lists, with possible indented list subitems, descriptions on same line or not, typos meaning there's no '=' after the value....

Focus on the HTML webpage for now

In [25]:
tag = "ICHARG"
text = database[tag]["text"]
sections = text.get_sections()
# Earlier idea: restrict the search to the "Tag options" section.
#for section in sections:
#    if section.title is not None:
#        # Strip title of whitespace (leading and trailing)
#        if section.title.strip() == "Tag options":
#            lists = section.get_lists()

# List items describing an option start with a {{TAG|<tag>}} template; keep
# whatever follows that template (e.g. "=0") as the option string.
wikilink = "*{{TAG|" + tag + "}}"
lists = text.get_lists()
options = [
    entry.split(wikilink)[1].strip()
    for wiki_list in lists
    for entry in wiki_list.fullitems
    if entry.startswith(wikilink)
]

print(options)
# Not yet working: split each option into a setting and its explanation.
#options_dict = {}
#for option in options:
#    setting = wtp.parse(option.split(":")[0]).plain_text()
#    explanation = wtp.parse(option.split(":")[1]).plain_text()
#    options_dict.update({setting: explanation})

#options_dict

['=0', '=1', '=2', '=4', '+10']


In [26]:
tag = "Available PAW potentials"
text = database[tag]["text"]
# Only the second and third entries returned by get_sections() are scanned.
sections = text.get_sections()[1:3]
bolds = []
for section in sections:
    print(section.title)
    for b in section.get_bolds():
        label = b.plain_text()
        try:
            # Bold text that parses as a number is skipped.
            float(label)
        except ValueError:
            # Only a failed numeric conversion marks a potential name; a bare
            # except would also have hidden unrelated errors and interrupts.
            print("Found recommended pseudo: " + label)
            # TODO: doesn't work for the GW ones....
            bolds.append(label)

#bolds

 Recommended potentials for DFT calculations 
Found recommended pseudo: Sr_sv
Found recommended pseudo: Y_sv
Found recommended pseudo: Zr_sv
Found recommended pseudo: Nb_sv
Found recommended pseudo: Mo_sv
Found recommended pseudo: Tc_pv
Found recommended pseudo: Ru_pv
Found recommended pseudo: Rh_pv
Found recommended pseudo: Pd
Found recommended pseudo: Ag
Found recommended pseudo: Cd
Found recommended pseudo: In_d
Found recommended pseudo: Sn_d
Found recommended pseudo: Sb
Found recommended pseudo: Te
Found recommended pseudo: I
Found recommended pseudo: Xe
Found recommended pseudo: Cs_sv
Found recommended pseudo: Ba_sv
Found recommended pseudo: La
Found recommended pseudo: Ce
Found recommended pseudo: Pr_3
Found recommended pseudo: Nd_3
Found recommended pseudo: Pm_3
Found recommended pseudo: Sm_3
Found recommended pseudo: Eu_2
Found recommended pseudo: Gd_3
Found recommended pseudo: Tb_3
Found recommended pseudo: Dy_3
Found recommended pseudo: Ho_3
Found recommended pseudo: Er_3
Fou