In [1]:
import random

import requests
import wikitextparser as wtp

In [2]:
# Endpoint that every request in this notebook is sent to (the VASP wiki's api.php).
API_URL = "https://www.vasp.at/wiki/api.php"
# Alternative endpoint (English Wikipedia), left disabled.
# API_URL = "https://en.wikipedia.org/w/api.php"


# Query sent by get_category:
# action=query&generator=categorymembers&gcmtitle=<category>&prop=revisions&gcmlimit=max
def get_category(category, gcmcontinue=None):
    """Fetch one batch of the pages that belong to a wiki category.

    Parameters
    ----------
    category : str
        Full category title, e.g. ``"Category:INCAR tag"``.
    gcmcontinue : str, optional
        Continuation token returned by a previous call. When given it is sent
        back to the API so the listing resumes where the last batch stopped.

    Returns
    -------
    pages : list of dict
        One dict per page with the keys ``pageid``, ``title``,
        ``last_revised`` (timestamp of the revision the API returned) and
        ``text`` (always ``None``; this function never downloads content).
        Empty when the response contains no pages.
    gcmcontinue : str or None
        Token to pass to the next call, or ``None`` when the response does not
        announce a further batch.

    Raises
    ------
    ValueError
        If the API response carries an ``"error"`` entry.
    """
    params = {
        "action": "query",
        "generator": "categorymembers",
        "prop": "revisions",
        "gcmlimit": "max",
        "gcmtitle": category,
        "format": "json",
        "formatversion": "2",
    }
    if gcmcontinue:
        params["gcmcontinue"] = gcmcontinue

    # TODO: parse version from setup.py or something?
    headers = {"User-Agent": "dft-tutor/0.0.0"}
    req = requests.get(API_URL, headers=headers, params=params).json()

    # Report API-level failures explicitly instead of failing later with an
    # opaque KeyError on "query".
    if "error" in req:
        raise ValueError(
            f"The wiki API returned an error for category {category!r}: {req['error']}"
        )

    # Figure out if we need to fetch more batches.
    next_token = req.get("continue", {}).get("gcmcontinue")

    # A response without "query" (e.g. an empty category) simply yields no pages.
    pages = [
        {
            "pageid": page["pageid"],
            "title": page["title"],
            "last_revised": page["revisions"][0]["timestamp"],
            "text": None,
        }
        for page in req.get("query", {}).get("pages", [])
    ]

    return pages, next_token

def pull_incar_tags(parse_text=True):
    """Collect every page in the wiki category ``Category:INCAR tag``.

    Calls ``get_category`` repeatedly, feeding each returned continuation
    token into the next call, until no token is returned.

    Parameters
    ----------
    parse_text : bool, default True
        When true, the collected titles are passed to ``parse`` and its result
        (pages including parsed text) is returned. When false, the entries
        produced by ``get_category`` are returned as they are.

    Returns
    -------
    list of dict
        Page dicts, without the entry titled ``"Construction:LKPOINTS WAN"``.
    """
    category = "Category:INCAR tag"
    pages, gcmcontinue = get_category(category)
    while gcmcontinue is not None:
        # Hand the token from the previous batch back to the API.
        cont_pages, gcmcontinue = get_category(category, gcmcontinue=gcmcontinue)
        pages.extend(cont_pages)

    # This entry breaks the wiki...
    bad_entry = "Construction:LKPOINTS WAN"
    pages = [page for page in pages if page["title"] != bad_entry]

    if parse_text:
        pages = parse([page["title"] for page in pages])
    return pages


def parse(title, get_text=True):
    """Fetch the latest revision of one or more wiki pages.

    Parameters
    ----------
    title : str or list of str
        A page title, or a list of titles. A list is joined with ``"|"`` into
        a single request; lists longer than 50 titles are split into chunks of
        50 with one request per chunk (presumably the API's per-request title
        limit; confirm). An empty list returns ``[]`` without any request.
    get_text : bool, default True
        When true, the page content is requested as well and parsed with
        ``wtp.parse``. When false, only the revision timestamp is requested.

    Returns
    -------
    list of dict
        One dict per page with the keys ``pageid``, ``title``,
        ``last_revised`` and ``text`` (the ``wtp.parse`` result, or ``None``
        when ``get_text`` is false).

    Raises
    ------
    ValueError
        If the response does not contain ``["query"]["pages"]``.
    """
    chunk_size = 50

    if isinstance(title, list):
        if not title:
            # Nothing to look up; avoid a request that can only fail.
            return []
        if len(title) > chunk_size:
            pages = []
            for start in range(0, len(title), chunk_size):
                pages.extend(parse(title[start : start + chunk_size], get_text=get_text))
            return pages
        title = "|".join(title)

    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "timestamp|content" if get_text else "timestamp",
        "titles": title,
        "format": "json",
        "formatversion": "2",
    }
    if get_text:
        params["rvslots"] = "main"
    headers = {"User-Agent": "dft-tutor/0.0.0"}

    req = requests.get(API_URL, headers=headers, params=params).json()
    try:
        req_pages = req["query"]["pages"]
    except (KeyError, TypeError) as err:
        # Only a malformed/unexpected payload is translated; unrelated
        # exceptions (e.g. KeyboardInterrupt) propagate untouched.
        raise ValueError(f"Something went wrong..., this is the response I got.\n. {req}") from err

    pages = []
    for page in req_pages:
        # NOTE(review): presumably a title that does not exist on the wiki comes
        # back without "pageid"/"revisions" and raises KeyError here; confirm.
        rev = page["revisions"][0]
        text = wtp.parse(rev["slots"]["main"]["content"]) if get_text else None
        pages.append(
            {
                "pageid": page["pageid"],
                "title": page["title"],
                "last_revised": rev["timestamp"],
                "text": text,
            }
        )

    return pages



# Workflow
We want to parse 5 specific pages + every INCAR tag.

#### Option 1: Database exists
1. pull *only* the time stamp of the 5 pages + every INCAR tag.
2. compare time stamps to those in the database
3. update list of pages to parse, only need those that have been updated
4. parse the list of pages

#### Option 2: Database doesn't exist
1. pull and parse the default list of pages (the 5 specific pages + every INCAR tag), including their text


In [3]:
# In-memory store mapping page title -> page dict. It starts empty, so the
# `if not database` check further down takes its "create" branch.
database = {}

In [4]:
# Debug switch: when enabled (and the database is non-empty), back-date some
# entries so the "check for updates" path has something to refresh.
fudge_dates = True
# Change the date on (up to) 20 distinct random entries in the database to 1980
if fudge_dates and database:
    # random.sample draws without replacement, so the same entry is never
    # picked twice; min() keeps it valid for databases with fewer than 20 pages.
    for title in random.sample(list(database), min(20, len(database))):
        database[title]["last_revised"] = "1980-01-01T00:00:00Z"
        print(f"Changed {title} to 1980-01-01T00:00:00Z")

In [5]:
# Pages that are tracked unconditionally, in addition to every INCAR tag page.
page_titles = [
    "available PAW potentials",
    "POTCAR",
    "KPOINTS",
    "INCAR",
    "POSCAR",
]
if database:
    print("Database exists. Checking for updates.")
    # Cheap pass: timestamps only, no page content.
    pages = parse(page_titles, get_text=False)
    pages.extend(pull_incar_tags(parse_text=False))
    date_only_database = {page["title"]: page for page in pages}

    # A page needs (re)parsing when it is new or its timestamp has changed.
    pages_to_update = []
    for title, page in date_only_database.items():
        known = database.get(title) if title in database else None
        if known is None or known["last_revised"] != page["last_revised"]:
            pages_to_update.append(title)

    if not pages_to_update:
        print("Database is up to date.")
    else:
        print("Updating the following pages:" + ", ".join(pages_to_update))
        updated_pages = parse(pages_to_update, get_text=True)
        for page in updated_pages:
            database[page["title"]] = page
else:
    print("Database doesn't exist. Creating it now.")
    # Full pass: download and parse the text of every tracked page.
    pages = parse(page_titles, get_text=True)
    pages.extend(pull_incar_tags(parse_text=True))
    database = {page["title"]: page for page in pages}
    print("Database creared.")

# TODO: put underscores back in keys

Database doesn't exist. Creating it now.
Database creared.


# STRETCH GOAL
The formatting of these things is utter shit, so it might be really difficult to do the "comment to explain each value" thing...

Look at the difference between ISMEAR and ICHARG. They're in lists, with possible indented list subitems, descriptions on same line or not, typos meaning there's no '=' after the value....

Focus on the HTML webpage for now

In [6]:
# Collect the option values listed for one INCAR tag on its wiki page.
tag = "ICHARG"
text = database[tag]["text"]
sections = text.get_sections()
options = []
#for section in sections:
#    if section.title is not None:
#        # Strip title of whitespace (leading and trailing)
#        if section.title.strip() == "Tag options":
#            lists = section.get_lists()
# List items describing an option start with this bullet + TAG template.
wikilink="*{{TAG|"+tag+"}}"
lists = text.get_lists()
# The loop variable must not be called `list`: at notebook scope that would
# replace the builtin for every cell run afterwards (e.g. `list(database.keys())`).
for wiki_list in lists:
    for item in wiki_list.fullitems:
        if item.startswith(wikilink):
            # Keep what follows the TAG template, e.g. "=0".
            options.append(item.split(wikilink)[1].strip())

print(options)
#options_dict = {}
#for option in options:
#    setting = wtp.parse(option.split(":")[0]).plain_text()
#    explanation = wtp.parse(option.split(":")[1]).plain_text()
#    options_dict.update({setting: explanation})

#options_dict

['=0', '=1', '=2', '=4', '+10']


In [73]:
# Collect the bold, non-numeric entries from two sections of the PAW
# potentials page; these are reported as the recommended pseudopotentials.
tag = "Available PAW potentials"
text = database[tag]["text"]
sections = text.get_sections()[1:3]
bolds = []
for section in sections:
    print(section.title)
    for b in section.get_bolds():
        try:
            # Bold text that parses as a number is skipped.
            float(b.plain_text())
        except ValueError:
            # Catch only the failed number conversion, so interrupts and
            # genuine errors are not silently swallowed.
            print("Found recommended pseudo: " + b.plain_text())
            # TODO: doesn't work for the GW ones....
            bolds.append(b.plain_text())

#bolds

 Recommended potentials for DFT calculations 
Found recommended pseudo: Sr_sv
Found recommended pseudo: Y_sv
Found recommended pseudo: Zr_sv
Found recommended pseudo: Nb_sv
Found recommended pseudo: Mo_sv
Found recommended pseudo: Tc_pv
Found recommended pseudo: Ru_pv
Found recommended pseudo: Rh_pv
Found recommended pseudo: Pd
Found recommended pseudo: Ag
Found recommended pseudo: Cd
Found recommended pseudo: In_d
Found recommended pseudo: Sn_d
Found recommended pseudo: Sb
Found recommended pseudo: Te
Found recommended pseudo: I
Found recommended pseudo: Xe
Found recommended pseudo: Cs_sv
Found recommended pseudo: Ba_sv
Found recommended pseudo: La
Found recommended pseudo: Ce
Found recommended pseudo: Pr_3
Found recommended pseudo: Nd_3
Found recommended pseudo: Pm_3
Found recommended pseudo: Sm_3
Found recommended pseudo: Eu_2
Found recommended pseudo: Gd_3
Found recommended pseudo: Tb_3
Found recommended pseudo: Dy_3
Found recommended pseudo: Ho_3
Found recommended pseudo: Er_3
Fou