# In [None]:
import functools
import time
from pathlib import Path
from typing import List, Tuple

import pycountry
import requests

# Per-language MediaWiki API endpoint template; query_api fills in {lang}.
API_URL      = "https://{lang}.wikipedia.org/w/api.php"
# Wikidata API endpoint; query_api uses it instead of API_URL when wd=True.
WD_API       = "https://www.wikidata.org/w/api.php"
# HTTP headers sent with every request issued by query_api.
HEADERS      = {"User-Agent": "MyWikiScraper/1.0"}
# Seconds slept between continuation requests in get_category_members.
# NOTE(review): 1e-5 s is effectively no delay -- confirm this is intended.
SLEEP        = 1e-5
# scrape_category only descends into subcategories while depth < MAX_DEPTH.
MAX_DEPTH    = 4
# Root directory for all output; joining with "" leaves it equal to the
# current working directory.
OUTPUT_ROOT  = Path.cwd() / ""

# Language code and category name (without the namespace prefix) that main()
# resolves to a Wikidata item.
# NOTE(review): MAIN_CAT is empty here -- presumably it must be filled in
# before running; confirm.
MAIN_LANG    = "en"
MAIN_CAT     = ""

# (language, title) pairs whose extract has already been fetched; shared by
# every scrape_category call so the same article is not downloaded twice.
visited = set()

def sanitize(name: str) -> str:
    """Return *name* with every disallowed character replaced by ``_``.

    A character is kept when it is alphanumeric (per ``str.isalnum``) or is
    one of: space, ``_``, ``-``, ``（``, ``）``, ``,``, ``.``. The result has
    the same length as the input.
    """
    keep = " _-（）,."
    cleaned = []
    for ch in name:
        if ch.isalnum() or ch in keep:
            cleaned.append(ch)
        else:
            cleaned.append("_")
    return "".join(cleaned)

def query_api(lang: str, params: dict, wd: bool=False, timeout: float=30.0) -> dict:
    """Send a GET request to a MediaWiki API endpoint and return the JSON body.

    Args:
        lang: Language code substituted into ``API_URL``; ignored when
            ``wd`` is true.
        params: Query-string parameters for the API call.
        wd: When true, query the Wikidata endpoint (``WD_API``) instead of
            the per-language Wikipedia endpoint.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = WD_API if wd else API_URL.format(lang=lang)
    # Without an explicit timeout requests waits forever on a stalled
    # connection, which would hang the whole crawl.
    resp = requests.get(url, params=params, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    return resp.json()

@functools.lru_cache(maxsize=None)
def get_category_prefix(lang: str) -> str:
    """Return the category-namespace name used by the ``lang`` wiki.

    Queries ``meta=siteinfo`` / ``siprop=namespaces`` and inspects the entry
    for namespace 14. An ``alias`` value (or a matching entry of an
    ``aliases`` list) is preferred when present; otherwise the entry's ``*``
    value is returned, defaulting to ``"Category"``.

    The result is memoized per language: this function is called once for
    every category listing, and the answer does not change during a run, so
    caching avoids one extra HTTP round-trip per call.

    NOTE(review): the lookup of ``alias``/``aliases`` inside the namespace
    entry is kept as written -- confirm the API actually returns those keys
    there.
    """
    data = query_api(lang, {
        "action": "query", "format": "json",
        "meta": "siteinfo", "siprop": "namespaces",
    })
    ns14 = data["query"]["namespaces"]["14"]
    alias = ns14.get("alias", "")
    if not alias:
        # Fall back to the first alias entry that belongs to namespace 14.
        for entry in ns14.get("aliases", []):
            if entry.get("ns") == 14:
                alias = entry.get("*", "")
                break
    return alias or ns14.get("*", "Category")

def get_category_members(lang: str, category: str, cmtype="page") -> List[str]:
    """List the titles of all members of a category on the ``lang`` wiki.

    Args:
        lang: Language code of the wiki to query.
        category: Category name without the namespace prefix.
        cmtype: Value passed as ``cmtype`` to the API. When it equals
            ``"subcat"`` the namespace prefix is stripped from each returned
            title; otherwise titles are returned as received.

    Returns:
        Titles from every result page, in the order the API returned them.
    """
    ns_prefix = get_category_prefix(lang) + ":"
    request = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": ns_prefix + category,
        "cmtype": cmtype,
        "cmlimit": "max",
    }
    strip_prefix = cmtype == "subcat"
    collected = []
    while True:
        reply = query_api(lang, request)
        titles = [entry["title"] for entry in reply["query"]["categorymembers"]]
        if strip_prefix:
            titles = [t.split(ns_prefix, 1)[-1] for t in titles]
        collected.extend(titles)
        if "continue" not in reply:
            return collected
        # Merge the continuation tokens into the request for the next page.
        request.update(reply["continue"])
        time.sleep(SLEEP)

def get_wikidata_sitelinks(qid: str) -> dict:
    """Map wiki language codes to the linked page title for a Wikidata item.

    Fetches the ``sitelinks`` of entity ``qid`` and keeps only site ids that
    end in ``"wiki"``. The ``"wiki"`` suffix is removed to obtain the code,
    and everything up to the first ``":"`` in the title (the namespace
    prefix, for category pages) is dropped.

    Site ids use underscores where the corresponding subdomains use hyphens
    (e.g. ``zh_min_nanwiki`` -> ``zh-min-nan``), so underscores are
    converted to hyphens; the resulting code can then be substituted into
    ``API_URL`` and matches the codes returned by ``prop=langlinks``.

    NOTE(review): ids such as ``commonswiki`` also end in ``"wiki"`` and are
    therefore returned as if they were languages -- confirm whether they
    should be filtered out.

    Args:
        qid: Wikidata entity id, e.g. ``"Q42"``.

    Returns:
        ``{language_code: title_without_prefix}``.
    """
    data = query_api("", {
        "action": "wbgetentities", "ids": qid,
        "props": "sitelinks", "format": "json",
    }, wd=True)
    sitelinks = data["entities"][qid]["sitelinks"]
    out = {}
    for site, info in sitelinks.items():
        if not site.endswith("wiki"):
            continue
        # Strip the "wiki" suffix and convert the id to subdomain form.
        lc = site[:-4].replace("_", "-")
        title = info["title"]
        _, sep, local = title.partition(":")
        out[lc] = local if sep else title
    return out

def fetch_extract(lang: str, title: str) -> str:
    """Return the plain-text extract of ``title`` on the ``lang`` wiki.

    Uses ``prop=extracts`` with ``explaintext`` and reads the first page of
    the response. Returns ``""`` when that page carries no ``extract`` key.
    """
    request = {
        "action": "query",
        "format": "json",
        "prop": "extracts",
        "explaintext": True,
        "titles": title,
    }
    pages = query_api(lang, request)["query"]["pages"]
    first_page = next(iter(pages.values()))
    return first_page.get("extract", "")

def get_category_qid(lang: str, category: str) -> str:
    """Return the Wikidata item id linked to a category page.

    Args:
        lang: Language code of the wiki hosting the category.
        category: Category name without the namespace prefix.

    Raises:
        KeyError: If the first page of the response has no ``pageprops`` or
            no ``wikibase_item`` entry.
    """
    full_title = f"{get_category_prefix(lang)}:{category}"
    reply = query_api(lang, {
        "action": "query",
        "format": "json",
        "prop": "pageprops",
        "ppprop": "wikibase_item",
        "titles": full_title,
    })
    first_page = next(iter(reply["query"]["pages"].values()))
    return first_page["pageprops"]["wikibase_item"]

def scrape_category(lang: str, category: str, out_folder: Path, depth=1):
    """Download every article of a category, in all linked languages.

    For each page in ``category`` a sub-directory of ``out_folder`` is
    created, the page's language links are looked up, and the plain-text
    extract of each language variant is written to ``<lang>.txt`` inside that
    directory. Variants already recorded in ``visited`` are skipped. While
    ``depth`` is below ``MAX_DEPTH`` the function then recurses into each
    subcategory, using a nested folder named after the subcategory.

    Args:
        lang: Language code of the wiki that hosts ``category``.
        category: Category name without the namespace prefix.
        out_folder: Directory receiving this category's output.
        depth: Current recursion depth; also controls log indentation.
    """
    indent = "  " * (depth - 1)
    print(f"{indent}Scraping [{lang}] Category: {category} (depth {depth})")
    out_folder.mkdir(parents=True, exist_ok=True)

    for title in get_category_members(lang, category, cmtype="page"):
        page_dir = out_folder / sanitize(title)
        page_dir.mkdir(exist_ok=True)

        reply = query_api(lang, {
            "action": "query",
            "format": "json",
            "prop": "langlinks",
            "titles": title,
            "lllimit": "max",
        })
        first_page = next(iter(reply["query"]["pages"].values()))

        # The page itself comes first, followed by its other-language versions.
        variants = [(lang, title)]
        for link in first_page.get("langlinks", []):
            variants.append((link["lang"], link["*"]))

        for variant in variants:
            if variant in visited:
                continue
            visited.add(variant)
            lang2, loc_title = variant
            text = fetch_extract(lang2, loc_title)
            target = page_dir / f"{sanitize(lang2)}.txt"
            target.write_text(text, encoding="utf-8")
            print(f"{indent}  ✔ {lang2}: {loc_title}")

    if depth >= MAX_DEPTH:
        return
    for sub in get_category_members(lang, category, cmtype="subcat"):
        scrape_category(lang, sub, out_folder / sanitize(sub), depth + 1)


def main():
    """Scrape the subcategories of the starting category in every other language.

    Resolves ``MAIN_CAT`` on the ``MAIN_LANG`` wiki to its Wikidata item,
    collects the equivalent category on each linked wiki, drops the
    ``MAIN_LANG`` entry, and calls ``scrape_category`` for every subcategory
    of each remaining root category. Output goes under
    ``OUTPUT_ROOT/<lang>_<category>/<subcategory>``.
    """
    OUTPUT_ROOT.mkdir(exist_ok=True)

    root_qid = get_category_qid(MAIN_LANG, MAIN_CAT)
    roots_by_lang = get_wikidata_sitelinks(root_qid)
    # The main language's own category is removed before iterating.
    roots_by_lang.pop(MAIN_LANG, None)

    for lc, root_cat in roots_by_lang.items():
        print(f"\n=== Processing [{lc}] Category: {root_cat} ===")
        lang_dir = OUTPUT_ROOT / f"{lc}_{sanitize(root_cat)}"
        lang_dir.mkdir(parents=True, exist_ok=True)
        subcats = get_category_members(lc, root_cat, cmtype="subcat")
        for sub in subcats:
            scrape_category(lc, sub, lang_dir / sanitize(sub), depth=1)

    print("\nAll done.")


if __name__ == "__main__":
    main()