In [42]:
# scrape_datasheets_from_faction_index.py
import os, json, time, re, sys, traceback, contextlib
from typing import List, Dict, Optional, Tuple

from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    StaleElementReferenceException,
    WebDriverException,
    ElementClickInterceptedException,
    ElementNotInteractableException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# URL prefix of a single datasheet page; main() appends a datasheet id to it.
BASE = "https://39k.pro/datasheet/"


# Section titles that parse_custom_rules() skips when collecting the
# remaining (custom) collapsible sections of a datacard.
EXPECTED_TITLES = [
    "Ranged Weapons",
    "Melee Weapons",
    "Abilities",
    "Leader",
    "Unit Composition",
    "Keywords",
    "Led By",
]

In [43]:

def wait_for(driver, selector, timeout=20):
    """Wait until an element matching the CSS ``selector`` is present.

    Returns whatever ``WebDriverWait.until`` yields for the presence
    condition, polling for at most ``timeout`` seconds.
    """
    waiter = WebDriverWait(driver, timeout)
    locator = (By.CSS_SELECTOR, selector)
    return waiter.until(EC.presence_of_element_located(locator))

# ---------- safe selenium wrappers ----------
def safe_text(el) -> str:
    """Return the stripped text of ``el``, or ``""`` if it cannot be read.

    The ``text`` attribute is tried first; when it is empty or reading it
    fails, the ``innerText`` attribute is used instead. Any exception from
    either lookup is swallowed, so a ``None`` element also yields ``""``.
    """
    with contextlib.suppress(Exception):
        visible = el.text
        if visible:
            return visible.strip()
    with contextlib.suppress(Exception):
        return (el.get_attribute("innerText") or "").strip()
    return ""

def find_one(parent, by, selector) -> Optional[object]:
    """Return ``parent.find_element(by, selector)``, or ``None`` on any error."""
    try:
        match = parent.find_element(by, selector)
    except Exception:
        # Missing element, stale parent, parent=None, ... all mean "not found".
        return None
    return match

def find_all(parent, by, selector) -> List[object]:
    """Return ``parent.find_elements(by, selector)``, or ``[]`` on any error."""
    try:
        matches = parent.find_elements(by, selector)
    except Exception:
        # Treat every lookup failure (including parent=None) as "no matches".
        return []
    return matches

def js_click(driver, el) -> bool:
    """Click ``el``, falling back to a JavaScript click when needed.

    The element is scrolled to the vertical centre of the viewport and
    clicked natively. If that raises one of the listed Selenium
    exceptions, ``arguments[0].click()`` is executed through the driver
    instead. Returns True when either click went through, False otherwise.
    """
    clicked = False
    try:
        driver.execute_script("arguments[0].scrollIntoView({block:'center'});", el)
        time.sleep(0.05)
        el.click()
        clicked = True
    except (ElementClickInterceptedException, ElementNotInteractableException, WebDriverException):
        # Native click was blocked; try a scripted click and ignore its errors.
        try:
            driver.execute_script("arguments[0].click();", el)
            clicked = True
        except Exception:
            pass
    except Exception:
        clicked = False
    return clicked

def expand_all_collapsibles(driver, card):
    """Click every direct child ``<div>`` of the datacard to expand its sections."""
    # "./div" restricts the lookup to immediate children of the card element.
    for section in card.find_elements(By.XPATH, "./div"):
        js_click(driver, section)

In [44]:

# ---------- path setup ----------
# Anchor all paths to the directory holding this script. When __file__ is not
# defined (NameError), fall back to the current working directory.
try:
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    SCRIPT_DIR = os.getcwd()

# Input index read by main(), and the root folder main() writes JSON files under.
FACTIONS_PATH = os.path.join(SCRIPT_DIR, "factions.json")
OUTPUT_ROOT   = os.path.join(SCRIPT_DIR, "datasheets")

# ---------- generic helpers ----------
def safe_filename(name: str) -> str:
    """Turn ``name`` into a string usable as a file or folder name.

    Every character other than ASCII letters, digits, ``.``, ``_``, space
    and ``-`` is replaced by ``_``; leading and trailing spaces, underscores
    and dots are then removed. A falsy ``name`` yields ``""``.
    """
    raw = name if name else ""
    replaced = re.sub(r'[^a-zA-Z0-9._ -]', '_', raw)
    return replaced.strip(" _.")

def chrome_driver(headless=True):
    """Create a Chrome WebDriver configured for scraping.

    ``headless`` adds ``--headless=new`` ahead of the fixed set of flags.
    The driver binary path comes from ``ChromeDriverManager().install()``
    and the page-load timeout is set to 60 seconds.
    """
    flags = ["--headless=new"] if headless else []
    flags += [
        "--window-size=1366,900",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        # Anti-automation tweaks
        "--disable-blink-features=AutomationControlled",
        "--lang=en-US,en",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36",
    ]

    opts = webdriver.ChromeOptions()
    for flag in flags:
        opts.add_argument(flag)

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=opts)
    driver.set_page_load_timeout(60)
    return driver


def atomic_write_json(path: str, data: dict):
    """Write ``data`` as pretty-printed UTF-8 JSON to ``path`` atomically.

    The JSON is first written to ``<path>.tmp``, flushed and fsync'ed, and
    then moved over ``path`` with ``os.replace`` so readers never observe a
    half-written file.

    Raises:
        TypeError / ValueError: if ``data`` is not JSON-serialisable.
        OSError: if the temp file cannot be written or moved.

    On any failure the temp file is removed (best effort) and an existing
    file at ``path`` is left untouched.
    """
    tmp = f"{path}.tmp"
    try:
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp, path)
    except BaseException:
        # Do not leave a partial "<path>.tmp" behind; the original error is
        # what the caller needs to see, so cleanup problems are ignored.
        with contextlib.suppress(OSError):
            os.remove(tmp)
        raise

def write_failure_log(failures: list, where: str):
    """Persist ``failures`` to ``failures.json`` in SCRIPT_DIR.

    ``where`` is stored alongside the list. A failed write is reported as a
    warning on stderr and never propagates to the caller.
    """
    log_path = os.path.join(SCRIPT_DIR, "failures.json")
    payload = {"where": where, "failures": failures}
    try:
        atomic_write_json(log_path, payload)
    except Exception as exc:
        print(f"[WARN] Could not write {log_path}: {exc}", file=sys.stderr)

In [45]:
def parse_weapons(card, section_class: str):
    """Collect weapon profiles from the section matching ``section_class``.

    ``section_class`` is a CSS selector looked up under ``card``. Inside each
    ``.weapon`` element the direct child divs are walked in order: a
    ``weapon_name`` div starts a new profile, ``weapon_characteristics`` divs
    add stats keyed by the CSS class of each nested div, and
    ``weapon_abilities`` divs add the text of each ``.weapon_ability``.

    Returns a list of ``{"name", "stats", "abilities"}`` dicts; empty when
    the section is missing.
    """
    profiles = []
    section = find_one(card, By.CSS_SELECTOR, section_class)
    if not section:
        return profiles

    for weapon_el in find_all(section, By.CSS_SELECTOR, ".weapon"):
        parts = find_all(weapon_el, By.XPATH, "./div[contains(@class,'weapon_name') or contains(@class,'weapon_characteristics') or contains(@class,'weapon_abilities')]")
        profile = None
        for part in parts:
            css = part.get_attribute("class") or ""

            if "weapon_name" in css:
                # A new name closes the previous profile of this weapon.
                if profile:
                    profiles.append(profile)
                profile = {"name": safe_text(part), "stats": {}, "abilities": []}
                continue

            is_stats = "weapon_characteristics" in css
            if not is_stats and "weapon_abilities" not in css:
                continue

            # Stats or abilities seen before any name: start an unnamed profile.
            if profile is None:
                profile = {"name": "", "stats": {}, "abilities": []}

            if is_stats:
                for cell in find_all(part, By.CSS_SELECTOR, "div"):
                    stat_key = (cell.get_attribute("class") or "").strip()
                    stat_val = safe_text(cell)
                    if stat_key:
                        profile["stats"][stat_key] = stat_val
            else:
                for ability_el in find_all(part, By.CSS_SELECTOR, ".weapon_ability"):
                    label = safe_text(ability_el)
                    if label:
                        profile["abilities"].append(label)

        if profile:
            profiles.append(profile)
    return profiles

In [46]:
def parse_datasheet_name(card) -> str:
    """Return the text of the first ``.name`` element under ``card``."""
    return safe_text(find_one(card, By.CSS_SELECTOR, ".name"))

In [47]:
def parse_invulnerable_save(card) -> str:
    """Return the text of ``.invulnerable_save`` under ``card``, or ``""`` if absent."""
    save_el = find_one(card, By.CSS_SELECTOR, ".invulnerable_save")
    if not save_el:
        return ""
    return safe_text(save_el)

In [57]:
def parse_miniatures(card) -> List[Dict]:
    """Return one ``{"name", "characteristics"}`` entry per miniature profile.

    When the card has no ``.miniature`` element, a single entry is built
    from the card itself, named after the datasheet. Otherwise each
    ``.miniature`` contributes an entry named by its ``.header`` text.
    Empty names and empty characteristic maps are stored as ``None``.
    """
    def read_characteristics(scope):
        # Headers and values are separate sibling lists; pair them by position.
        labels = [safe_text(h) for h in find_all(scope, By.CSS_SELECTOR, ".characteristics_header > div")]
        figures = [safe_text(v) for v in find_all(scope, By.CSS_SELECTOR, ".characteristics > div > div")]
        return dict(zip(labels, figures))

    entries = []

    if not find_one(card, By.CSS_SELECTOR, ".miniature"):
        entries.append({
            "name": parse_datasheet_name(card) or None,
            "characteristics": read_characteristics(card) or None,
        })

    for mini_el in find_all(card, By.CSS_SELECTOR, ".miniature"):
        header_el = find_one(mini_el, By.CSS_SELECTOR, ".header")
        label = safe_text(header_el) if header_el else ""
        entries.append({
            "name": label or None,
            "characteristics": read_characteristics(mini_el) or None,
        })

    return entries

In [58]:
def parse_abilities(card) -> List[Dict[str, str]]:
    """Return the abilities listed in the card's ``.abilities`` section.

    Each ``.ability`` element becomes ``{"name": ..., "rule": ...}`` using the
    text of its ``.ability_name`` and ``.ability_rule`` children (``""`` when
    a child is missing). Returns an empty list when the section is absent.

    The return annotation previously claimed a dict, although a list has
    always been returned; it now matches the actual value.
    """
    sec = find_one(card, By.CSS_SELECTOR, ".abilities")
    if not sec:
        return []
    return [
        {
            "name": safe_text(find_one(ab, By.CSS_SELECTOR, ".ability_name")),
            "rule": safe_text(find_one(ab, By.CSS_SELECTOR, ".ability_rule")),
        }
        for ab in find_all(sec, By.CSS_SELECTOR, ".ability")
    ]

In [59]:
def parse_wargear_options(card) -> List[str]:
    """Return the non-empty texts of ``li.wargear_rule`` items in ``.wargear_rules``.

    Yields an empty list when the card has no ``.wargear_rules`` section.
    """
    section = find_one(card, By.CSS_SELECTOR, ".wargear_rules")
    if not section:
        return []
    texts = (safe_text(item) for item in find_all(section, By.CSS_SELECTOR, "li.wargear_rule"))
    return [text for text in texts if text]

In [60]:
def parse_unit_composition(card) -> Tuple[str, List[Dict]]:
    """Return the unit composition text and its table rows.

    The first item is the full text of ``.unit_composition .composition``.
    The second holds one ``{"model", "count", "points"}`` dict per table body
    row with at least three cells, taken from the first three cells.
    ``("", [])`` is returned when the composition element is missing.
    """
    comp_el = find_one(card, By.CSS_SELECTOR, ".unit_composition .composition")
    if not comp_el:
        return "", []

    summary = safe_text(comp_el)
    rows = []
    for tr in find_all(comp_el, By.CSS_SELECTOR, "table tbody tr"):
        cells = find_all(tr, By.TAG_NAME, "td")
        if len(cells) < 3:
            continue
        model, count, points = (safe_text(cell) for cell in cells[:3])
        rows.append({"model": model, "count": count, "points": points})
    return summary, rows

In [61]:
def parse_led_by(card) -> List[Dict]:
    """Return the links listed in the card's "Led By" section.

    The section body is the div following the one whose nested div has the
    exact text ``Led By``. Each ``li a`` inside it becomes
    ``{"name": link text, "id": last path segment of its href}``.
    Returns an empty list when the section is not found.
    """
    body = find_one(card, By.XPATH, ".//div[.//div[text()='Led By']]/following-sibling::div")
    if not body:
        return []
    return [
        {
            "name": safe_text(link),
            "id": (link.get_attribute("href") or "").rpartition("/")[2],
        }
        for link in find_all(body, By.CSS_SELECTOR, "li a")
    ]

In [62]:
def parse_leader(card):
    """Return the links listed in the card's "Leader" section.

    The section body is the div following the one that contains a
    ``header``-classed div with the exact text ``Leader``. Each ``li a``
    inside it yields ``{"name": link text, "id": last segment of its href}``.
    An empty list is returned when no such section exists.
    """
    entries = []
    body = find_one(card, By.XPATH, ".//div[.//div[contains(@class,'header') and text()='Leader']]/following-sibling::div")
    if not body:
        return entries

    for link in find_all(body, By.CSS_SELECTOR, "li a"):
        label = safe_text(link)
        href = link.get_attribute("href") or ""
        entries.append({"name": label, "id": href.split("/")[-1]})
    return entries


In [67]:








# ---------- core parsing ----------
def scrape_datasheet_once(driver, url, faction_name, faction_id, datasheet_id) -> Dict:
    """Load ``url`` and parse its datacard into a plain dict (single attempt).

    The page is opened, the ``.datacard`` element is awaited, its direct
    child divs are clicked to expand them, and every ``parse_*`` helper is
    run against the card. Exceptions raised by the driver are not caught.
    """
    driver.get(url)
    wait_for(driver, ".datacard")
    card = driver.find_element(By.CSS_SELECTOR, ".datacard")
    expand_all_collapsibles(driver, card)

    # Keys are inserted in output order; parsers run in the same sequence.
    record = {
        "faction": faction_name,
        "faction_id": faction_id,
        "datasheet_id": datasheet_id,
    }
    record["datasheet_name"] = parse_datasheet_name(card)
    record["url"] = url
    record["miniatures"] = parse_miniatures(card)
    record["invulnerable_save"] = parse_invulnerable_save(card)
    record["ranged_weapons"] = parse_weapons(card, ".weapons_ranged")
    record["melee_weapons"] = parse_weapons(card, ".weapons_melee")
    record["abilities"] = parse_abilities(card)
    record["wargear_options"] = parse_wargear_options(card)

    composition_text, composition_rows = parse_unit_composition(card)
    record["unit_composition"] = composition_text
    record["unit_composition_table"] = composition_rows

    record["leader"] = parse_leader(card)
    record["led_by"] = parse_led_by(card)
    record["keywords"] = parse_keywords(card)
    record["custom_rules"] = parse_custom_rules(card)
    return record

def scrape_datasheet_with_retries(driver, url, faction_name, faction_id, datasheet_id, retries=3, delay=0.6):
    """Call ``scrape_datasheet_once`` up to ``retries`` times.

    Only the listed Selenium exceptions trigger a retry; anything else
    propagates immediately. ``delay`` seconds are waited between attempts,
    but not after the final one, since nothing follows it.

    Returns:
        The dict produced by the first successful attempt.

    Raises:
        The last Selenium exception when every attempt failed, or
        RuntimeError when no attempt was made (``retries`` < 1).
    """
    last_exc = None
    for attempt in range(1, retries + 1):
        try:
            return scrape_datasheet_once(driver, url, faction_name, faction_id, datasheet_id)
        except (TimeoutException, StaleElementReferenceException, WebDriverException, NoSuchElementException) as e:
            last_exc = e
            print(f"[WARN] Attempt {attempt}/{retries} failed for {datasheet_id}: {e.__class__.__name__}")
            # Back off only when another attempt will actually follow.
            if attempt < retries:
                time.sleep(delay)
    if last_exc is not None:
        raise last_exc
    raise RuntimeError("Unknown failure without exception")

# ---------- main ----------
def main():
    """Scrape every datasheet listed in factions.json into per-faction JSON files.

    factions.json is read as a list of dicts with "faction", "faction_id" and
    "datasheets" keys; each datasheet entry provides "datasheet_name" and
    "datasheet_id". One JSON file per datasheet is written under
    OUTPUT_ROOT/<faction>_<faction_id>/. Datasheets whose output file already
    exists are skipped, so an interrupted run can be resumed. Failures are
    collected in failures.json and, per datasheet, as an HTML snapshot plus
    an error JSON next to the expected output file.
    """
    if not os.path.exists(FACTIONS_PATH):
        print(f"[ERROR] factions.json not found at: {FACTIONS_PATH}")
        return

    with open(FACTIONS_PATH, "r", encoding="utf-8") as f:
        factions = json.load(f)

    # Total is computed up front so progress can be reported as processed/total.
    total = sum(len(f.get("datasheets", [])) for f in factions)
    processed = 0
    failures = []

    os.makedirs(OUTPUT_ROOT, exist_ok=True)
    print(f"[INFO] Writing JSON files under: {OUTPUT_ROOT}")

    # tqdm is optional: if it cannot be imported or set up, only the plain
    # print-based progress lines are used.
    use_tqdm = False
    try:
        from tqdm import tqdm  # type: ignore
        bar = tqdm(total=total, desc="Datasheets")
        use_tqdm = True
    except Exception:
        pass

    driver = chrome_driver(headless=True)

    try:
        for faction in factions:
            faction_name = faction.get("faction")
            faction_id   = faction.get("faction_id")
            # One output folder per faction, named "<sanitised name>_<id>".
            folder = os.path.join(OUTPUT_ROOT, f"{safe_filename(faction_name)}_{faction_id}")
            os.makedirs(folder, exist_ok=True)

            for ds in faction.get("datasheets", []):
                ds_name = ds.get("datasheet_name")
                ds_id   = ds.get("datasheet_id")
                url     = BASE + ds_id
                outfile = os.path.join(folder, f"{safe_filename(ds_name)}.json")

                # Resume support: an existing output file counts as done.
                if os.path.exists(outfile):
                    processed += 1
                    remaining = total - processed
                    print(f"[{processed}/{total}] SKIP (exists): {ds_name} -> {outfile} | Left: {remaining}")
                    if use_tqdm: bar.update(1)
                    continue

                try:
                    data = scrape_datasheet_with_retries(driver, url, faction_name, faction_id, ds_id, retries=3)
                    # fill minimum fields
                    data = data or {}
                    data.setdefault("url", url)
                    data.setdefault("faction", faction_name)
                    data.setdefault("faction_id", faction_id)
                    data.setdefault("datasheet_id", ds_id)
                    data.setdefault("datasheet_name", ds_name)

                    atomic_write_json(outfile, data)
                    print(f"[WRITE] {outfile}")

                except Exception as e:
                    # A failing datasheet must not stop the run: record it and move on.
                    tb = traceback.format_exc(limit=8)
                    print(f"[ERROR] {ds_name} ({ds_id}) failed: {e}\n{tb}")
                    err = {
                        "faction": faction_name,
                        "faction_id": faction_id,
                        "datasheet": ds_name,
                        "id": ds_id,
                        "url": url,
                        "error": str(e),
                        "traceback": tb,
                    }
                    failures.append(err)
                    # The full failure list is rewritten after every failure so
                    # it is on disk even if the run is interrupted later.
                    write_failure_log(failures, where=f"scraping {ds_id}")

                    # Save debug artifacts
                    try:
                        html_path = os.path.join(folder, f"{safe_filename(ds_name)}.__error__.html")
                        with open(html_path, "w", encoding="utf-8") as h:
                            h.write(driver.page_source or "")
                        err_json = os.path.join(folder, f"{safe_filename(ds_name)}.__ERROR__.json")
                        atomic_write_json(err_json, err)
                        print(f"[DEBUG] Saved debug to {html_path} & {err_json}")
                    except Exception as ex:
                        print(f"[WARN] Could not save debug artifacts: {ex}", file=sys.stderr)

                finally:
                    # Progress advances for successes and failures alike.
                    processed += 1
                    remaining = total - processed
                    print(f"[{processed}/{total}] {ds_name} | Left: {remaining}")
                    if use_tqdm: bar.update(1)

                time.sleep(0.20)  # throttle a bit

    finally:
        # Always shut the browser down, ignoring errors raised while quitting.
        with contextlib.suppress(Exception):
            driver.quit()
        if use_tqdm:
            bar.close()

    write_failure_log(failures, where="finalize")
    print("\n=== Summary ===")
    print(f"Processed: {processed} / {total}")
    print(f"Failures : {len(failures)}")
    if failures:
        print(f"See failures.json in {SCRIPT_DIR} and *__ERROR__.json/.__error__.html under faction folders.")

# Run the scraper only when executed directly, not when imported.
if __name__ == "__main__":
    main()

[INFO] Writing JSON files under: /Users/oliver.iyer/PyCharmMiscProject/datasheets
[1/979] SKIP (exists): Castellan -> /Users/oliver.iyer/PyCharmMiscProject/datasheets/Codex Supplement_ Black Templars_oCfJYt7fR9g/Castellan.json | Left: 978
[2/979] SKIP (exists): Chaplain Grimaldus -> /Users/oliver.iyer/PyCharmMiscProject/datasheets/Codex Supplement_ Black Templars_oCfJYt7fR9g/Chaplain Grimaldus.json | Left: 977
[3/979] SKIP (exists): Crusade Ancient -> /Users/oliver.iyer/PyCharmMiscProject/datasheets/Codex Supplement_ Black Templars_oCfJYt7fR9g/Crusade Ancient.json | Left: 976
[4/979] SKIP (exists): Crusader Squad -> /Users/oliver.iyer/PyCharmMiscProject/datasheets/Codex Supplement_ Black Templars_oCfJYt7fR9g/Crusader Squad.json | Left: 975
[5/979] SKIP (exists): Emperor’s Champion -> /Users/oliver.iyer/PyCharmMiscProject/datasheets/Codex Supplement_ Black Templars_oCfJYt7fR9g/Emperor_s Champion.json | Left: 974
[6/979] SKIP (exists): Execrator -> /Users/oliver.iyer/PyCharmMiscProject/d

In [63]:
def parse_keywords(card) -> Dict[str, List[str]]:
    """Return the card's keyword lists, split on commas.

    ``"faction_keywords"`` comes from ``.faction_keywords`` and ``"keywords"``
    from ``.keywords``; a key is only present when its element exists.
    Entries are stripped and empty ones are dropped.
    """
    def split_csv(el):
        pieces = (safe_text(el) or "").split(",")
        return [piece.strip() for piece in pieces if piece.strip()]

    faction_el = find_one(card, By.CSS_SELECTOR, ".faction_keywords")
    general_el = find_one(card, By.CSS_SELECTOR, ".keywords")

    result = {}
    if faction_el:
        result["faction_keywords"] = split_csv(faction_el)
    if general_el:
        result["keywords"] = split_csv(general_el)
    return result

In [64]:
def parse_custom_rules(card):
    """Collect titled sections of the card that are not standard sections.

    Only direct child divs of ``card`` without a ``class`` attribute are
    considered. A section is kept when its ``.collapsible_header .header``
    text is non-empty and not listed in EXPECTED_TITLES. Each result is
    ``{"title": ..., "text": ...}`` where ``text`` is the text of the
    section's last child div, or ``None`` when that is empty.
    """
    rules = []
    for section in card.find_elements(By.XPATH, "./div[not(@class)]"):
        header_el = find_one(section, By.CSS_SELECTOR, ".collapsible_header .header")
        heading = safe_text(header_el) if header_el else ""
        if heading in EXPECTED_TITLES:
            continue

        # The section body is taken from the last direct child div.
        try:
            body_el = find_one(section, By.XPATH, "./div[last()]")
            body = safe_text(body_el) if body_el else ""
        except Exception:
            body = ""

        if not heading:
            continue
        rules.append({
            "title": heading,
            "text": body or None,
        })

    return rules