In [1]:
# %pip install selenium webdriver-manager

import json
import os
import re
import time
from dataclasses import dataclass, asdict
from typing import Any, Dict, List

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# Root of the site being scraped.
BASE = "https://39k.pro"
# Template for a single detachment page; the placeholder takes the detachment id.
DETACHMENT_URL = f"{BASE}/detachment/{{}}"
# Default input file (list of factions with their detachments).
FACTIONS_JSON = "factions.json"
# Default output file (list of scraped detachment records).
OUTPUT_JSON = "detachment.json"


# ---------------- Selenium setup ----------------
def chrome_driver(headless: bool = True) -> webdriver.Chrome:
    """
    Build a Chrome WebDriver configured with a fixed set of command-line switches.

    headless: when true, "--headless=new" is added ahead of the other switches.
    The chromedriver binary path comes from ChromeDriverManager().install().
    """
    switches = [
        "--no-sandbox",
        "--disable-gpu",
        "--window-size=1280,1600",
        "--disable-dev-shm-usage",
    ]
    if headless:
        # new headless mode for Chrome 109+
        switches.insert(0, "--headless=new")

    chrome_options = Options()
    for switch in switches:
        chrome_options.add_argument(switch)

    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options,
    )


# ---------------- Data model ----------------
@dataclass
class DetachmentRecord:
    """
    One scraped detachment page plus the faction context it was listed under.

    Field order is significant: it fixes both the positional constructor
    signature and the key order produced by dataclasses.asdict().
    """

    # --- faction context (taken from the factions file) ---
    faction_name: str
    faction_id: str

    # --- detachment identity ---
    detachment_name: str
    detachment_id: str
    url: str

    # --- scraped sections, each a list of plain dicts ---
    rules: List[Dict[str, Any]]
    enhancements: List[Dict[str, Any]]
    stratagems: List[Dict[str, Any]]


# ---------------- I/O helpers ----------------
def load_factions(path: str = FACTIONS_JSON) -> List[Dict[str, Any]]:
    """
    Read the UTF-8 JSON file at *path* and return its parsed content.

    Raises whatever open() / the JSON parser raise when the file is missing
    or malformed; nothing is caught here.
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)

def load_existing_detachments(path: str = OUTPUT_JSON) -> Dict[str, Dict[str, Any]]:
    """
    Load previously saved records from *path*, indexed by their "detachment_id".

    Returns an empty dict when *path* does not exist. If several records share
    an id, the last one in the file wins. A record without "detachment_id"
    raises KeyError.
    """
    by_id: Dict[str, Dict[str, Any]] = {}
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            for record in json.load(handle):
                by_id[record["detachment_id"]] = record
    return by_id

def save_detachments(records_by_id: Dict[str, Dict[str, Any]], path: str = OUTPUT_JSON):
    """
    Write all records to *path* as a JSON list sorted by (faction_name, detachment_name).

    The data is first written to a sibling "<path>.tmp" file and then moved
    over *path* with os.replace(). This function is called after every scraped
    page, so an interruption in the middle of a write must not truncate the
    progress that is already on disk.

    records_by_id: mapping of detachment id -> record dict; only the values are saved.
    path:          destination file.
    Raises:        any error from open()/json.dump()/os.replace(); the temp file
                   is removed before the error propagates.
    """
    items = sorted(
        records_by_id.values(),
        key=lambda r: (r.get("faction_name", ""), r.get("detachment_name", "")),
    )

    # Same directory as the target so the final rename stays on one filesystem.
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(items, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file no longer exists; this only
        # cleans up after a failed write.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


# ---------------- Scraping utils ----------------
def _inner_text(el) -> str:
    """
    Return el.text with surrounding whitespace removed.

    Any exception raised while reading or stripping the text is swallowed and
    an empty string is returned instead.
    """
    try:
        raw = el.text
        cleaned = raw.strip()
    except Exception:
        cleaned = ""
    return cleaned

def _click_all_collapsibles(driver):
    """
    Click every collapsible header on the current page, best effort.

    Three CSS selectors are tried in order. For each matched header the
    element is scrolled to the viewport centre, clicked, and followed by a
    short pause. Failures (lookup or click) are ignored.

    NOTE(review): the last selector also matches every header matched by the
    first two, so those headers are clicked once per matching selector.
    Whether a second click collapses the section again depends on the page;
    confirm before changing the selector list.
    """
    header_selectors = (
        ".enhancements .collapsible_header",
        ".stratagems .collapsible_header",
        ".collapsible_header",
    )
    for css in header_selectors:
        try:
            found = driver.find_elements(By.CSS_SELECTOR, css)
        except Exception:
            # Lookup failed for this selector; move on to the next one.
            continue

        for header in found:
            try:
                driver.execute_script("arguments[0].scrollIntoView({block:'center'});", header)
                header.click()
                time.sleep(0.12)
            except Exception:
                # A header that cannot be clicked is simply skipped.
                continue

def _parse_rules(driver) -> List[Dict[str, str]]:
    """
    Collect the text of every element matching "main .rule" on the current page.

    Returns a list of {"text": ...} dicts in page order. Blocks whose text is
    empty are dropped, and a text that repeats an earlier one is kept only at
    its first position. A failed element lookup yields an empty list.
    """
    try:
        blocks = driver.find_elements(By.CSS_SELECTOR, "main .rule")
    except Exception:
        blocks = []

    texts = (_inner_text(block) for block in blocks)
    # dict.fromkeys keeps first-seen order while discarding later duplicates.
    unique_texts = dict.fromkeys(t for t in texts if t)
    return [{"text": t} for t in unique_texts]

def _expand_and_get(driver, item, header_sel, body_sel, wait_seconds=2) -> str:
    """
    Expand one collapsible item and return the text of its body element(s).

    Steps, each of which ignores its own failures:
      1. find the header (header_sel) inside *item*, scroll it to the viewport
         centre and click it;
      2. wait up to *wait_seconds* for an element matching body_sel to be
         present inside *item*;
      3. read the text of every element inside *item* matching body_sel.

    Non-empty texts are joined with a blank line between them; the result is
    "" when nothing was found.
    """
    # 1) open the item
    try:
        header = item.find_element(By.CSS_SELECTOR, header_sel)
        driver.execute_script("arguments[0].scrollIntoView({block:'center'});", header)
        header.click()
    except Exception:
        pass

    # 2) give the body a chance to show up; the wait is scoped to the item itself
    try:
        waiter = WebDriverWait(item, wait_seconds)
        waiter.until(EC.presence_of_element_located((By.CSS_SELECTOR, body_sel)))
    except Exception:
        # never appeared in time: the lookup below just comes back empty
        pass

    # 3) gather whatever body blocks exist now
    try:
        body_elements = item.find_elements(By.CSS_SELECTOR, body_sel)
    except Exception:
        body_elements = []

    chunks = [chunk for chunk in map(_inner_text, body_elements) if chunk]
    return "\n\n".join(chunks).strip()

def _parse_collapsible_section(driver, container_sel, item_sel, name_sel, body_sel) -> List[Dict[str, str]]:
    """
    Shared scraper for a section made of collapsible items (name + hidden body).

    container_sel: CSS selector of the section container, looked up on the page.
    item_sel:      CSS selector of one item, looked up inside the container.
    name_sel:      CSS selector of the item's name, looked up inside the item.
    body_sel:      CSS selector of the item's body, read via _expand_and_get.

    Returns a list of {"name": ..., "text": ...} dicts in page order. Returns
    [] when the container is not found. Items with neither a name nor a text
    are omitted.
    """
    try:
        container = driver.find_element(By.CSS_SELECTOR, container_sel)
    except Exception:
        return []

    out = []
    for it in container.find_elements(By.CSS_SELECTOR, item_sel):
        try:
            name = it.find_element(By.CSS_SELECTOR, name_sel).text.strip()
        except Exception:
            name = ""

        text = _expand_and_get(
            driver,
            item=it,
            header_sel=".collapsible_header",
            body_sel=body_sel,
            wait_seconds=3,
        )

        # Fallback: if the body came back empty, use the item's whole text
        # with the leading name removed (only when it does start with the name).
        if not text:
            full = _inner_text(it)
            if name and full.startswith(name):
                text = full[len(name):].strip()

        if name or text:
            # Text is kept raw (e.g. "Cost: 20" is not normalised).
            out.append({"name": name, "text": text})
    return out


def parse_enhancements(driver) -> List[Dict[str, str]]:
    """
    Enhancements:
      container: main .enhancements
      item:      .enhancement
      name:      .enhancement_name
      body:      .enhancement_rule  (read after clicking the item's header)

    Returns a list of {"name": ..., "text": ...} dicts; [] if the section is absent.
    """
    return _parse_collapsible_section(
        driver,
        container_sel="main .enhancements",
        item_sel=".enhancement",
        name_sel=".enhancement_name",
        body_sel=".enhancement_rule",
    )


def parse_stratagems(driver) -> List[Dict[str, str]]:
    """
    Stratagems:
      container: main .stratagems
      item:      .stratagem
      name:      .stratagem_name
      body:      .stratagem_rules (read after clicking the item's header)

    Returns a list of {"name": ..., "text": ...} dicts; [] if the section is absent.
    """
    return _parse_collapsible_section(
        driver,
        container_sel="main .stratagems",
        item_sel=".stratagem",
        name_sel=".stratagem_name",
        body_sel=".stratagem_rules",
    )


# ---------------- Single-page scrape ----------------
def scrape_detachment(driver, detachment_id: str) -> Dict[str, Any]:
    """
    Load one detachment page and return its title, rules, enhancements and stratagems.

    The result dict has the keys "detachment_name", "detachment_id", "url",
    "rules", "enhancements" and "stratagems", in that order.

    NOTE(review): if "main h1" does not appear within 15 seconds the name is
    left empty and scraping continues, so a page that failed to render still
    produces a (possibly empty) result rather than an error.
    """
    url = DETACHMENT_URL.format(detachment_id)
    driver.get(url)

    # The page title doubles as the "page has rendered" signal.
    detachment_name = ""
    try:
        title_el = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "main h1"))
        )
        detachment_name = title_el.text.strip()
    except Exception:
        detachment_name = ""

    result: Dict[str, Any] = {
        "detachment_name": detachment_name,
        "detachment_id": detachment_id,
        "url": url,
    }

    # Click through the collapsibles first, then read the three sections in order.
    _click_all_collapsibles(driver)
    result["rules"] = _parse_rules(driver)
    result["enhancements"] = parse_enhancements(driver)
    result["stratagems"] = parse_stratagems(driver)
    return result


# ---------------- Orchestrator ----------------
def run_detachment_scrape(
        headless: bool = True,
        factions_path: str = FACTIONS_JSON,
        output_path: str = OUTPUT_JSON,
        force: bool = False
):
    """
    Scrape every detachment listed in the factions file and save the results.

    headless:      passed to chrome_driver(); False shows the browser window.
    factions_path: JSON file with the factions and their detachments.
    output_path:   JSON file the records are loaded from and saved to.
    force:         when False, detachments already present in output_path are
                   skipped; when True, everything is scraped again.

    Progress is printed per detachment and the output file is rewritten after
    each successfully scraped page. A failure on one detachment is printed and
    does not stop the run. At the end, detachments that are still absent from
    the saved set are listed. The browser is only started once the first
    detachment that actually needs scraping is reached.
    """
    factions = load_factions(factions_path)

    # Flatten factions -> detachments, carrying the faction context along.
    all_detachments = []
    for f in factions:
        # Strip known prefixes for a clean stored name. Prefixes are tried in
        # order and each one that matches (case-insensitively) is removed.
        clean_name = f.get("faction", "")
        for prefix in ("Codex Supplement: ", "Codex: ", "Index: "):
            if clean_name.lower().startswith(prefix.lower()):
                clean_name = clean_name[len(prefix):]
        clean_name = clean_name.strip()

        for d in f.get("detachments", []):
            all_detachments.append({
                "faction_name": clean_name,
                "faction_id": f["faction_id"],
                "detachment_name": d["detachment_name"],
                "detachment_id": d["detachment_id"]
            })

    total = len(all_detachments)
    if total == 0:
        print(f"No detachments found in {factions_path}")
        return

    existing = load_existing_detachments(output_path)

    driver = None
    try:
        for processed, det in enumerate(all_detachments, start=1):
            det_id = det["detachment_id"]
            remaining = total - processed

            # Skip if already scraped and not forcing
            if not force and det_id in existing:
                print(f"[{processed}/{total}] (skip) {det['detachment_name']} ({det_id}) — remaining: {remaining}")
                continue

            # Start Chrome lazily so a run where everything is skipped never
            # launches a browser. A start-up failure still aborts the run.
            if driver is None:
                driver = chrome_driver(headless=headless)

            try:
                data = scrape_detachment(driver, det_id)
                record = DetachmentRecord(
                    faction_name=det["faction_name"],
                    faction_id=det["faction_id"],
                    # Prefer the title found on the page; fall back to the listed name.
                    detachment_name=data["detachment_name"] or det["detachment_name"],
                    detachment_id=det_id,
                    url=data["url"],
                    rules=data["rules"],
                    enhancements=data["enhancements"],
                    stratagems=data["stratagems"]
                )
                existing[det_id] = asdict(record)

                # Save incrementally to avoid losing progress
                save_detachments(existing, output_path)

                print(f"[{processed}/{total}] Parsed {record.detachment_name} ({det_id}) — remaining: {remaining}")

            except Exception as e:
                # Best effort per detachment: report and carry on with the next one.
                print(f"[{processed}/{total}] ERROR {det['detachment_name']} ({det_id}) — {e} — remaining: {remaining}")

    finally:
        if driver is not None:
            driver.quit()

    # Report missing
    names_by_id = {}
    for d in all_detachments:
        # setdefault keeps the first listed name when an id appears more than once.
        names_by_id.setdefault(d["detachment_id"], d["detachment_name"])

    missing = sorted(set(names_by_id) - set(existing))
    if missing:
        print(f"\nMissing detachments (not in {output_path}):")
        for mid in missing:
            print(f" - {names_by_id[mid]} ({mid})")
    else:
        print(f"\nAll detachments parsed and saved to {output_path} ✅")




# Usage:
#   1) Put factions.json in the working directory.
#   2) Run the call below to start the scraper.
run_detachment_scrape(
    force=False,                    # True = re-scrape everything from scratch
    output_path="detachment.json",
    factions_path="factions.json",
    headless=True,                  # set False to watch the clicks
)

[1/161] Parsed Wrathful Procession (L0LbhQIXtOk) — remaining: 160
[2/161] Parsed Vindication Task Force (ntsshfM8xU0) — remaining: 159
[3/161] Parsed Godhammer Assault Force (pKBuSeHT2rs) — remaining: 158
[4/161] Parsed Companions of Vehemence (589yQh55Gt0) — remaining: 157
[5/161] Parsed The Lost Brethren (g_aEwJB0tlo) — remaining: 156
[6/161] Parsed The Angelic Host (GJZL4dXKNnE) — remaining: 155
[7/161] Parsed Liberator Assault Group (zZvCub3ckxw) — remaining: 154
[8/161] Parsed Angelic Inheritors (W_GWLukW_Fg) — remaining: 153
[9/161] Parsed Wrath of the Rock (WTntdvZmrzM) — remaining: 152
[10/161] Parsed Unforgiven Task Force (n9NoYWdMux4) — remaining: 151
[11/161] Parsed Lion’s Blade Task Force (qOP4UVb0zZQ) — remaining: 150
[12/161] Parsed Inner Circle Task Force (WoK_FkHfPZw) — remaining: 149
[13/161] Parsed Company of Hunters (eb3NjT4RSLo) — remaining: 148
[14/161] Parsed Saga of the Hunter (z0IkwaORndM) — remaining: 147
[15/161] Parsed Saga of the Bold (7wVoOdX3Z6Y) — remaini