In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from visca.browser import (
    create_driver,
    ensure_page_loaded,
    capture_full_page_screenshot
)
from visca.element_extractor import (
    extract_elements_from_driver,
    save_elements
)
from visca.dedup import deduplicate_screenshots
from visca.virtual_node import (
    VirtualNode,
    build_dom_tree
)
from visca.segment import (
    tag_multiset,
    jaccard_distance,
    subtree_size,
    calculate_psi_avg,
    calculate_psi_sum,
    gather_instances,
    ascii_tree,
)
# from visca.llm.gemini import create_model
# from visca.prompts import (
#     PAGE_CONTEXT_EXTRACTION_SYSTEM_PROMPT,
#     CLASSIFICATION_PROMPT,
#     # CONTEXTUAL_DESCRIPTION_PROMPT,
#     COMPONENT_GENERATION_PROMPT,
#     LIST_COMPONENT_GENERATION_PROMPT
# )
# from visca.llm_processing import (
#     _get_ancestor_context,
#     classify_and_describe_candidates,
#     # extract_candidates_context,
#     transform_candidate
# )

# Initialization

In [3]:
import time
import os
import io
import time
import json
from typing import Tuple
from pathlib import Path
import shutil

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.remote.webelement import WebElement

# DOM Element Extraction

In [4]:
def extraction(url: str, out_dir: str, delay: float = 5.0):
    """Render *url* in headless Chrome and build the reduced DOM tree.

    Pipeline: load the page, wait *delay* seconds, stop any pending loads,
    extract the DOM elements, let ``save_elements`` process them with
    ``result_dir=out_dir``, keep only the elements that carry a
    ``'screenshot'`` key, de-duplicate them and build the tree.

    Args:
        url: Page to open (the notebook passes ``file://`` URLs).
        out_dir: Directory handed to ``save_elements`` as ``result_dir``.
        delay: Seconds to wait after navigation before ``window.stop()``.

    Returns:
        Whatever ``build_dom_tree`` returns for the de-duplicated elements.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--hide-scrollbars")  # Hide scrollbars to avoid affecting layout
    chrome_options.add_argument("--force-device-scale-factor=1")  # Force known scale factor
    chrome_options.add_argument("--disable-gpu")
    chrome_options.page_load_strategy = "eager"  # return from get() once the DOM is ready

    # webdriver_manager may hand back the THIRD_PARTY_NOTICES file instead of
    # the binary; point at the real chromedriver and make sure it is executable.
    chrome_path = ChromeDriverManager().install()
    if "THIRD_PARTY_NOTICES.chromedriver" in chrome_path:
        chrome_path = chrome_path.replace("THIRD_PARTY_NOTICES.chromedriver", "chromedriver")
    os.chmod(chrome_path, 0o755)  # octal: rwxr-xr-x (decimal 755 is a different mode)

    # Use the corrected path; calling install() again would undo the fix above.
    driver = Chrome(service=Service(chrome_path), options=chrome_options)
    try:
        driver.get(url)

        time.sleep(delay)
        driver.execute_script("window.stop();")

        dom_elements = extract_elements_from_driver(driver)

        dom_elements_with_screenshot = save_elements(
            driver=driver,
            result_dir=out_dir,
            dom_elements=dom_elements,
        )

        # Only elements that carry a 'screenshot' entry take part in de-duplication.
        dom_elements_with_screenshot = [
            el for el in dom_elements_with_screenshot if 'screenshot' in el
        ]

        deduplicated_elements = deduplicate_screenshots(dom_elements_with_screenshot)

        reduced_tree = build_dom_tree(deduplicated_elements)
    finally:
        # Always release the browser, even when a pipeline step raises.
        driver.quit()

    return reduced_tree

# Main

In [5]:
DATASET_ROOT = Path("evaluation/segmentation/datasets/dataset-popular")   # where the sites live
RESULTS_ROOT = Path("evaluation/segmentation/results")                    # where we'll mirror them

# NOTE(review): URL and DIR are machine-specific and are not used by the rest
# of this cell; kept unchanged in case other cells or interactive runs read them.
URL = 'file:////Users/martintang/Desktop/Github/auto-assert/evaluation/segmentation/datasets/dataset-popular/eu.real.com/eu.real.com/index.blocks.html'
DIR = 'evaluation/segmentation/results/eu.real.com/ground-truth'

# site name -> dataset-relative path of that site's annotated HTML page
html_paths = {}

try:
    site_dirs = [p for p in DATASET_ROOT.expanduser().iterdir() if p.is_dir()]
except (FileNotFoundError, PermissionError, OSError) as e:
    raise SystemExit(f"Cannot read dataset directory: {e}")

for site_path in site_dirs:
    site_name = site_path.name
    # parents=True so the first run works even when RESULTS_ROOT does not exist yet
    (RESULTS_ROOT / site_name).mkdir(parents=True, exist_ok=True)
    html_paths[site_name] = f"{site_name}/{site_name}/index.blocks.html"
    print(f"Created/verified: {RESULTS_ROOT/site_name}")

print("\nAll result folders are ready.")

Created/verified: evaluation/segmentation/results/www.alistapart.com
Created/verified: evaluation/segmentation/results/www.economist.com
Created/verified: evaluation/segmentation/results/www.findlaw.com
Created/verified: evaluation/segmentation/results/www.irs.gov
Created/verified: evaluation/segmentation/results/www.osha.gov
Created/verified: evaluation/segmentation/results/www.geocaching.com
Created/verified: evaluation/segmentation/results/www.break.com
Created/verified: evaluation/segmentation/results/www.openoffice.org
Created/verified: evaluation/segmentation/results/www.spiegel.de
Created/verified: evaluation/segmentation/results/www.nlm.nih.gov
Created/verified: evaluation/segmentation/results/speedtest.net
Created/verified: evaluation/segmentation/results/www.who.int
Created/verified: evaluation/segmentation/results/www.reuters.com
Created/verified: evaluation/segmentation/results/www.freetranslation.com
Created/verified: evaluation/segmentation/results/www.heritage.org
Create

In [6]:
import json
from pathlib import Path
import argparse
from typing import List, Dict, Any


def load_elements(path: str | Path) -> List[Dict[str, Any]]:
    """Read the flat element list produced by your extractor."""
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


def is_descendant(child_xpath: str, ancestor_xpath: str) -> bool:
    """Report whether *child_xpath* lies strictly below *ancestor_xpath*.

    The check is purely textual: the child must begin with the ancestor path
    followed by a ``/`` separator. A path is therefore never its own
    descendant, and names that merely share a prefix (``body`` vs ``body2``)
    do not match.
    """
    required_prefix = ancestor_xpath + "/"
    return child_xpath.startswith(required_prefix)


def build_segments(elements: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Map every segment root to the leaf elements found beneath it.

    A *root* is any element whose ``gt_dataBlock`` entry is present and not
    ``None``. Its *descendants* are the elements whose xpath lies strictly
    below the root's xpath (see ``is_descendant``); a *leaf* is a descendant
    with no further descendant inside the same segment.

    Args:
        elements: Flat list of element dicts. Each must have an ``"xpath"``
            key; roots additionally carry ``"gt_dataBlock"`` and, optionally,
            ``"gt_dataBlockType"``.

    Returns:
        ``{root_xpath: {"count", "dataBlock", "dataBlockType", "leaves"}}``
        in the order the roots appear in *elements*. ``"leaves"`` is a list
        of xpaths and ``"count"`` its length. ``"dataBlockType"`` is ``None``
        when the root has no ``gt_dataBlockType`` entry.

    Raises:
        KeyError: If an element has no ``"xpath"`` key.
    """
    segments_out = {}

    for root in elements:
        # Only elements that define a segment act as roots.
        if root.get("gt_dataBlock") is None:
            continue
        root_xpath = root["xpath"]

        # All elements strictly inside the root (depth >= 1), in input order.
        descendant_xpaths = [
            el["xpath"] for el in elements
            if is_descendant(el["xpath"], root_xpath)
        ]

        # A leaf has no descendant of its own inside the segment. A path is
        # never its own strict descendant, so no self-exclusion is needed.
        leaves = [
            cand_xpath for cand_xpath in descendant_xpaths
            if not any(
                is_descendant(other_xpath, cand_xpath)
                for other_xpath in descendant_xpaths
            )
        ]

        segments_out[root_xpath] = {
            "count": len(leaves),
            "dataBlock": root["gt_dataBlock"],
            # .get(): a root without a type annotation must not abort the run
            "dataBlockType": root.get("gt_dataBlockType"),
            "leaves": leaves,
        }

    return segments_out


# Visualize

In [7]:
import json, textwrap, time, io
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from PIL import Image   # only if you want to display inline in a notebook

In [8]:
def export_parent_bboxes(
    url: str,
    seg_json_path: str,
    out_json_path: str = "output.json",
    delay: float = 5.0,         # allow SPA hydration etc.
):
    """
    Build an "auto-assert" segmentation file that contains one polygon
    (= the left/top/right/bottom edges) for every **parent** segment
    found in *seg_json_path*.

    Args:
        url: Page to open in headless Chrome.
        seg_json_path: JSON file whose top-level keys are parent xpaths.
        out_json_path: Destination file; missing parent folders are created.
        delay: Seconds to wait after navigation before ``window.stop()``.

    Returns:
        *out_json_path*, unchanged.

    Output schema
    -------------
    {
    "id": "<file name you chose>",
    "height": <full-page CSS px>,
    "width":  <full-page CSS px>,
    "number_of_segments": <int>,
    "segmentations": {
        "auto-assert": [
        [ [ [ [x, y], [x, y], [x, y], [x, y] ] ] ],   # segment-1 polygon
        ...
        ]
    }
    }
    """
    # -----------------------------------
    # 1)  Load segmentation file
    # -----------------------------------
    # Read first so a missing or invalid file fails before a browser is started.
    with open(seg_json_path, encoding="utf-8") as f:
        seg_raw = json.load(f)           # parent_xpath -> {count, leaves, ...}

    parent_xpaths = list(seg_raw.keys())

    # -----------------------------------
    # 2)  Launch Chrome
    # -----------------------------------
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--hide-scrollbars")
    chrome_options.add_argument("--force-device-scale-factor=1")
    chrome_options.page_load_strategy = "eager"

    # webdriver_manager may return the THIRD_PARTY_NOTICES file; use the binary.
    chromedriver_path = ChromeDriverManager().install()
    if chromedriver_path.endswith("THIRD_PARTY_NOTICES.chromedriver"):
        chromedriver_path = chromedriver_path.replace(
            "THIRD_PARTY_NOTICES.chromedriver", "chromedriver"
        )
    os.chmod(chromedriver_path, 0o755)

    driver = Chrome(service=Service(chromedriver_path), options=chrome_options)

    def get_box(xp):
        # JavaScript helper returns dict {left, top, right, bottom} in page
        # coordinates, or None when the xpath matches nothing.
        return driver.execute_script(
            textwrap.dedent(
                """
                const xp = arguments[0];
                const el = document.evaluate(
                    xp, document, null,
                    XPathResult.FIRST_ORDERED_NODE_TYPE, null
                ).singleNodeValue;
                if (!el) return null;
                const r = el.getBoundingClientRect();
                return {
                left:   Math.round(r.left  + window.scrollX),
                top:    Math.round(r.top   + window.scrollY),
                right:  Math.round(r.right + window.scrollX),
                bottom: Math.round(r.bottom+ window.scrollY)
                };
                """
            ),
            xp,
        )

    try:
        # -----------------------------------
        # 3)  Open page and size the window to the full document
        # -----------------------------------
        driver.get(url)

        # let scripts, fonts, etc. settle
        time.sleep(delay)
        driver.execute_script("window.stop();")

        # page dimensions in CSS px
        width = driver.execute_script(
            "return Math.max(document.documentElement.scrollWidth, document.body.scrollWidth);"
        )
        height = driver.execute_script(
            "return Math.max(document.documentElement.scrollHeight, document.body.scrollHeight);"
        )
        driver.set_window_size(width, height)

        # -----------------------------------
        # 4)  Grab each parent's bounding box
        # -----------------------------------
        polygons = []
        for xp in parent_xpaths:
            box = get_box(xp)
            if box is None:
                # element not found in the rendered page; skip gracefully
                continue

            # Corner order: top-left, bottom-left, bottom-right, top-right
            poly = [
                [box["left"],  box["top"]],
                [box["left"],  box["bottom"]],
                [box["right"], box["bottom"]],
                [box["right"], box["top"]],
            ]
            # NOTE(review): this wraps the corner list in three lists, one
            # level deeper than the schema in the docstring shows. Left as-is;
            # confirm which nesting the consumer of this file expects.
            polygons.append([[ [ poly ] ]])
    finally:
        # Release the browser even if navigation or a script call raised.
        driver.quit()

    # -----------------------------------
    # 5)  Assemble & save result JSON
    # -----------------------------------
    out = {
        "id": Path(out_json_path).name,
        "height": height,
        "width":  width,
        "number_of_segments": len(polygons),
        "segmentations": {
            "auto-assert": polygons
        }
    }

    out_file = Path(out_json_path)
    out_file.parent.mkdir(parents=True, exist_ok=True)
    out_file.write_text(json.dumps(out, indent=2), encoding="utf-8")
    print(f"Wrote {len(polygons)} segments → {out_json_path}")

    return out_json_path

In [9]:
def screenshot_segments(
    url: str,
    seg_json_path: str,
    out_dir: str = "results",
    fname: str = "segmented.png",
    delay: float = 5,              # wait for SPA hydration etc.
):
    """Load segmentation JSON, overlay parent/leaf boxes in the browser,
       and write a PNG screenshot to *out_dir/fname*.

    Args:
        url: Page to open in headless Chrome.
        seg_json_path: JSON file mapping each parent xpath to an object
            whose optional ``"leaves"`` entry lists leaf xpaths.
        out_dir: Output folder; created (with parents) if missing.
        fname: File name of the PNG inside *out_dir*.
        delay: Seconds to wait after navigation before ``window.stop()``.

    Parents are drawn as translucent red boxes with a red outline, leaves as
    translucent green boxes inset by 1 px. Xpaths that match nothing in the
    rendered page are skipped silently.
    """
    # -----------------------------------
    # 1)  Load the segmentation and build the overlay script
    # -----------------------------------
    # Done before launching Chrome so a missing/invalid file fails fast.
    with open(seg_json_path, encoding="utf-8") as f:
        seg = json.load(f)           # dict[parent_xpath] -> {"count":..., "leaves":[...]}

    # The segmentation is embedded as a JSON string literal (double json.dumps)
    # and decoded in the page with JSON.parse.
    overlay_js = textwrap.dedent(f"""
  (function drawSegmentOverlays(segments) {{
    const node = (xp) => document.evaluate(
      xp, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null
    ).singleNodeValue;

    /* ---------- overlay cosmetics ---------- */
    const style = document.createElement('style');
    style.textContent = `
      .overlay-parent,
      .overlay-leaf {{
        position:absolute;
        z-index:2147483647;
        pointer-events:none;
      }}
      /* parent → translucent RED with bold outline */
      .overlay-parent {{
        background:rgba(255,0,0,.25);
        outline:2px solid red;
      }}
      /* leaf → translucent GREEN, **no outline**  */
      .overlay-leaf {{
        background:rgba(0,255,0,.25);
      }}`;
    document.head.appendChild(style);

    /**
     * Draw a rectangle; if inset>0, shrink it on every side
     * so the parent’s red outline isn’t hidden.
     */
    const draw = (rect, cls, inset = 0) => {{
      const d = document.createElement('div');
      d.className = cls;
      d.style.left   = (rect.left  + window.scrollX + inset) + 'px';
      d.style.top    = (rect.top   + window.scrollY + inset) + 'px';
      d.style.width  = Math.max(0, rect.width  - inset*2) + 'px';
      d.style.height = Math.max(0, rect.height - inset*2) + 'px';
      document.body.appendChild(d);
    }};

    Object.entries(segments).forEach(([parentXP, info]) => {{
      const p = node(parentXP);
      if (p) draw(p.getBoundingClientRect(), 'overlay-parent', 0);   // no inset
      (info.leaves || []).forEach(leafXP => {{
        const l = node(leafXP);
        if (l) draw(l.getBoundingClientRect(), 'overlay-leaf', 1);  // 1-px inset
      }});
    }});
  }})(JSON.parse({json.dumps(json.dumps(seg))}));
""")

    # -----------------------------------
    # 2)  Launch Chrome
    # -----------------------------------
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--hide-scrollbars")  # Hide scrollbars to avoid affecting layout
    chrome_options.add_argument("--force-device-scale-factor=1")  # Force known scale factor
    chrome_options.add_argument("--disable-gpu")
    chrome_options.page_load_strategy = "eager"  # return from get() once the DOM is ready

    # webdriver_manager may return the THIRD_PARTY_NOTICES file; use the binary.
    chrome_path = ChromeDriverManager().install()
    if "THIRD_PARTY_NOTICES.chromedriver" in chrome_path:
        chrome_path = chrome_path.replace("THIRD_PARTY_NOTICES.chromedriver", "chromedriver")
    os.chmod(chrome_path, 0o755)  # octal: rwxr-xr-x (decimal 755 is a different mode)

    # Use the corrected path; calling install() again would undo the fix above.
    driver = Chrome(service=Service(chrome_path), options=chrome_options)
    try:
        driver.get(url)

        # First resize right after navigation, based on the body's scroll size.
        w = driver.execute_script("return document.body.scrollWidth")
        h = driver.execute_script("return document.body.scrollHeight")
        driver.set_window_size(w, h)

        time.sleep(delay)
        driver.execute_script("window.stop();")

        # Full scrollable page size (CSS px) so the screenshot isn't clipped
        page_w = driver.execute_script(
            "return Math.max(document.documentElement.scrollWidth, document.body.scrollWidth);")
        page_h = driver.execute_script(
            "return Math.max(document.documentElement.scrollHeight, document.body.scrollHeight);")
        driver.set_window_size(page_w, page_h)

        # -----------------------------------
        # 3)  Inject the overlay and take the screenshot
        # -----------------------------------
        driver.execute_script(overlay_js)

        Path(out_dir).mkdir(parents=True, exist_ok=True)
        png = driver.get_screenshot_as_png()
        with open(Path(out_dir) / fname, "wb") as fh:
            fh.write(png)

        # Optional inline preview (e.g. in a Jupyter notebook)
        # display(Image.open(io.BytesIO(png)))
    finally:
        # Release the browser even if navigation or a script call raised.
        driver.quit()

    print(f"Screenshot written → {Path(out_dir)/fname}")


# Batch Evaluation

In [10]:
from pathlib import Path
import re
from urllib.parse import urlparse

# Locations are relative paths; DATASET_MAPPING is the mapping.txt file inside
# the dataset folder that the parsing code below reads line by line.
DATASET_ROOT = Path("evaluation/segmentation/datasets/dataset-popular")   
RESULTS_ROOT = Path("evaluation/segmentation/results")      
DATASET_MAPPING = DATASET_ROOT / "mapping.txt"              

# Matches one whole line of the form  "key": "value"  with optional surrounding
# whitespace and an optional trailing comma; the quoted parts are captured
# (non-greedily) in the named groups `key` and `value`.
pattern = re.compile(r'^\s*"(?P<key>.*?)"\s*:\s*"(?P<value>.*?)"\s*,?$')

# Leading path stripped from a captured value when the value starts with it
prefix_to_remove = '/opt/dataset-popular/'

# Function to normalize the key by stripping protocol and trailing slash
def normalize_key(raw_key: str) -> str:
    """Reduce a URL-like key to ``host + path`` without trailing slashes.

    The scheme, query string and fragment reported by ``urlparse`` are
    dropped; only the network location and the path are kept.
    """
    parts = urlparse(raw_key)
    host_and_path = f"{parts.netloc}{parts.path}"
    return host_and_path.rstrip('/')

# Parse the mapping file into {normalized key: value with the prefix trimmed}.
# Lines that do not match `pattern` are ignored.
dataset = {}
with open(DATASET_MAPPING, 'r', encoding='utf-8') as f:  # explicit encoding: no locale dependence
    for line in f:
        line = line.strip()
        if not line:
            continue

        # A single leading '#' or ':' is dropped and the remainder is parsed
        # like any other line.
        if line.startswith(('#', ':')):
            line = line[1:].strip()
            if not line:
                continue

        m = pattern.match(line)
        if not m:
            continue

        raw_key = m.group('key')
        raw_value = m.group('value')
        # Remove the prefix if present
        if raw_value.startswith(prefix_to_remove):
            raw_value = raw_value[len(prefix_to_remove):]
        # Normalize the key
        key = normalize_key(raw_key)
        dataset[key] = raw_value

print("Dataset dictionary created.")

# One result folder per dataset key (parents=True also creates RESULTS_ROOT).
for site_name in dataset:
    (RESULTS_ROOT / site_name).mkdir(parents=True, exist_ok=True)
    # print(f"Created/verified: {RESULTS_ROOT/site_name}")

print(f"\nAll {len(dataset)} result folders are ready.")

Dataset dictionary created.

All 70 result folders are ready.


In [11]:
# Sites processed by this run; every other entry of `dataset` is skipped.
SITES_TO_RUN = {"eu.real.com"}

success = 0  # pages for which every step below completed without error

for webpage, html_path in dataset.items():
    if webpage not in SITES_TO_RUN:
        continue

    # Build the file:// URL from DATASET_ROOT rather than a machine-specific
    # absolute path. Like RESULTS_ROOT, this assumes the notebook runs from
    # the repository root.
    page_file = (DATASET_ROOT / webpage / webpage / "index.blocks.html").resolve()
    url = page_file.as_uri()
    out_dir = RESULTS_ROOT / webpage / "ground-truth"
    out_dir.mkdir(parents=True, exist_ok=True)
    print(f"\nProcessing {webpage!r} → {out_dir}")

    # 1) extraction
    try:
        node = extraction(url, str(out_dir))
    except Exception as e:
        print(f"[ERROR] extraction failed for {webpage!r}: {e}")
        continue

    # 2) build and save mapping
    # presumably segments.json is written into out_dir by step 1 — verify
    output_path = out_dir / "segmentation_xpath_gt.json"
    try:
        json_path = out_dir / "segments.json"
        elements = load_elements(str(json_path))
        mapping = build_segments(elements)
        with open(output_path, "w", encoding="utf-8") as fh:
            json.dump(mapping, fh, indent=2)
    except Exception as e:
        print(f"[WARN] building/saving segments failed for {webpage!r}: {e}")
        # Steps 3 and 4 read the file this step writes; running them now would
        # fail or silently reuse a stale file from an earlier run.
        continue

    later_steps_ok = True

    # 3) screenshot
    try:
        screenshot_segments(
            url=url,
            seg_json_path=str(output_path),
            out_dir=str(out_dir),
            fname="boxed.gt.png",
        )
    except Exception as e:
        later_steps_ok = False
        print(f"[WARN] screenshot_segments failed for {webpage!r}: {e}")

    # 4) export bboxes
    try:
        export_parent_bboxes(
            url=url,
            seg_json_path=str(output_path),
            out_json_path=str(out_dir / "segmentation_bbox_gt.json"),
            delay=5.0,
        )
    except Exception as e:
        later_steps_ok = False
        print(f"[WARN] export_parent_bboxes failed for {webpage!r}: {e}")

    # Count the page only when nothing above reported a failure.
    if later_steps_ok:
        success += 1

print(f"\nGround Truth Segmentation ran on {success} webpages.")



Processing 'eu.real.com' → evaluation/segmentation/results/eu.real.com/ground-truth
Processing file:///Users/martintang/Desktop/Github/auto-assert/evaluation/segmentation/datasets/dataset-popular/eu.real.com/eu.real.com/index.blocks.html...
Element screenshots saved to evaluation/segmentation/results/eu.real.com/ground-truth
Computing image hashes for 140 segments...
Found 6 exact hash duplicates
Found 11 padding duplicates
Removing 17 duplicate screenshots...
Deduplication complete. Kept 123 of 140 segments.
Screenshot written → evaluation/segmentation/results/eu.real.com/ground-truth/boxed.gt.png
Wrote 11 segments → evaluation/segmentation/results/eu.real.com/ground-truth/segmentation_bbox_gt.json

Ground Truth Segmentation ran on 1 webpages.
