In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from selenium.webdriver.common.by import By

from visca.browser import (
    create_driver,
    ensure_page_loaded,
    capture_full_page_screenshot
)
from visca.element_extractor import (
    extract_elements_from_driver,
    save_elements
)
from visca.dedup import deduplicate_screenshots
from visca.virtual_node import (
    build_dom_tree,
    VirtualNode
)
from visca.segment import (
    tag_multiset,
    jaccard_distance,
    subtree_size,
    calculate_psi_avg,
    calculate_psi_sum,
    gather_instances,
    ascii_tree,
)

from visca.llm.gemini import create_model
from visca.prompts import (
    PAGE_CONTEXT_EXTRACTION_SYSTEM_PROMPT,
    CLASSIFICATION_AND_CONTEXT_PROMPT,
    # CONTEXTUAL_DESCRIPTION_PROMPT,
    COMPONENT_GENERATION_PROMPT,
    # LIST_COMPONENT_GENERATION_PROMPT
)
from visca.llm_processing import (
    _get_ancestor_context,
    classify_and_describe_candidates,
    transform_candidate
)

# Initialization

In [3]:
import time
import os
import io
import time
import json
from typing import Tuple
import json, textwrap, time, io
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from PIL import Image   # only if you want to display inline in a notebook

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.remote.webdriver import WebDriver
from selenium.webdriver.remote.webelement import WebElement

# DOM Element Extraction

In [4]:

def extraction(url: str, out_dir: str):
    """Load ``url`` in headless Chrome and build the reduced DOM tree for it.

    Steps, in order:
      1. start a headless Chrome with a fixed 1920x1080 viewport,
      2. open ``url``, wait a fixed 5 seconds, then stop any further loading,
      3. extract the DOM elements and save them under ``out_dir`` via
         ``save_elements``,
      4. keep only the elements that carry a ``'screenshot'`` key,
      5. de-duplicate them and build the tree with ``build_dom_tree``.

    Args:
        url: Address of the page to open (passed verbatim to ``driver.get``).
        out_dir: Directory handed to ``save_elements`` as ``result_dir``.

    Returns:
        A ``(reduced_tree, driver)`` tuple. The driver is returned still
        running so the caller can keep using the loaded page; the caller is
        responsible for calling ``driver.quit()`` when done.

    Raises:
        Whatever Selenium or the ``visca`` helpers raise. If anything fails
        after the browser has been started, the browser is shut down before
        the exception propagates.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920,1080")
    chrome_options.add_argument("--hide-scrollbars")  # Hide scrollbars to avoid affecting layout
    chrome_options.add_argument("--force-device-scale-factor=1")  # Force known scale factor
    chrome_options.add_argument("--disable-gpu")
    chrome_options.page_load_strategy = "eager"  # do not block on every sub-resource

    # webdriver_manager may hand back the THIRD_PARTY_NOTICES file that sits
    # next to the real binary; point at the binary instead.
    chrome_path = ChromeDriverManager().install()
    if "THIRD_PARTY_NOTICES.chromedriver" in chrome_path:
        chrome_path = chrome_path.replace("THIRD_PARTY_NOTICES.chromedriver", "chromedriver")
    # Mode must be an octal literal: decimal 755 is 0o1363, not rwxr-xr-x.
    os.chmod(chrome_path, 0o755)

    # Use the corrected path; calling install() again would discard the fix above.
    driver = Chrome(
        service=Service(chrome_path),
        options=chrome_options
    )

    try:
        driver.get(url)

        # Fixed settle time, then halt whatever is still loading.
        time.sleep(5)
        driver.execute_script("window.stop();")

        dom_elements = extract_elements_from_driver(driver)

        dom_elements_with_screenshot = save_elements(
            driver=driver,
            result_dir=out_dir,
            dom_elements=dom_elements
        )

        # Only elements that actually carry a screenshot entry are kept.
        dom_elements_with_screenshot = [
            element for element in dom_elements_with_screenshot
            if 'screenshot' in element
        ]

        deduplicated_elements = deduplicate_screenshots(dom_elements_with_screenshot)

        reduced_tree = build_dom_tree(deduplicated_elements)
    except BaseException:
        # Includes KeyboardInterrupt: never leave an orphaned Chrome process
        # behind when the caller will not receive the driver handle.
        driver.quit()
        raise

    return reduced_tree, driver


# Segmentation

### Sum

In [5]:
def segmentation(reduced_tree: VirtualNode, out_dir: str):
    """Segment ``reduced_tree`` and write each segment's descendant leaves to disk.

    Runs ``calculate_psi_sum`` on the tree, collects the segment roots with
    ``gather_instances``, and then, for every segment root XPath, lists the
    XPaths from ``{out_dir}/segments.json`` that lie strictly below it.

    The result is written to ``{out_dir}/segmentation_xpath_aa.json`` as a
    mapping ``root_xpath -> {"count": ..., "leaves": [xpath, ...]}``, where
    ``count`` is the value ``gather_instances`` associates with that root
    (presumably the subtree size -- confirm against ``visca.segment``).

    Args:
        reduced_tree: Root of the tree to segment.
        out_dir: Directory that already contains ``segments.json`` and that
            receives the output file.
    """
    calculate_psi_sum(reduced_tree)
    segment_roots = gather_instances(reduced_tree)

    print("Number of Segments: ", len(segment_roots.keys()))

    with open(f'{out_dir}/segments.json', 'r', encoding='utf-8') as src:
        leaf_records = json.load(src)

    # For each segment root, collect the XPaths of its strict descendants.
    details_by_root = {}
    for root_xpath, count in segment_roots.items():
        child_prefix = root_xpath + '/'
        descendant_xpaths = []
        for record in leaf_records:
            xpath = record['xpath']
            if xpath == root_xpath:
                # the root itself is not one of its own leaves
                continue
            if xpath.startswith(child_prefix):
                descendant_xpaths.append(xpath)

        details_by_root[root_xpath] = {
            'count': count,
            'leaves': descendant_xpaths,
        }

    with open(f"{out_dir}/segmentation_xpath_aa.json", 'w', encoding='utf-8') as dst:
        json.dump(details_by_root, dst, indent=2, ensure_ascii=False)

# File Setup

In [6]:
from pathlib import Path
import re
from urllib.parse import urlparse

# All three paths are relative, so they resolve against the kernel's current
# working directory.
# Directory holding the dataset, including its mapping file.
DATASET_ROOT = Path("evaluation/segmentation/datasets/dataset-popular")   
# Directory under which one result folder per dataset entry is created.
RESULTS_ROOT = Path("evaluation/segmentation/results")      
# Text file that is parsed line by line into the `dataset` dictionary.
DATASET_MAPPING = DATASET_ROOT / "mapping.txt"              

# Regex pattern to extract key/value pairs.
# Matches one whole line of the form  "key": "value"  with an optional
# trailing comma; both quoted parts are captured non-greedily as the named
# groups `key` and `value`.
pattern = re.compile(r'^\s*"(?P<key>.*?)"\s*:\s*"(?P<value>.*?)"\s*,?$')

# Prefix to remove from each value (only stripped when the value starts with it)
prefix_to_remove = '/opt/dataset-popular/'

# Function to normalize the key by stripping protocol and trailing slash
def normalize_key(raw_key: str) -> str:
    """Reduce a URL to ``host + path`` with no trailing slashes.

    The scheme, query string and fragment are dropped. A value without a
    scheme has an empty network location, so it is returned as its path part
    with trailing slashes removed.

    Args:
        raw_key: URL (or scheme-less address) as found in the mapping file.

    Returns:
        The concatenated network location and path, right-stripped of ``/``.
    """
    parts = urlparse(raw_key)
    return f"{parts.netloc}{parts.path}".rstrip('/')

# Build `dataset`: normalized site key -> mapped path with the dataset prefix
# trimmed. Lines that do not match `pattern` are skipped silently.
dataset = {}
with open(DATASET_MAPPING, 'r') as f:
    for line in f:
        line = line.strip()

        # A single leading '#' or ':' is peeled off; what remains is still
        # parsed like any other line.
        if line.startswith(('#', ':')):
            line = line[1:].strip()

        if not line:
            continue

        m = pattern.match(line)
        if m is None:
            continue

        raw_key, raw_value = m.group('key', 'value')

        # Remove the prefix if present
        if raw_value.startswith(prefix_to_remove):
            raw_value = raw_value[len(prefix_to_remove):]

        # Normalize the key
        key = normalize_key(raw_key)
        dataset[key] = raw_value

print("Dataset dictionary created.")

# One result folder per dataset entry; existing folders are left untouched.
for site_name, site_paths in dataset.items():
    site_results_dir = RESULTS_ROOT / site_name
    site_results_dir.mkdir(parents=True, exist_ok=True)

print(f"\nAll {len(dataset)} result folders are ready.")

In [7]:
# Run the full pipeline (extraction -> segmentation -> screenshot -> LLM page
# context -> LLM segment classification) for every selected dataset page and
# write the artefacts to RESULTS_ROOT/<site>/llm.
RESULTS_ROOT = Path("evaluation/segmentation/results")

# Sites to process; entries that are commented out are skipped.
TARGET_HTML = [
    # "abcnews.go.com",
    # "eu.real.com",
    "speedtest.net",
    "www.aaas.org",
    "www.alistapart.com",
    "www.alz.org",
    "www.amnesty.org",
    "www.break.com",
    "www.cato.org",
    "www.cpsc.gov",
    "www.drugs.com",
    "www.economist.com",
    "www.euronews.com",
    "www.factcheck.org",
    "www.fodors.com",
    "www.fda.gov",
    "www.foxnews.com",
    "www.freetranslation.com",
    "www.geocaching.com",
    "www.gnu.org",
    "www.house.gov",
    "www.irs.gov",
    "www.kbb.com",
    "www.lonelyplanet.com",
]

# Only the keys of `dataset` are needed; the mapped paths are not used here.
for webpage in dataset:
    if webpage not in TARGET_HTML:
        continue

    # Build the file:// URL from DATASET_ROOT instead of a machine-specific
    # absolute path. Like RESULTS_ROOT, it resolves against the current
    # working directory.
    url = (DATASET_ROOT / webpage / webpage / "index.dom.html").resolve().as_uri()
    # Named out_dir (not `dir`) so the builtin dir() is not shadowed.
    out_dir = f"{RESULTS_ROOT}/{webpage}/llm"

    print(f"Processing {webpage} | {url} \n")

    # Extraction (returns a running browser that must be closed below)
    node, driver = extraction(url, out_dir)

    try:
        # Segmentation
        segmentation(node, out_dir)

        # Visualization
        screenshot = capture_full_page_screenshot(driver)
        screenshot.save(f'{out_dir}/screenshot.png')

        # Page Classification
        page_context_model = create_model(PAGE_CONTEXT_EXTRACTION_SYSTEM_PROMPT)
        page_context = page_context_model(file=f'{out_dir}/screenshot.png').text
        print(page_context)

        # Segmentation Classification
        # A fresh model per page, so the stats written below cover this page only.
        classification_model = create_model(
            CLASSIFICATION_AND_CONTEXT_PROMPT,
            settings={
                'temperature': 0
            }
        )

        # Fresh memory per page; handed to the classifier as `memory`.
        MEMORY = {}

        classified_tree, run_log = classify_and_describe_candidates(
            root=node,
            classification_model=classification_model,
            page_context=page_context,
            memory=MEMORY,
            segment_json_path=f"{out_dir}/segmentation_xpath_aa.json"
        )

        # write them out
        with open(f'{out_dir}/run_log.json', 'w', encoding='utf-8') as f:
            json.dump(run_log, f, indent=2, ensure_ascii=False)

        with open(f'{out_dir}/llm_evaluation.json', 'w', encoding='utf-8') as f:
            json.dump(classification_model.stats, f, indent=2, ensure_ascii=False)

        print("Wrote run_log.json and llm_evaluation.json")
    finally:
        # Always close the browser, including on errors or a keyboard
        # interrupt, so pages do not pile up headless Chrome processes.
        driver.quit()



KeyboardInterrupt: 