In [None]:
import time
import json
from pathlib import Path
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from PIL import Image
import shutil, pathlib
import json, io, textwrap

URL = 'https://github.com/facebook'

folder_name = URL.rstrip("/").replace("://","_").replace("/","_")
out_dir = pathlib.Path(folder_name)

# --- nuke existing contents, keep the folder itself ---
if out_dir.exists():
    shutil.rmtree(out_dir)   
out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
def vips(pDoc: int, webpage: str, out_dir: str):
    chrome_options = Options()
    chrome_options.set_capability('goog:loggingPrefs', {'browser': 'ALL'})
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--hide-scrollbars")                # Hide scrollbars to avoid affecting layout
    # chrome_options.add_argument("--window-size=1200,830")
    # chrome_options.add_argument("--force-device-scale-factor=1")  # Force known scale factor
    # chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--allow-file-access-from-files")   # local files
    chrome_options.add_argument("--disable-web-security")           # (CORS for <file://>)

    driver = Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options
    )

    # Change input path to include file://
    if not webpage.startswith(("http://", "https://", "file://")):
        webpage = Path(webpage).expanduser().resolve().as_uri()

    driver.get(webpage)

    w = driver.execute_script("return document.body.scrollWidth")
    h = driver.execute_script("return document.body.scrollHeight")
    driver.set_window_size(w, h)

    # Wait for document ready state
    WebDriverWait(driver, 10).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )

    # Wait a bit more for any JS-triggered rendering to complete
    time.sleep(2)

    vips_js = None

    with open('vips-es6.js', 'r', encoding='utf-8') as f:
        vips_js = f.read()

    try:
        output = driver.execute_script(f'''
        {vips_js}
        const tester = new VipsTester();
        return tester.main('output.json', {pDoc});
        ''')
        
        print(output)
        
        # Normal path: write the segmentation JSON to disk
        with open(f'{out_dir}/output_{pDoc}.json', 'w', encoding='utf-8') as f:
            json.dump(json.loads(output), f)
        # Give a very brief moment for logs to process if needed (often not required)
        # time.sleep(0.5)

        # 4. Retrieve the logs
        # 'browser' corresponds to the key used in loggingPrefs
        logs = driver.get_log('browser')

        print("\n--- Captured Browser Logs ---")
        if logs:
            for log_entry in logs:
                # log_entry is a dictionary like:
                # {'level': 'INFO', 'message': '...', 'source': 'console-api', 'timestamp': 167...}
                print(f"Level: {log_entry['level']}, Message: {log_entry['message']}")
                # print(log_entry) # Print the full entry if you want more details
        else:
            print("No browser logs captured.")
        print("---------------------------\n")

    except Exception as e:
        print(f"An error occurred: {e}")

    # Load VIPS Segmentation
    with open(f'{out_dir}/output_{pDoc}.json') as f:
        vips = json.load(f)["segmentations"]["vips"]

    # Compute full page size in CSS px from the polygons themselves
    all_pts = [pt
            for mps in vips
            for mp  in mps
            for poly in mp
            for pt  in poly]

    page_width  = max(p[0] for p in all_pts)
    page_height = max(p[1] for p in all_pts)

    driver.set_window_size(page_width, page_height)   # exact CSS space

    # Inject overlay <div>s directly into the DOM 
    vips_js_payload = textwrap.dedent(f"""
        (function() {{
        const vips = {json.dumps(vips)};

        /* add CSS once */
        const style = document.createElement('style');
        style.textContent = `
            .vips-overlay {{
            position:absolute;
            background:rgba(0,128,255,.25);  /* 25 % opacity blue */
            outline:2px solid red;
            z-index:2147483647;
            pointer-events:none;
            }}`;
        document.head.appendChild(style);

        /* draw each VIPS block as an overlay */
        vips.forEach(mps =>
            mps.forEach(mp =>
            mp.forEach(poly => {{
                const xs = poly.map(p => p[0]);
                const ys = poly.map(p => p[1]);
                const box = document.createElement('div');
                box.className = 'vips-overlay';
                box.style.left   = Math.min(...xs) + 'px';
                box.style.top    = Math.min(...ys) + 'px';
                box.style.width  = (Math.max(...xs) - Math.min(...xs)) + 'px';
                box.style.height = (Math.max(...ys) - Math.min(...ys)) + 'px';
                document.body.appendChild(box);
            }})));
        }})();""")

    driver.execute_script(vips_js_payload)

    # Capture the screenshot
    png = driver.get_screenshot_as_png()
    with open(f'{out_dir}/boxed_{pDoc}.png', "wb") as fh:
        fh.write(png)

    #  Show it in-line
    # display(Image.open(io.BytesIO(png)))

    driver.quit()


### Scripts

In [14]:
vips(1, 
     "evaluation/segmentation/datasets/dataset-popular/abcnews.go.com/abcnews.go.com/index.dom.html", 
     "datasets/dataset-popular/abcnews.go.com")

{"id":"output.json","height":236,"width":1184,"segmentations":{"vips":[[[[[0,0],[0,178],[1184,178],[1184,0],[0,0]]]],[[[[0,198],[0,236],[1184,236],[1184,198],[0,198]]]]]}}
An error occurred: [Errno 2] No such file or directory: 'datasets/dataset-popular/abcnews.go.com/output_1.json'


FileNotFoundError: [Errno 2] No such file or directory: 'datasets/dataset-popular/abcnews.go.com/output_1.json'

In [None]:
for i in range(1, 12):
    vips(i, URL, out_dir)

NameError: name 'vips' is not defined

In [None]:
for i in range(1,12):
    with open(f'{out_dir}/output_{i}.json') as f:
        vips = json.load(f)["segmentations"]["vips"]
        print(len(vips))

6
6
6
6
6
6
14
84
86
136
136
