# Extract HTML Snippets for Segmentation Leaves
This notebook:
1. Loads **`segmentation.json`** (the XPath segmentation spec).
2. Parses **`page.html`**.
3. For every parent XPath and each of its leaf XPaths, serialises the matched HTML.
4. Dumps the result to **`extracted_segments.json`** in the required structure.

In [1]:
import json, os
from lxml import etree, html
from dotenv import load_dotenv

load_dotenv()
URL = os.getenv('URL')
print(URL)

FOLDER = URL.rstrip("/").replace("://","_").replace("/","_")

SEG_FILE = os.path.join(FOLDER, "segmentation.json") 
HTML_FILE = os.path.join(FOLDER, "page.html")  
OUTPUT_FILE = os.path.join(FOLDER, 'extracted_html_segments.json')

https://github.com/facebook


In [2]:
# Load segmentation spec
with open(SEG_FILE, 'r', encoding='utf-8') as f:
    segmentation = json.load(f)
print(f'Loaded {{len(segmentation)}} parent nodes from {SEG_FILE}')

Loaded {len(segmentation)} parent nodes from https_github.com_facebook/segmentation.json


In [3]:
# Parse HTML page
with open(HTML_FILE, 'r', encoding='utf-8') as f:
    page_text = f.read()
tree = html.fromstring(page_text)
print('HTML parsed successfully')

HTML parsed successfully


In [4]:
# 3) Extraction helpers & main loop
def serialize(node):
    return etree.tostring(node, pretty_print=True, method='html', encoding='unicode')

output = {}

for parent_xpath, info in segmentation.items():
    parent_elem = tree.xpath(parent_xpath)
    parent_html = serialize(parent_elem[0]) if parent_elem else None

    raw_leaves = sorted(info.get('leaves', []), key=len)   # shortest first
    outer_leaves = []
    for p in raw_leaves:
        if not any(p.startswith(ol + '/') for ol in outer_leaves):
            outer_leaves.append(p)

    leaves_serialised = []
    for leaf_xpath in outer_leaves:
        leaf_elem = tree.xpath(leaf_xpath)
        # strip common prefix to shorten
        rel_xpath = leaf_xpath[len(parent_xpath):] if leaf_xpath.startswith(parent_xpath) else leaf_xpath
        if rel_xpath.startswith('/'):
            rel_xpath = rel_xpath[1:]
        leaves_serialised.append({
            'xpath': rel_xpath,
            'html': serialize(leaf_elem[0]) if leaf_elem else None
        })

    output[parent_xpath] = {
        #'parent_html': parent_html,
        'leaves': leaves_serialised
    }

print('Extraction complete')

Extraction complete


In [5]:
# Dump results to JSON
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    json.dump(output, f, indent=2, ensure_ascii=False)
print(f'Wrote extracted data to {OUTPUT_FILE}')

Wrote extracted data to https_github.com_facebook/extracted_html_segments.json
