In [5]:
!pip install playwright transformers sentence-transformers beautifulsoup4 lxml
!playwright install-deps
!playwright install

import torch
import requests
from bs4 import BeautifulSoup, Comment
from playwright.async_api import async_playwright
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from PIL import Image

# =========================
#   1) CAPTURAR PANTALLA
# =========================

async def capture_screenshot(url, path):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page(viewport={"width":1280, "height":800})
        await page.goto(url, wait_until="networkidle", timeout=30000)
        await page.screenshot(path=path, full_page=True)
        await browser.close()

# =========================
#   2) EMBEDDING VISUAL CLIP
# =========================

from transformers import CLIPProcessor, CLIPModel

_clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
_clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def embed_image_clip(path):
    image = Image.open(path).convert("RGB")
    inputs = _clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = _clip_model.get_image_features(**inputs)
    return outputs.squeeze()

# =========================
#   3) EMBEDDING DE TEXTO
# =========================

_text_model = SentenceTransformer("all-MiniLM-L6-v2")

def extract_visible_text(html):
    soup = BeautifulSoup(html, "lxml")
    for script in soup(["script", "style"]):
        script.extract()
    texts = soup.stripped_strings
    return " ".join(texts)

def embed_text_sentence_transformer(text):
    emb = _text_model.encode(text, convert_to_tensor=True)
    return emb

# ======================================================
#   4) EMBEDDING ESTRUCTURAL HTML - MarkupLM SIMPLIFICADO
# ======================================================

tokenizer = AutoTokenizer.from_pretrained("microsoft/markuplm-base")
markuplm = AutoModel.from_pretrained("microsoft/markuplm-base")

def extract_nodes(html):
    """
    Extrae nodos simples (solo tags) sin XPath real.
    MarkupLM funcionará pero en modo degradado.
    """
    soup = BeautifulSoup(html, "lxml")
    nodes = [tag.name for tag in soup.find_all()]
    xpaths = ["x"] * len(nodes)  # placeholder requerido por MarkupLM
    return nodes, xpaths

def embed_html_markuplm_from_url(url):
    html = requests.get(url, timeout=10).text
    nodes, xpaths = extract_nodes(html)

    if len(nodes) == 0:
        return torch.zeros(768)

    # MarkupLM espera listas de listas
    inputs = tokenizer(
        [nodes],
        xpaths=[xpaths],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    with torch.no_grad():
        out = markuplm(**inputs).last_hidden_state.mean(dim=1).squeeze()

    return out

# =========================
#   5) FUSIÓN HÍBRIDA
# =========================

def embed_site_from_screenshot(url, screenshot_path):
    print(f"Procesando {url}")

    # Visual
    emb_img = embed_image_clip(screenshot_path)

    # Texto
    html = requests.get(url, timeout=10).text
    text = extract_visible_text(html)
    emb_text = embed_text_sentence_transformer(text)

    # HTML estructural
    emb_html = embed_html_markuplm_from_url(url)

    # Fusionar (normalizado)
    emb_img = emb_img / emb_img.norm()
    emb_text = emb_text / emb_text.norm()
    emb_html = emb_html / emb_html.norm()

    fused = torch.cat([
        0.6 * emb_img,
        0.3 * emb_text,
        0.1 * emb_html
    ], dim=0)

    fused = fused / fused.norm()
    return fused, emb_img, emb_text, emb_html

# =========================
#   6) DEMO COMPLETA
# =========================

# capturar pantallas
await capture_screenshot("https://users.dcc.uchile.cl/~patorres/", "p1.png")
await capture_screenshot("http://example.com", "p2.png")

# embeddings
emb1, emb_img_1, emb_text_1, emb_html_1 = embed_site_from_screenshot("https://users.dcc.uchile.cl/~patorres/", "p1.png")
emb2, emb_img_2, emb_text_2, emb_html_2 = embed_site_from_screenshot("http://example.com", "p2.png")

# similitud
sim = torch.nn.functional.cosine_similarity(emb1.unsqueeze(0), emb2.unsqueeze(0)).item()
print("Similitud híbrida final:", sim)


Collecting playwright
  Downloading playwright-1.56.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.56.0-py3-none-manylinux1_x86_64.whl (46.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.3/46.3 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyee-13.0.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright
Successfully installed playwright-1.56.0 pyee-13.0.0
Installing dependencies...
Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:2 https://cli.github.com/packages stable InRelease
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy In

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/55.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/810 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/277M [00:00<?, ?B/s]

Procesando https://users.dcc.uchile.cl/~patorres/
Procesando http://example.com
Similitud híbrida final: 0.27782896161079407


In [6]:
def normalize(x):
    return x / (x.norm() + 1e-12)

def similarity(a, b):
    return torch.nn.functional.cosine_similarity(
        a.unsqueeze(0), b.unsqueeze(0)
    ).item()

def explain_similarity(embA, embB):
    """
    embA y embB deben ser diccionarios:
    {
        "hyb": tensor,
        "img": tensor,
        "text": tensor,
        "html": tensor
    }
    """
    # Normalizar
    vA = normalize(embA["img"])
    tA = normalize(embA["text"])
    hA = normalize(embA["html"])

    vB = normalize(embB["img"])
    tB = normalize(embB["text"])
    hB = normalize(embB["html"])

    # Similitudes individuales
    sim_v = similarity(vA, vB)
    sim_t = similarity(tA, tB)
    sim_h = similarity(hA, hB)

    sim_final = similarity(embA["hyb"], embB["hyb"])

    # Reporte
    return {
        "sim_visual": sim_v,
        "sim_textual": sim_t,
        "sim_html": sim_h,
        "sim_global": sim_final
    }


In [7]:
emb_site1 = {
    "hyb": emb1,
    "img": emb_img_1,
    "text": emb_text_1,
    "html": emb_html_1
}

emb_site2 = {
    "hyb": emb2,
    "img": emb_img_2,
    "text": emb_text_2,
    "html": emb_html_2
}

report = explain_similarity(emb_site1, emb_site2)
report


{'sim_visual': 0.3249787390232086,
 'sim_textual': 0.01665947586297989,
 'sim_html': 0.9309614896774292,
 'sim_global': 0.27782896161079407}

In [8]:
# capturar pantallas
await capture_screenshot("https://users.dcc.uchile.cl/~patorres/", "p1.png")
await capture_screenshot("http://example.com", "p2.png")

# embeddings
emb1, emb_img_1, emb_text_1, emb_html_1 = embed_site_from_screenshot("https://users.dcc.uchile.cl/~patorres/", "p1.png")
emb2, emb_img_2, emb_text_2, emb_html_2 = embed_site_from_screenshot("http://example.com", "p2.png")

# similitud
sim = torch.nn.functional.cosine_similarity(emb1.unsqueeze(0), emb2.unsqueeze(0)).item()
print("Similitud híbrida final:", sim)


Procesando https://users.dcc.uchile.cl/~patorres/
Procesando http://example.com
Similitud híbrida final: 0.27898624539375305


In [9]:
!pip install ipywidgets --quiet
from IPython.display import display
import ipywidgets as widgets
import torch


# -----------------------------------------------------------
# CAPA EMBEDDINGS
# -----------------------------------------------------------
async def gen_embed(url):
  await capture_screenshot(url, f'{url}.png')
  return embed_site_from_screenshot(url, f'{url}.png')


# -----------------------------------------------------------
# WIDGETS
# -----------------------------------------------------------
url1_box = widgets.Text(
    value="https://example.com",
    placeholder="https://...",
    description="Sitio A:",
    layout=widgets.Layout(width='600px')
)

url2_box = widgets.Text(
    value="https://wikipedia.org",
    placeholder="https://...",
    description="Sitio B:",
    layout=widgets.Layout(width='600px')
)

btn_compare = widgets.Button(
    description="Comparar",
    button_style="primary"
)

output = widgets.Output()

# -----------------------------------------------------------
# LÓGICA DEL BOTÓN
# -----------------------------------------------------------
def on_compare_clicked(b):
    output.clear_output()

    url1 = url1_box.value
    url2 = url2_box.value

    with output:
        print("Procesando...\n")

        emb1 = gen_embed(url1)
        emb2 = gen_embed(url2)

        print("🔍 Resultados de similitud:\n")

        report = explain_similarity(emb_site1, emb_site2)

        print("SIM Visual:     ", round(report['sim_visual'],2))
        print("SIM Textual:  ", round(report['sim_textual'],2))
        print("SIM HTML:  ", round(report['sim_html'],2))
        print("SIM Total:   ", round(report['sim_global'],2))

btn_compare.on_click(on_compare_clicked)

# -----------------------------------------------------------
# MOSTRAR UI
# -----------------------------------------------------------
display(url1_box, url2_box, btn_compare, output)


Text(value='https://example.com', description='Sitio A:', layout=Layout(width='600px'), placeholder='https://.…

Text(value='https://wikipedia.org', description='Sitio B:', layout=Layout(width='600px'), placeholder='https:/…

Button(button_style='primary', description='Comparar', style=ButtonStyle())

Output()

  local_value = callback(*args, **kwargs)
