In [2]:
!pip install playwright transformers sentence-transformers beautifulsoup4 lxml
!playwright install-deps
!playwright install

import torch
import requests
from bs4 import BeautifulSoup, Comment
from playwright.async_api import async_playwright
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from PIL import Image

# =========================
#   1) CAPTURAR PANTALLA
# =========================

async def capture_screenshot(url, path):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page(viewport={"width":1280, "height":800})
        await page.goto(url, wait_until="networkidle", timeout=30000)
        await page.screenshot(path=path, full_page=True)
        await browser.close()

# =========================
#   2) EMBEDDING VISUAL CLIP
# =========================

from transformers import CLIPProcessor, CLIPModel

_clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
_clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def embed_image_clip(path):
    image = Image.open(path).convert("RGB")
    inputs = _clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = _clip_model.get_image_features(**inputs)
    return outputs.squeeze()

# =========================
#   3) EMBEDDING DE TEXTO
# =========================

_text_model = SentenceTransformer("all-MiniLM-L6-v2")

def extract_visible_text(html):
    soup = BeautifulSoup(html, "lxml")
    for script in soup(["script", "style"]):
        script.extract()
    texts = soup.stripped_strings
    return " ".join(texts)

def embed_text_sentence_transformer(text):
    emb = _text_model.encode(text, convert_to_tensor=True)
    return emb

# ======================================================
#   4) EMBEDDING ESTRUCTURAL HTML - MarkupLM SIMPLIFICADO
# ======================================================

tokenizer = AutoTokenizer.from_pretrained("microsoft/markuplm-base")
markuplm = AutoModel.from_pretrained("microsoft/markuplm-base")

def extract_nodes(html):
    """
    Extrae nodos simples (solo tags) sin XPath real.
    MarkupLM funcionará pero en modo degradado.
    """
    soup = BeautifulSoup(html, "lxml")
    nodes = [tag.name for tag in soup.find_all()]
    xpaths = ["x"] * len(nodes)  # placeholder requerido por MarkupLM
    return nodes, xpaths

def embed_html_markuplm_from_url(url):
    html = requests.get(url, timeout=10).text
    nodes, xpaths = extract_nodes(html)

    if len(nodes) == 0:
        return torch.zeros(768)

    # MarkupLM espera listas de listas
    inputs = tokenizer(
        [nodes],
        xpaths=[xpaths],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    with torch.no_grad():
        out = markuplm(**inputs).last_hidden_state.mean(dim=1).squeeze()

    return out

# =========================
#   5) FUSIÓN HÍBRIDA
# =========================

def embed_site_from_screenshot(url, screenshot_path):
    print(f"Procesando {url}")

    # Visual
    emb_img = embed_image_clip(screenshot_path)

    # Texto
    html = requests.get(url, timeout=10).text
    text = extract_visible_text(html)
    emb_text = embed_text_sentence_transformer(text)

    # HTML estructural
    emb_html = embed_html_markuplm_from_url(url)

    # Fusionar (normalizado)
    emb_img = emb_img / emb_img.norm()
    emb_text = emb_text / emb_text.norm()
    emb_html = emb_html / emb_html.norm()

    fused = torch.cat([
        0.6 * emb_img,
        0.3 * emb_text,
        0.1 * emb_html
    ], dim=0)

    fused = fused / fused.norm()
    return fused, emb_img, emb_text, emb_html

# =========================
#   6) DEMO COMPLETA
# =========================

# capturar pantallas
await capture_screenshot("https://users.dcc.uchile.cl/~patorres/", "p1.png")
await capture_screenshot("http://example.com", "p2.png")

# embeddings
emb1, emb_img_1, emb_text_1, emb_html_1 = embed_site_from_screenshot("https://users.dcc.uchile.cl/~patorres/", "p1.png")
emb2, emb_img_2, emb_text_2, emb_html_2 = embed_site_from_screenshot("http://example.com", "p2.png")

# similitud
sim = torch.nn.functional.cosine_similarity(emb1.unsqueeze(0), emb2.unsqueeze(0)).item()
print("Similitud híbrida final:", sim)


Installing dependencies...
Hit:1 https://cli.github.com/packages stable InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:9 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [83.4 kB]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:12 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,153 kB]
Hit:13 https://ppa.launchpad

In [3]:
def normalize(x):
    return x / (x.norm() + 1e-12)

def similarity(a, b):
    return torch.nn.functional.cosine_similarity(
        a.unsqueeze(0), b.unsqueeze(0)
    ).item()

def explain_similarity(embA, embB):
    """
    embA y embB deben ser diccionarios:
    {
        "hyb": tensor,
        "img": tensor,
        "text": tensor,
        "html": tensor
    }
    """
    # Normalizar
    vA = normalize(embA["img"])
    tA = normalize(embA["text"])
    hA = normalize(embA["html"])

    vB = normalize(embB["img"])
    tB = normalize(embB["text"])
    hB = normalize(embB["html"])

    # Similitudes individuales
    sim_v = similarity(vA, vB)
    sim_t = similarity(tA, tB)
    sim_h = similarity(hA, hB)

    sim_final = similarity(embA["hyb"], embB["hyb"])

    # Reporte
    return {
        "sim_visual": sim_v,
        "sim_textual": sim_t,
        "sim_html": sim_h,
        "sim_global": sim_final
    }


In [4]:
emb_site1 = {
    "hyb": emb1,
    "img": emb_img_1,
    "text": emb_text_1,
    "html": emb_html_1
}

emb_site2 = {
    "hyb": emb2,
    "img": emb_img_2,
    "text": emb_text_2,
    "html": emb_html_2
}

report = explain_similarity(emb_site1, emb_site2)
report


{'sim_visual': 0.33429020643234253,
 'sim_textual': 0.01665947586297989,
 'sim_html': 0.9309614896774292,
 'sim_global': 0.28511616587638855}