In [2]:
from pathlib import Path
from rapidfuzz import fuzz

page_sep = '<><><><><><>NEWPAGE<><><><><><>'

def load_mmd(path: str | Path,
             page_sep: str = page_sep):
    """Load groundtruth mmd of test_pdf.pdf & split by page
    """
    s = Path(path).read_text(encoding="utf-8")

    # normalize
    s = s.replace("\r\n", "\n").replace("\r", "\n").replace("\\n", "\n")

    # page split
    pages = [p.strip("\n") for p in s.split(page_sep)]

    return pages

# load
# Both model outputs are parsed with the same separator so that pages can be
# paired by list index further down.
pages_base = load_mmd(path="./data/nougat_base.mmd", page_sep=page_sep)
pages_small = load_mmd(path="./data/nougat_small.mmd", page_sep=page_sep)


In [3]:
# Number of page chunks load_mmd produced for the base-model output.
len(pages_base)

20

In [29]:
# Pages are compared position by position, so both outputs must contain the
# same number of pages; indexing by the base length alone would either raise
# a bare IndexError or silently skip trailing pages of the other output.
if len(pages_base) != len(pages_small):
    raise ValueError(
        f"page count mismatch: base={len(pages_base)}, small={len(pages_small)}"
    )

for i, (page_base, page_small) in enumerate(zip(pages_base, pages_small)):
    # fuzz.ratio scores on a 0-100 scale, where 100 means identical strings.
    similarity_score = fuzz.ratio(page_base, page_small)

    print(f"{i} simScore={similarity_score:.4f}")

0 simScore=91.9328
1 simScore=99.5202
2 simScore=99.5558
3 simScore=99.8870
4 simScore=99.0998
5 simScore=99.8329
6 simScore=100.0000
7 simScore=100.0000
8 simScore=99.9602
9 simScore=100.0000
10 simScore=99.6243
11 simScore=99.8723
12 simScore=100.0000
13 simScore=99.8680
14 simScore=98.5988
15 simScore=98.4100
16 simScore=99.9182
17 simScore=100.0000
18 simScore=99.9659
19 simScore=99.7521


In [17]:
# Exact-equality check for page i (i keeps its value from the scoring loop).
# NOTE(review): the original referenced full_text_base / full_text_small, which
# no visible cell defines; presumably they were the per-page lists now named
# pages_base / pages_small -- confirm.
pages_base[i] == pages_small[i]

False

In [22]:
from __future__ import annotations
from dataclasses import dataclass
from collections import Counter
from typing import Dict, List
from difflib import SequenceMatcher
import math
import re

# ---------- text normalization ----------
_ws_re = re.compile(r"\s+")


def normalize(s: str) -> str:
    """Return ``s`` lowercased with every whitespace run collapsed to one space.

    Non-whitespace characters (including math symbols) are left untouched;
    leading and trailing whitespace is removed.
    """
    unified = s.replace("\r\n", "\n").replace("\r", "\n")
    collapsed = _ws_re.sub(" ", unified)
    return collapsed.strip().lower()

def tokens(s: str) -> List[str]:
    """Split the normalized form of ``s`` into whitespace-delimited tokens."""
    cleaned = normalize(s)
    return cleaned.split()

# ---------- metrics ----------
def seqmatch_ratio(a: str, b: str) -> float:
    """Character-level difflib similarity of the normalized inputs.

    Args:
        a: First text.
        b: Second text.

    Returns:
        ``SequenceMatcher.ratio()`` of the two normalized strings, a float in
        [0, 1] where 1.0 means the normalized strings are identical.
    """
    # autojunk=False: with the default heuristic, once the second sequence has
    # 200 or more items difflib treats frequently repeated items as junk. For
    # page-length text compared character by character that covers most common
    # letters and skews the ratio.
    matcher = SequenceMatcher(None, normalize(a), normalize(b), autojunk=False)
    return matcher.ratio()

def jaccard_tokens(a: str, b: str) -> float:
    """Jaccard index of the two inputs' token sets.

    Returns the size of the intersection divided by the size of the union,
    or 1.0 when neither input yields any token.
    """
    left = set(tokens(a))
    right = set(tokens(b))
    combined = left | right
    if not combined:
        # Both token sets are empty: treat two empty texts as identical.
        return 1.0
    return len(left & right) / len(combined)

def char_ngrams(s: str, n: int = 3) -> Counter:
    """Count the overlapping character n-grams of the normalized text.

    Args:
        s: Input text; it is normalized before counting.
        n: Length of each n-gram.

    Returns:
        A Counter mapping each n-gram to its number of occurrences. Text
        shorter than ``n`` is counted once as a whole; empty text gives an
        empty Counter.
    """
    text = normalize(s)
    if len(text) >= n:
        last_start = len(text) - n
        return Counter(text[start:start + n] for start in range(last_start + 1))

    # Too short for a full window: fall back to the text itself, if any.
    counts = Counter()
    if text:
        counts[text] += 1
    return counts

def cosine_char_3gram(a: str, b: str) -> float:
    """Cosine similarity between the character 3-gram count vectors of a and b.

    Args:
        a: First text.
        b: Second text.

    Returns:
        A float in [0, 1]. 1.0 when both inputs produce no n-grams, 0.0 when
        exactly one of them does.
    """
    ca, cb = char_ngrams(a, 3), char_ngrams(b, 3)
    if not ca and not cb:
        return 1.0
    if not ca or not cb:
        return 0.0

    # Only n-grams present in both profiles contribute; a Counter yields 0 for
    # missing keys without inserting them.
    dot = sum(count * cb[gram] for gram, count in ca.items())
    sq_a = sum(v * v for v in ca.values())
    sq_b = sum(v * v for v in cb.values())

    # Multiply the integer squared norms before taking a single square root.
    # sqrt(x) * sqrt(y) rounds twice, so identical inputs could score
    # 0.9999999999999998 instead of exactly 1.0.
    return min(1.0, dot / math.sqrt(sq_a * sq_b))

def levenshtein(a: str, b: str) -> int:
    """Edit distance between the normalized forms of ``a`` and ``b``.

    Wagner–Fischer dynamic programme that keeps only two rows, each sized by
    the shorter string. Insertions, deletions and substitutions all cost 1.
    """
    longer, shorter = normalize(a), normalize(b)
    if longer == shorter:
        return 0
    if len(longer) < len(shorter):
        # Keep the row width tied to the shorter string.
        longer, shorter = shorter, longer

    width = len(shorter)
    prev_row = list(range(width + 1))
    for row_idx, ch_long in enumerate(longer, start=1):
        row = [row_idx] + [0] * width
        for col, ch_short in enumerate(shorter, start=1):
            row[col] = min(
                row[col - 1] + 1,                            # insertion
                prev_row[col] + 1,                           # deletion
                prev_row[col - 1] + (ch_long != ch_short),   # substitution
            )
        prev_row = row
    return prev_row[-1]

def cer_similarity(a: str, b: str) -> float:
    """Similarity defined as ``1 - edit_distance / max_len`` on normalized text.

    ``max_len`` is the length of the longer normalized string; two inputs
    that both normalize to the empty string score 1.0.
    """
    norm_a = normalize(a)
    norm_b = normalize(b)
    longest = max(len(norm_a), len(norm_b))
    if longest:
        return 1.0 - (levenshtein(norm_a, norm_b) / longest)
    return 1.0

# ---------- wrapper ----------
def page_similarity(a: str, b: str) -> Dict[str, float]:
    """Compute every similarity metric for a pair of texts plus a weighted blend.

    Returns:
        A dict with the four individual scores and a ``"composite"`` entry
        that weights them 0.25 / 0.25 / 0.30 / 0.20 in that order.
    """
    scores: Dict[str, float] = {
        "seqmatch_ratio": seqmatch_ratio(a, b),
        "jaccard_tokens": jaccard_tokens(a, b),
        "cosine_char_3gram": cosine_char_3gram(a, b),
        "cer_similarity": cer_similarity(a, b),
    }
    # Weighted blend; the weights are free to tune.
    scores["composite"] = (
        0.25 * scores["seqmatch_ratio"]
        + 0.25 * scores["jaccard_tokens"]
        + 0.30 * scores["cosine_char_3gram"]
        + 0.20 * scores["cer_similarity"]
    )
    return scores

# ---------- example usage ----------
# full_text_base[i] and full_text_small[i] are your per-page strings
def compare_pages(full_text_base: List[str], full_text_small: List[str], i: int = 10):
    """Print every similarity score for page ``i`` of the two page lists.

    Args:
        full_text_base: Per-page strings of the first document.
        full_text_small: Per-page strings of the second document.
        i: Index of the page to compare in both lists.
    """
    report = page_similarity(full_text_base[i], full_text_small[i])
    lines = [f"Page {i} similarity:"]
    lines.extend(f"  {metric:>18}: {value:.5f}" for metric, value in report.items())
    print("\n".join(lines))

In [None]:
# compare_pages requires both per-page lists; calling it with no arguments
# raises TypeError. The page index falls back to the function's default.
compare_pages(pages_base, pages_small)