In [11]:
# flatten_keep_tables_and_headings_drop_cont_section_keep_fix_empty_h.py
import html
import os
import re
from pathlib import Path

from bs4 import (
    BeautifulSoup,
    Comment,
    Declaration,
    Doctype,
    NavigableString,
    ProcessingInstruction,
    Tag,
)

# Source document to flatten, and the output path derived from it: same
# location and stem, with "_preprocess.html" replacing the extension.
IN_PATH = "삼성전자_감사보고서_2014_2024/감사보고서_2024.htm"
OUT_PATH = "{}_preprocess.html".format(os.path.splitext(IN_PATH)[0])

# Elements copied to the output verbatim: tables plus every heading level.
KEEP_TAGS = {"table"} | {f"h{level}" for level in range(1, 7)}

# Elements after whose contents a line break is emitted while flattening.
BLOCK_BREAK = {
    "address", "article", "aside", "blockquote", "div", "footer", "header",
    "li", "ol", "p", "pre", "section", "tr", "ul",
}

# Elements whose contents are never emitted.
SKIP_TAGS = {"noscript", "script", "style"}

def normalize_ws(s: str) -> str:
    """Collapse horizontal whitespace in *s* to single ASCII spaces.

    Every run of spaces, tabs and non-breaking spaces (U+00A0) becomes one
    space. Newlines are left untouched, and the result is not trimmed.
    """
    return re.sub(r"[ \t\xa0]+", " ", s)

# Whole-line pattern for a numbered heading that ends in "계속" (Korean for
# "continued"). Compiled with re.X, so whitespace inside the pattern is
# ignored. In order, the line must consist of:
#   - optional leading whitespace and one optional ASCII quote (" or ')
#   - a 1-3 digit number, optionally followed by "." or ")"
#   - any run of characters other than an ASCII colon or newline (lazy)
#   - an optional comma, then "계" and "속" with optional whitespace between
#   - an optional trailing ":", full-width "：" or ";"
#   - one optional closing ASCII quote and trailing whitespace
RE_P_CONT_HEADING = re.compile(r"""
    ^\s*["']?
    \d{1,3}
    (?:\s*[.)])?
    \s*[^:\n]*?
    ,?\s*계\s*속
    \s*[:：;]?\s*["']?\s*$
""", re.X)

def is_continuation_p(text_line: str) -> bool:
    """Return True when *text_line* is a numbered "..., 계속" (continued) heading.

    The line is whitespace-normalised with ``normalize_ws``, trimmed, and
    stripped of surrounding ASCII double quotes before being matched from its
    start against ``RE_P_CONT_HEADING``.
    """
    # str.strip() treats its argument as a set of characters, so a single '"'
    # is sufficient to remove any number of surrounding double quotes.
    # NOTE(review): typographic quotes (“ ”) are not stripped here - confirm
    # whether quoted headings in the input use them.
    s = normalize_ws(text_line).strip().strip('"')
    return bool(RE_P_CONT_HEADING.match(s))

# Whole-line pattern for a line that contains nothing but "계속" (Korean for
# "continued"). Compiled with re.X, so whitespace inside the pattern is
# ignored. Allowed around the word: leading/trailing whitespace, one optional
# ASCII quote (" or ') on each side, optional whitespace between "계" and
# "속", and one optional trailing ":", full-width "：" or ";".
RE_SOLO_CONT_LINE = re.compile(r"""
    ^\s*["']?
    계\s*속
    \s*[:：;]?\s*
    ["']?\s*$
""", re.X)

def is_solo_continuation(text_line: str) -> bool:
    """Return True when *text_line* consists only of a "계속" (continued) marker.

    The line is whitespace-normalised and trimmed, then matched from its
    start against ``RE_SOLO_CONT_LINE``.
    """
    candidate = normalize_ws(text_line).strip()
    return RE_SOLO_CONT_LINE.match(candidate) is not None

def has_section_class(tag: Tag) -> bool:
    """Return True if any ``class`` value of *tag* contains "section".

    The comparison is case-insensitive and substring-based. A missing or
    empty ``class`` attribute yields False.
    """
    for css_class in tag.get("class") or ():
        if "section" in str(css_class).lower():
            return True
    return False

def preprocess_remove_empty_section_headings(soup: BeautifulSoup):
    """Remove empty h1-h6 elements whose class contains 'section'.

    A heading counts as empty when its stripped text is the empty string.
    Only the heading itself is decomposed; whatever follows it (for example
    the next <p>) stays in the tree. *soup* is modified in place.
    """
    heading_name = re.compile(r"^h[1-6]$", re.I)
    for heading in soup.find_all(heading_name):
        if not has_section_class(heading):
            continue
        if heading.get_text(strip=True):
            continue
        # Drops only the heading element; following siblings are unaffected.
        heading.decompose()

# NavigableString subclasses that hold markup metadata rather than document
# text; their contents must not be emitted as paragraph text.
_NON_CONTENT_STRINGS = (Comment, Declaration, Doctype, ProcessingInstruction)


def traverse_stream(node: Tag):
    """Yield ``("text", str)`` or ``("keep", html)`` tokens in document order.

    Walks the children of *node* recursively:

    * text nodes are whitespace-normalised and yielded as ``"text"`` tokens
      (comments, doctypes, declarations and processing instructions are
      skipped);
    * ``<br>`` yields a ``"text"`` token containing a newline;
    * elements named in ``SKIP_TAGS`` and empty section-class headings
      yield nothing;
    * elements with a 'section' class, and elements named in ``KEEP_TAGS``,
      are yielded whole as ``"keep"`` tokens holding their serialised HTML;
    * any other element is descended into, and a newline ``"text"`` token
      follows its contents when its name is in ``BLOCK_BREAK``.

    A *node* without a ``children`` attribute yields nothing.
    """
    for child in getattr(node, "children", []):
        if isinstance(child, NavigableString):
            # Comment & co. subclass NavigableString, so without this guard
            # the text of HTML comments would leak into the output.
            if isinstance(child, _NON_CONTENT_STRINGS):
                continue
            txt = normalize_ws(str(child))
            if txt:
                yield ("text", txt)
            continue
        if not isinstance(child, Tag):
            continue

        name = (child.name or "").lower()
        if name in SKIP_TAGS:
            continue

        if name == "br":
            yield ("text", "\n")
            continue

        # First: an empty heading with a 'section' class is skipped entirely
        # (it is not preserved), which has the effect of removing it.
        if name in {"h1", "h2", "h3", "h4", "h5", "h6"} and has_section_class(child):
            if not child.get_text(strip=True):
                continue

        # Any element with a 'section' class is preserved whole (outer HTML).
        if has_section_class(child):
            yield ("keep", str(child))
            continue

        # Default preserved elements: h1-h6 and table.
        if name in KEEP_TAGS:
            yield ("keep", str(child))
            continue

        # Everything else contributes only its text.
        yield from traverse_stream(child)

        if name in BLOCK_BREAK:
            yield ("text", "\n")

def flatten_preserve(html_bytes: bytes):
    """Flatten an HTML document into paragraph and kept-element tokens.

    The markup is parsed with the "lxml" parser, elements named in
    ``SKIP_TAGS`` and empty section-class headings are removed, and the body
    (or the whole document when there is no body) is walked with
    ``traverse_stream``.

    Returns a list of 2-tuples in document order:

    * ``("p", line)`` for every non-empty text line that is not a
      "계속" (continued) heading or marker;
    * ``("keep", html)`` for every element preserved verbatim.
    """
    soup = BeautifulSoup(html_bytes, "lxml")

    # Remove skipped elements up front so they cannot appear inside kept HTML.
    for tag in soup.find_all(SKIP_TAGS):
        tag.decompose()

    # Remove empty headings that carry a 'section' class.
    preprocess_remove_empty_section_headings(soup)

    body = soup.body or soup

    out_tokens = []
    pending = []  # text fragments collected since the last kept element

    def flush_pending():
        """Turn the buffered text into ("p", line) tokens and clear the buffer."""
        if not pending:
            return
        text = "".join(pending)
        pending.clear()
        for raw_line in text.split("\n"):
            line = normalize_ws(raw_line).strip()
            if not line:
                continue
            # Drop continuation headings/markers; they carry no content.
            if is_continuation_p(line) or is_solo_continuation(line):
                continue
            out_tokens.append(("p", line))

    # Consume the generator directly; no intermediate token list is needed.
    for kind, payload in traverse_stream(body):
        if kind == "text":
            pending.append(payload)
        else:
            # A kept element ends the current run of text.
            flush_pending()
            out_tokens.append(("keep", payload))

    flush_pending()
    return out_tokens

def write_flat_html(tokens, out_path):
    """Write *tokens* to *out_path* as a minimal UTF-8 HTML page.

    ``("p", text)`` tokens become ``<p>`` elements with their text
    HTML-escaped; every other token's payload is written as-is. Each token
    occupies one line. Returns *out_path*.
    """
    page_head = (
        '<!doctype html><meta charset="utf-8"><title>Preprocessed</title>'
        '<style>body{font-family:system-ui,-apple-system,Segoe UI,Roboto,Apple SD Gothic Neo,Malgun Gothic,sans-serif;line-height:1.5}'
        'p{margin:0 0 .7em} table{margin:1em 0;border-collapse:collapse}'
        'td,th{border:1px solid #ddd;padding:.4em}</style><body>\n'
    )

    def render(token):
        # Paragraph text is escaped; kept HTML is already markup.
        kind, payload = token
        markup = f"<p>{html.escape(payload)}</p>" if kind == "p" else payload
        return markup + "\n"

    with open(out_path, "w", encoding="utf-8") as sink:
        sink.write(page_head)
        sink.writelines(map(render, tokens))
        sink.write("</body>")
    return out_path

if __name__ == "__main__":
    # Flatten the configured input, write the result, and report token counts.
    html_bytes = Path(IN_PATH).read_bytes()
    tokens = flatten_preserve(html_bytes)
    out_file = write_flat_html(tokens, OUT_PATH)
    print("Paragraphs:", sum(kind == "p" for kind, _ in tokens))
    print("Kept:", sum(kind == "keep" for kind, _ in tokens))
    print("Saved:", out_file)

Paragraphs: 410
Kept: 288
Saved: 삼성전자_감사보고서_2014_2024/감사보고서_2024_preprocess.html
