In [1]:

import json
import os
import re
from html.parser import HTMLParser

# 1. Parse HTML
# Finished paragraphs in document order; MyHTMLParser appends to this list
# each time it reaches a closing </p> tag.
paragraphs = []
# Text fragments collected since the last closing </p> tag.
current_text = []


class MyHTMLParser(HTMLParser):
    """Collect the text of each ``<p>`` element into the module-level ``paragraphs``.

    Text is buffered in the module-level ``current_text`` list and flushed
    into ``paragraphs`` whenever a ``</p>`` end tag is seen.  Text inside
    ``style``, ``script``, ``title`` and ``xml`` elements is skipped.  The
    buffer is only reset at ``</p>``, so any non-ignored text that appears
    outside a paragraph is carried into the next paragraph that closes.
    """

    def __init__(self):
        super().__init__()
        # Tags to ignore content from
        self.ignore_tags = {'style', 'script', 'title', 'xml'}
        self.current_tag = None
        self.ignore = False

    def handle_starttag(self, tag, attrs):
        """Start skipping text on an ignored tag; turn ``<br>`` into a newline."""
        if tag in self.ignore_tags:
            self.ignore = True
        elif tag == 'br':
            # <br> is a void element: a bare <br> only ever produces a
            # start-tag event, and <br/> reaches this method through
            # handle_startendtag.  Emitting the newline here covers both
            # spellings exactly once.
            current_text.append('\n')
        self.current_tag = tag

    def handle_endtag(self, tag):
        """Stop skipping text on an ignored tag; flush the buffer on ``</p>``."""
        if tag in self.ignore_tags:
            self.ignore = False

        if tag == 'p':
            # Non-breaking spaces become ordinary spaces so that a paragraph
            # made only of them is recognised as empty after stripping.
            raw_text = "".join(current_text).replace('\xa0', ' ').replace('&nbsp;', ' ')
            clean_text = raw_text.strip()
            if clean_text:
                paragraphs.append(clean_text)
            # Reset in place so no `global` rebinding is needed.
            current_text.clear()

    def handle_data(self, data):
        """Buffer text unless it is inside an ignored element."""
        if not self.ignore:
            current_text.append(data)

# Load the exported HTML and run it through the parser; extracted paragraphs
# end up in the module-level ``paragraphs`` list.
with open('content/text/回溯卷1-10.html', 'r', encoding='utf-8') as f:
    content = f.read()

parser = MyHTMLParser()
parser.feed(content)
# Tell the parser the input is complete so that any data it is still
# buffering is processed rather than left pending.
parser.close()

# 2. Extract Chapters
# Completed chapters in document order; each entry is a dict with a 'title'
# string and a 'content' list of paragraph strings.
chapters = []
# Chapter currently being accumulated; stays None until the first chapter
# heading has been seen.
current_chapter = None

# Heuristic test for chapter headings
def is_chapter_title(text):
    """Return True when *text* looks like a chapter heading.

    All whitespace is removed first.  The remaining string counts as a
    heading when it is shorter than 80 characters and contains each of
    '回溯卷', '第' and '章' somewhere in it (position and order are not
    checked).
    """
    compact = re.sub(r'\s+', '', text)
    # Long paragraphs are body text even if they mention the markers.
    if len(compact) >= 80:
        return False
    return all(marker in compact for marker in ('回溯卷', '第', '章'))

for paragraph in paragraphs:
    if not is_chapter_title(paragraph):
        # Body text belongs to the chapter that is currently open; anything
        # that appears before the first heading is dropped.
        if current_chapter is not None:
            current_chapter['content'].append(paragraph)
        continue

    # A heading closes the chapter in progress (if any) and opens a new one.
    if current_chapter is not None:
        chapters.append(current_chapter)
    current_chapter = {'title': paragraph, 'content': []}

# The last chapter has no following heading to close it.
if current_chapter is not None:
    chapters.append(current_chapter)

# 3. Write files
output_dir = 'content/docs/01-reconnection'
# exist_ok avoids the check-then-create race and is a no-op when the
# directory already exists.
os.makedirs(output_dir, exist_ok=True)

# Output file names without the '.mdx' extension, in the order written.
file_mapping = []

chapters_to_process = chapters
print(f"Found {len(chapters_to_process)} chapters.")

for i, chap in enumerate(chapters_to_process):
    # The frontmatter is YAML, so the title is collapsed to a single line and
    # written as a double-quoted scalar.  An unquoted title containing ': ',
    # ' #', a leading special character or a line break would otherwise be
    # misparsed or make the frontmatter invalid.
    title = " ".join(chap['title'].split())
    quoted_title = json.dumps(title, ensure_ascii=False)
    # NOTE(review): the chapter text is written verbatim; presumably
    # characters such as '{' or '<' would be read as MDX syntax -- confirm
    # the source text never contains them.
    content_body = "\n\n".join(chap['content'])

    # Files are numbered by position in the list (1-based, zero-padded),
    # not by the chapter number that appears in the title.
    suffix = str(i + 1).zfill(2)
    filename = f"03-retrospect-{suffix}.mdx"
    filepath = os.path.join(output_dir, filename)

    # Generate MDX
    mdx_content = f"""---
title: {quoted_title}
---

{content_body}
"""
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(mdx_content)

    file_mapping.append(filename.replace('.mdx', ''))
    print(f"Created {filename}: {title}")

print("\nFiles created.")
print(file_mapping)


Found 11 chapters.
Created 03-retrospect-01.mdx: 回溯卷·第一章(陆):象牙塔里的异乡人
Created 03-retrospect-02.mdx: 回溯卷·第二章(沈):新婚之夜的虚假和平
Created 03-retrospect-03.mdx: 回溯卷·第三章(陆):第一次试错式恋爱
Created 03-retrospect-04.mdx: 回溯卷·第四章(沈):无性婚姻的确立
Created 03-retrospect-05.mdx: 回溯卷·第五章(陆):海德格尔的召唤
Created 03-retrospect-06.mdx: 回溯卷·第六章(沈):深夜的交友软件
Created 03-retrospect-07.mdx: 回溯卷·第七章(陆):疫情中的封闭
Created 03-retrospect-08.mdx: 回溯卷·第八章(沈):离婚前夜
Created 03-retrospect-09.mdx: 回溯卷·第九章(陆):来上海的决定
Created 03-retrospect-10.mdx: 回溯卷 · 第 9 章：靠近的渐近线
Created 03-retrospect-11.mdx: 回溯卷 · 第 10 章：系统的归零重置

Files created.
['03-retrospect-01', '03-retrospect-02', '03-retrospect-03', '03-retrospect-04', '03-retrospect-05', '03-retrospect-06', '03-retrospect-07', '03-retrospect-08', '03-retrospect-09', '03-retrospect-10', '03-retrospect-11']
