## 该实例针对markdown文件处理格式

In [None]:
import re
import os
import json
import markdown2 
from bs4 import BeautifulSoup
from typing import List, Dict

def detect_language(text: str) -> str:
    """自动检测文本主要语言（中文/英文）"""
    chinese_chars = len(re.findall(r'[\u4e00-\u9fff]', text))
    english_chars = len(re.findall(r'[a-zA-Z]', text))
    return 'zh' if chinese_chars > english_chars else 'en'

def split_text_into_chunks(
        text: str,
        max_tokens: int = 500,
        overlap: int = 50,
        language: str = None
) -> List[str]:
    """将长文本切分为多个chunk，保留单词完整性并支持重叠

    Args:
        text: 输入文本
        max_tokens: 每个chunk的最大长度
        overlap: 相邻chunk之间的重叠长度
        language: 可选强制指定语言('zh'/'en')，默认自动检测

    Returns:
        切分后的chunk列表
    """
    # 清理文本并检测语言
    cleaned = re.sub(r'\s+', ' ', text).strip()
    if not cleaned:
        return []

    lang = language or detect_language(cleaned)

    # 中英文不同的分割逻辑
    if lang == 'zh':
        # 中文按句子分割（保留标点）
        sentences = [s for s in re.split(r'(?<=[。！？.!?])', cleaned) if s]
    else:
        # 英文按句子分割（保留单词完整性）
        sentences = [s for s in re.split(r'(?<=[.!?])\s+', cleaned) if s]

    chunks = []
    current_chunk = ""

    for sentence in sentences:
        # 处理超长句子（超过max_tokens）
        if len(sentence) > max_tokens:
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ""

            if lang == 'zh':
                # 中文按字符分割（保留重叠）
                chunks.extend([sentence[i:i + max_tokens]
                               for i in range(0, len(sentence), max_tokens - overlap)])
            else:
                # 英文按单词分割（保留单词完整性）
                words = sentence.split()
                current_words = []
                for word in words:
                    if len(' '.join(current_words + [word])) <= max_tokens:
                        current_words.append(word)
                    else:
                        chunks.append(' '.join(current_words))
                        # 保留重叠部分（从尾部取单词）
                        overlap_words = current_words[-overlap:] if overlap < len(current_words) else current_words
                        current_words = overlap_words + [word]
                if current_words:
                    chunks.append(' '.join(current_words))
            continue

        # 正常句子处理
        if len(current_chunk) + len(sentence) > max_tokens:
            if chunks:
                # 计算实际重叠长度（不超过剩余空间）
                effective_overlap = min(
                    overlap,
                    len(current_chunk),
                    max_tokens - len(sentence)
                )
                overlap_part = current_chunk[-effective_overlap:]
                chunks.append(current_chunk)
                current_chunk = overlap_part + sentence
            else:
                # 第一个chunk直接超限
                chunks.append(current_chunk)
                current_chunk = sentence
        else:
            current_chunk += sentence

    if current_chunk:
        chunks.append(current_chunk)

    # 后处理：确保所有chunk都不超限
    final_chunks = []
    for chunk in chunks:
        if len(chunk) > max_tokens:
            if lang == 'zh':
                final_chunks.extend([chunk[i:i + max_tokens]
                                     for i in range(0, len(chunk), max_tokens - overlap)])
            else:
                words = chunk.split()
                current_words = []
                for word in words:
                    if len(' '.join(current_words + [word])) <= max_tokens:
                        current_words.append(word)
                    else:
                        final_chunks.append(' '.join(current_words))
                        current_words = [word]
                if current_words:
                    final_chunks.append(' '.join(current_words))
        else:
            final_chunks.append(chunk)

    return final_chunks


In [35]:

def extract_text_from_md(file_path: str) -> str:
    """提取MD文件中的文本内容"""
    with open(file_path, 'r', encoding='utf-8') as f:
        md_content = f.read()
    
    html = markdown2.markdown(md_content)
    soup = BeautifulSoup(html, 'html.parser')
    return soup.get_text()


In [36]:

def save_chunks_to_json(chunks: List[str], output_path: str) -> None:
    """将文本块保存为JSON文件"""
    data = [{"id": i, "text": chunk, "length": len(chunk)} for i, chunk in enumerate(chunks)]
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


In [37]:

def process_md_file(input_path: str, output_path: str, max_tokens: int = 400) -> None:
    """处理单个MD文件，分块并保存为JSON"""
    try:
        text = extract_text_from_md(input_path)
        chunks = split_text_into_chunks(text, max_tokens=max_tokens)
        
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        save_chunks_to_json(chunks, output_path)
        
        print(f"已处理 {input_path}，生成 {len(chunks)} 个文本块")
        print(f"保存到 {output_path}")
    except Exception as e:
        print(f"处理文件 {input_path} 时出错: {e}")


In [38]:

def batch_process_md_files(input_dir: str, output_dir: str, max_tokens: int = 400) -> None:
    """批量处理目录下的所有MD文件"""
    for root, _, files in os.walk(input_dir):
        for file in files:
            if file.lower().endswith('.md'):
                input_path = os.path.join(root, file)
                relative_path = os.path.relpath(input_path, input_dir)
                output_filename = os.path.splitext(relative_path)[0] + '_chunks.json'
                output_path = os.path.join(output_dir, output_filename)
                
                process_md_file(input_path, output_path, max_tokens)


In [39]:

if __name__ == "__main__":
    INPUT_DIR = "./datas/"  # MD文件目录
    OUTPUT_DIR = "./datas/output"  # 输出JSON目录
    
    batch_process_md_files(INPUT_DIR, OUTPUT_DIR,400)

已处理 ./datas/PSDF A Multi-Level Feature-Based Ponzi Scheme Detection Framework for Smart Contracts in Ethereum.md，生成 254 个文本块
保存到 ./datas/output/PSDF A Multi-Level Feature-Based Ponzi Scheme Detection Framework for Smart Contracts in Ethereum_chunks.json
已处理 ./datas/中国文化通史明国卷.md，生成 1008 个文本块
保存到 ./datas/output/中国文化通史明国卷_chunks.json
