
1. **LoadingService**: 新增 `load_epub`，支持提取 EPUB 文本与图片元数据。
2. **ParsingService**: 新增 `extract_tables_from_pdf`，集成 `pdfplumber` 实现精确的表格结构化提取。
3. **ChunkingService**: 增强 `chunk_text`，新增对结构化表格数据的专用分块逻辑。

In [1]:
import sys
import os
import json
from IPython.display import display, JSON, Image

# Resolve the project layout from the notebook's working directory: the
# project root is taken to be the parent of the current directory.
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, os.pardir))
backend_path = os.path.join(project_root, 'backend')
asset_dir = os.path.join(project_root, 'asset')

# Sample documents used by the cells below.
epub_path = os.path.join(asset_dir, "投资第一课 (孟岩) (Z-Library).epub")
pdf_path = os.path.join(asset_dir, "billionaires_page-1-5.pdf")

# Register the backend directory so that `services.*` can be imported; the
# membership test keeps re-runs of this cell from appending duplicates.
if backend_path not in sys.path:
    sys.path.append(backend_path)

print(f"Backend path added: {backend_path}")

# These imports must stay below the sys.path update above.
from services.loading_service import LoadingService
from services.parsing_service import ParsingService
from services.chunking_service import ChunkingService

# One instance of each service, shared by the remaining cells.
loading_service = LoadingService()
parsing_service = ParsingService()
chunking_service = ChunkingService()

print(f"EPUB Path: {epub_path}")
print(f"PDF Path: {pdf_path}")

Backend path added: d:\My Data\Rag\rag-project01-framework\backend


  from .autonotebook import tqdm as notebook_tqdm


EPUB Path: d:\My Data\Rag\rag-project01-framework\asset\投资第一课 (孟岩) (Z-Library).epub
PDF Path: d:\My Data\Rag\rag-project01-framework\asset\billionaires_page-1-5.pdf


In [None]:
# Load the sample EPUB with `LoadingService.load_epub` and print a summary of
# the result: text length, page-map size, image metadata and a text preview.
# NOTE(review): the original note says load_epub unpacks the EPUB, extracts
# all HTML text and detects image resources -- confirm in LoadingService.
if not os.path.exists(epub_path):
    print("EPUB 文件不存在，跳过此步骤。")
else:
    print("正在加载 EPUB 文件...")
    # 1. Load the EPUB; the result is sliced and measured as text below.
    epub_text = loading_service.load_epub(epub_path)

    # 2. Text statistics. `current_page_map` is read off the service after the
    #    load (presumably one entry per chapter/file -- confirm).
    print(f"\n加载完成！提取文本长度: {len(epub_text)} 字符")
    print(f"包含章节/文件数: {len(loading_service.current_page_map)}")

    # 3. Image metadata; falls back to an empty list when the service has no
    #    `current_images` attribute.
    images = getattr(loading_service, 'current_images', [])
    print(f"\n发现图片资源: {len(images)} 张")
    if images:
        print("前 5 张图片信息:")
        print(json.dumps(images[:5], indent=2, ensure_ascii=False))

    # 4. Preview the first 500 characters of the extracted text.
    print("\n--- 文本内容预览 (前 500 字符) ---")
    print(epub_text[:500] + "...")

正在加载 EPUB 文件...

加载完成！提取文本长度: 169396 字符
包含章节/文件数: 18

发现图片资源: 171 张
前 5 张图片信息:
[
  {
    "filename": "OEBPS/Images/0000_logo-square-727a22ab924433bf624c1e7aa0b05dae.png",
    "path": "OEBPS/Images/0000_logo-square-727a22ab924433bf624c1e7aa0b05dae.png",
    "type": "image"
  },
  {
    "filename": "OEBPS/Images/0001_01ERETD53FSSZRW8TQC7AGRW10.png",
    "path": "OEBPS/Images/0001_01ERETD53FSSZRW8TQC7AGRW10.png",
    "type": "image"
  },
  {
    "filename": "OEBPS/Images/0002_01EC9BB814C3KA6T3BCVD2P0AV.jpg",
    "path": "OEBPS/Images/0002_01EC9BB814C3KA6T3BCVD2P0AV.jpg",
    "type": "image"
  },
  {
    "filename": "OEBPS/Images/0003_badge-employee-3e0ee5bdffa77d712d5cc795c20c2aa4.png",
    "path": "OEBPS/Images/0003_badge-employee-3e0ee5bdffa77d712d5cc795c20c2aa4.png",
    "type": "image"
  },
  {
    "filename": "OEBPS/Images/0004_01EDT6NYMVM7HVFXZ66M5RN4B5.jpg",
    "path": "OEBPS/Images/0004_01EDT6NYMVM7HVFXZ66M5RN4B5.jpg",
    "type": "image"
  }
]

--- 文本内容预览 (前 500 字符) ---
我是谁？
导读 

In [None]:
# Extract tables from the sample PDF with
# `ParsingService.extract_tables_from_pdf` and preview the first one.
# NOTE(review): the original note says the extraction is backed by
# `pdfplumber` rather than plain-text parsing -- confirm in ParsingService.
if not os.path.exists(pdf_path):
    print("PDF 文件不存在，跳过此步骤。")
else:
    print("正在解析 PDF 表格...")
    # 1. Extract the tables; `tables` is reused by the chunking cell below.
    tables = parsing_service.extract_tables_from_pdf(pdf_path)

    print(f"\n解析完成！共提取 {len(tables)} 个表格")

    if tables:
        # Show the shape of the first table. Each entry is read here through
        # its 'page', 'metadata' ('rows' / 'cols') and 'data' keys.
        first_table = tables[0]
        print(f"\n--- 表格 1 (Page {first_table['page']}) 预览 ---")
        print(f"维度: {first_table['metadata']['rows']} 行 x {first_table['metadata']['cols']} 列")

        # Print the first three rows as-is.
        print("前 3 行数据:")
        for row in first_table['data'][:3]:
            print(row)

正在解析 PDF 表格...

解析完成！共提取 7 个表格

--- 表格 1 (Page 1) 预览 ---
维度: 15 行 x 1 列
前 3 行数据:
["List of the world's billionaires, ranked in order of net worth"]
["The net worth of the world's billionaires increased from\nless than US$1 trillion in 2000 to over $7 trillion in 2015."]
['Publication details']


In [None]:
# Chunk the extracted tables with `ChunkingService.chunk_text`, preview the
# first chunk and write the whole result to a JSON file.
# NOTE(review): the original note says chunk_text recognises structured table
# input and applies a dedicated `by_table` strategy -- confirm in
# ChunkingService.
if not os.path.exists(pdf_path) or not tables:
    print("跳过分块步骤。")
else:
    print("正在对表格数据进行分块...")

    # 1. Document-level metadata handed to the chunker.
    doc_metadata = {
        "filename": os.path.basename(pdf_path),
        "loading_method": "pdfplumber"
    }

    # 2. Run the chunker. Per the original author's notes (unverified here):
    #    `text` is ignored for table chunking, and `method` is effectively
    #    overridden by the type of `page_map`; it is passed for readability.
    chunked_doc = chunking_service.chunk_text(
        text="",
        method="by_table",
        metadata=doc_metadata,
        page_map=tables  # structured table data from the previous cell
    )

    # 3. Report the chunk count and show the first chunk as JSON.
    print(f"\n分块完成！生成 {chunked_doc['total_chunks']} 个 Chunk")

    print("\n--- 最终文档 JSON 结构 (预览第一个 Chunk) ---")
    print(json.dumps(chunked_doc['chunks'][0], indent=2, ensure_ascii=False))

    # 4. Persist the full result in the current working directory.
    output_file = "homework_5_result.json"
    with open(output_file, 'w', encoding='utf-8') as fh:
        json.dump(chunked_doc, fh, indent=2, ensure_ascii=False)
    print(f"\n完整结果已保存至: {output_file}")

正在对表格数据进行分块...

分块完成！生成 7 个 Chunk

--- 最终文档 JSON 结构 (预览第一个 Chunk) ---
{
  "content": "Table Data (Page 1):\n[[\"List of the world's billionaires, ranked in order of net worth\"], [\"The net worth of the world's billionaires increased from\\nless than US$1 trillion in 2000 to over $7 trillion in 2015.\"], [\"Publication details\"], [\"Publisher Whale Media Investments\\nForbes family\"], [\"Publication Forbes\"], [\"First published March 1987[1]\"], [\"Latest publication April 4, 2023\"], [\"Current list details (2023)[2]\"], [\"Wealthiest Bernard Arnault\"], [\"Net worth (1st) US$211 billion\"], [\"Number of 2,640 (from 2668)\\nbillionaires\"], [\"Total list net worth US$12.2 trillion (from US$ 12.7\\nvalue trillion)\"], [\"Number of women 337\"], [\"New members to the 150\\nlist\"], [\"Forbes: The World's Billionaires website (https://www.forb\\nes.com/billionaires/)\"]]",
  "metadata": {
    "chunk_id": 1,
    "page_number": 1,
    "page_range": "1",
    "type": "table",
    "table_i