In [None]:
from openai import OpenAI
from pathlib import Path
import os

# Report whether OPENAI_API_KEY is visible to this process (only a boolean is
# printed, never the key itself), then build the client from that same variable.
print("API Key 存在？", bool(os.getenv("OPENAI_API_KEY")))
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


In [None]:
from pathlib import Path
from typing import Iterable, Iterator, List

# Model and file paths; update these to match your own setup.
model = "gpt-4o-mini"
# model = "gpt-5-2025-08-07"
# Rebinds `client`, this time without explicit arguments.
# NOTE(review): presumably the SDK then falls back to OPENAI_API_KEY from the
# environment — confirm against the installed openai package.
client = OpenAI()

# Try to locate the project root automatically: probe the current directory,
# its "work" subfolder, the parent directory and the parent's "work" subfolder,
# in that order.
possible_roots = [
    Path.cwd(),
    Path.cwd() / "work",
    Path.cwd().parent,
    Path.cwd().parent / "work",
]
base_dir = None
for root in possible_roots:
    # The instructions file doubles as the marker that identifies the root.
    candidate = root / "_Material/_命令/整段命令.md"
    if candidate.exists():
        base_dir = root
        break

# Fail early, listing every location that was probed.
if base_dir is None:
    raise FileNotFoundError(
        "在以下目錄中找不到 _Material/_命令/整段命令.md：{}".format(
            [str(p) for p in possible_roots]
        )
    )

# Inputs: system-prompt instructions, the transcript to clean up, and the
# files/directories that supply supplementary context.
instructions_path = base_dir / "_Material/_命令/整段命令.md"
transcript_path = base_dir / "mkdocs/My_Notes/課_四34_所得税法四/逐字稿/W04_0925.md"
context_paths: List[Path] = [
    base_dir / "_Material/法源/所得稅法四",
    # base_dir / "mkdocs/My_Notes/課_四34_所得税法四",
]
# Destination of the merged, cleaned-up Markdown.
output_path = base_dir / "notebooks/clean_notes_output.md"

# Size limits (all measured in characters, not tokens).
MAX_CONTEXT_CHARS = 60_000  # cap on the accumulated supplementary-material sections
TRANSCRIPT_CHUNK_CHARS = 12_000  # chunk size handed to chunk_text
TRANSCRIPT_CHUNK_OVERLAP = 400  # characters shared between consecutive chunks

# Both files are read eagerly as UTF-8; a missing file raises right here.
instructions = instructions_path.read_text(encoding="utf-8")
raw_transcript = transcript_path.read_text(encoding="utf-8")


def load_context_text(path: Path) -> str:
    """Return the text of one supplementary-material file.

    PDF files (``.pdf`` suffix, case-insensitive) are read page by page with
    ``pypdf``; each page is prefixed with a ``[第 N 頁]`` header and pages are
    separated by a blank line. Any other file is read as UTF-8 text.

    Args:
        path: File to load.

    Returns:
        The extracted text.

    Raises:
        ImportError: ``path`` is a PDF and ``pypdf`` is not installed.
        ValueError: ``path`` is a PDF from which no text at all could be
            extracted (for example, no pages or no extractable text).
    """
    if path.suffix.lower() == ".pdf":
        try:
            # Imported lazily so the dependency is only needed for PDF input.
            from pypdf import PdfReader
        except ImportError as exc:
            raise ImportError("需要先安裝 pypdf（pip install pypdf）才能讀取 PDF 補充資料。") from exc

        reader = PdfReader(str(path))
        page_texts = [(page.extract_text() or "").strip() for page in reader.pages]
        # Check the extracted text itself: the page headers added below are
        # never empty, so testing the combined string would always pass for
        # any PDF that has at least one page.
        if not any(page_texts):
            raise ValueError(f"PDF 補充資料未擷取到文字：{path}")
        return "\n\n".join(
            f"[第 {idx} 頁]\n{text}" for idx, text in enumerate(page_texts, start=1)
        ).strip()

    try:
        return path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        # Best effort: keep whatever decodes cleanly and drop the invalid bytes.
        return path.read_text(encoding="utf-8", errors="ignore")


def iter_context_files(paths: Iterable[Path]) -> Iterator[Path]:
    """Yield every supplementary-material file found under ``paths``.

    Relative entries are resolved against the module-level ``base_dir``. A file
    entry is yielded if its suffix is ``.md``, ``.txt`` or ``.pdf``
    (case-insensitive). A directory entry is walked recursively in sorted
    order. Anything whose path contains "逐字稿" is skipped, as is anything
    located at or below a dot-prefixed name inside a walked directory (hidden
    files, and files inside hidden directories such as ``.ipynb_checkpoints``).

    Args:
        paths: Files and/or directories to scan.

    Yields:
        Paths of the matching files.

    Raises:
        FileNotFoundError: An entry in ``paths`` does not exist.
    """
    allowed_suffixes = {".md", ".txt", ".pdf"}
    for raw_path in paths:
        resolved = raw_path if raw_path.is_absolute() else (base_dir / raw_path)
        if not resolved.exists():
            raise FileNotFoundError(f"找不到補充資料：{resolved}")
        # Skip paths that contain "逐字稿" (transcripts are not context).
        if "逐字稿" in str(resolved):
            continue
        if resolved.is_file():
            if resolved.suffix.lower() in allowed_suffixes:
                yield resolved
            continue
        for child in sorted(resolved.rglob('*')):
            # Same exclusion for anything nested below a "逐字稿" folder.
            if "逐字稿" in str(child):
                continue
            if child.suffix.lower() not in allowed_suffixes or not child.is_file():
                continue
            # Look at every component below the walked root, not only the file
            # name, so files inside hidden directories are excluded as well.
            if any(part.startswith('.') for part in child.relative_to(resolved).parts):
                continue
            yield child


def chunk_text(text: str, chunk_size: int, overlap: int) -> List[str]:
    """Split ``text`` into consecutive chunks that share ``overlap`` characters.

    The text is stripped of surrounding whitespace first. Every chunk except
    possibly the last holds exactly ``chunk_size`` characters, and each chunk
    starts ``chunk_size - overlap`` characters after the previous one.

    Args:
        text: Text to split.
        chunk_size: Maximum characters per chunk. A value <= 0 disables
            splitting and returns the whole stripped text as one chunk.
        overlap: Characters repeated at the start of the next chunk. Values
            outside ``[0, chunk_size - 1]`` are clamped into that range so the
            window always moves forward and no text is skipped.

    Returns:
        The list of chunks; empty when ``text`` is blank.
    """
    text = text.strip()
    if not text:
        return []
    if chunk_size <= 0:
        return [text]

    # An overlap >= chunk_size would never advance the window (endless loop);
    # a negative overlap would jump over characters. Clamp both cases.
    overlap = max(0, min(overlap, chunk_size - 1))
    step = chunk_size - overlap

    chunks = []
    n = len(text)
    for start in range(0, n, step):
        end = min(n, start + chunk_size)
        chunks.append(text[start:end])
        if end == n:
            # The tail is covered; a further window would only repeat overlap.
            break
    return chunks


def estimate_tokens(chars: int) -> int:
    """Return a rough token estimate for ``chars`` characters.

    Uses a flat four-characters-per-token ratio, rounded down, and never
    reports fewer than one token.
    NOTE(review): the ratio for CJK text may differ noticeably — confirm
    against the ``usage`` figures returned by the API.
    """
    approx = chars // 4
    return approx if approx > 1 else 1


# --- Gather the supplementary material ---
# Sections are accepted first-come, first-served until MAX_CONTEXT_CHARS would
# be exceeded; a file that does not fit is recorded and skipped, while later
# (smaller) files may still be accepted.
context_separator = "\n\n"
context_sections = []
context_total_chars = 0
skipped_contexts = []
for ctx_file in iter_context_files(context_paths):
    ctx_text = load_context_text(ctx_file).strip()
    if not ctx_text:
        continue
    try:
        rel_name = ctx_file.relative_to(base_dir)
    except ValueError:
        # The file lives outside the project root; fall back to the full path.
        rel_name = ctx_file
    # Number by accepted sections so the labels in the prompt stay consecutive
    # even when earlier files were empty or skipped.
    section_no = len(context_sections) + 1
    section = f"[補充資料 {section_no}: {rel_name}]\n{ctx_text}"
    # Count the separator that join() inserts, so that context_total_chars
    # always equals len(context_blob) and the cap is a real upper bound.
    separator_chars = len(context_separator) if context_sections else 0
    projected = context_total_chars + separator_chars + len(section)
    if projected > MAX_CONTEXT_CHARS:
        skipped_contexts.append(str(rel_name))
        continue
    context_sections.append(section)
    context_total_chars = projected

context_blob = context_separator.join(context_sections)
transcript_chunks = chunk_text(raw_transcript, TRANSCRIPT_CHUNK_CHARS, TRANSCRIPT_CHUNK_OVERLAP)
if not transcript_chunks:
    raise ValueError("逐字稿空白或未成功讀取內容。")

# --- Summary of what will be sent ---
print("專案根目錄：", base_dir)
print("使用模型：", model)
print("已載入的補充資料檔數：", len(context_sections))
if skipped_contexts:
    print("⚠️ 已超過補充資料上限，未納入的檔案：")
    for name in skipped_contexts:
        print("   - ", name)
print("逐字稿分段數：", len(transcript_chunks))

context_tokens = estimate_tokens(len(context_blob))
print("補充資料估計 tokens：", context_tokens)

# Per-chunk estimates cover the transcript text only; the supplementary
# material is reported separately above.
chunk_token_estimates = []
running_tokens = 0
for idx, chunk_text_value in enumerate(transcript_chunks, start=1):
    chunk_tokens = estimate_tokens(len(chunk_text_value))
    chunk_token_estimates.append(chunk_tokens)
    running_tokens += chunk_tokens
    print(f"第 {idx} 段估計 tokens：{chunk_tokens} (累計 {running_tokens})")

print("\n如果程式執行到一段約 1,000 字就中斷，大概對應 "
      f"{estimate_tokens(1_000)} tokens；只會計入當前請求已送出的 tokens。")
print("要得知實際扣款，請在 API 回傳物件中查看 usage.prompt_tokens 與 usage.completion_tokens。")


# --- Process the transcript chunk by chunk ---
# Each request carries the instructions as the system message and, as the user
# message, the supplementary material (if any) followed by one transcript
# chunk. Replies are streamed to stdout and collected for the final export.
chunk_results: List[str] = []
for idx, chunk_text_value in enumerate(transcript_chunks, start=1):
    user_parts: List[str] = []
    if context_blob:
        user_parts.append(context_blob)
    user_parts.append(f"[逐字稿（第 {idx} 段/共 {len(transcript_chunks)} 段）]\n{chunk_text_value}")
    user_content = "\n\n".join(user_parts)

    messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": user_content},
    ]

    print(f"\n▶️ 正在處理第 {idx} 段...")
    stream = client.chat.completions.create(
        model=model,
        messages=messages,
        stream=True,
    )

    chunk_pieces: List[str] = []
    for chunk in stream:
        # A stream event may carry an empty `choices` list; indexing it
        # unconditionally would raise IndexError and lose the whole run.
        if not chunk.choices:
            continue
        choice = chunk.choices[0]
        delta = getattr(choice, "delta", None)
        if not delta:
            continue
        piece = getattr(delta, "content", None)
        if not piece:
            continue
        chunk_pieces.append(piece)
        # Echo the text as it arrives so progress is visible in the notebook.
        print(piece, end="", flush=True)

    chunk_output = "".join(chunk_pieces)
    print(f"\n\n✅ 第 {idx} 段完成")
    chunk_results.append(chunk_output)

# --- Export: join the per-chunk replies and write them out as UTF-8 ---
print("\n全部段落已完成，正在匯出...")
final_markdown = "\n\n".join(chunk_results)
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(final_markdown, encoding="utf-8")
print(f"整理後的逐字稿已寫入：{output_path}")
# Trailing bare expression: shows the result as the notebook cell's output.
final_markdown