In [2]:
# 1) Install dependencies.
# NOTE(review): every package is installed unpinned (several with -U), so a re-run
# may pull different versions than the ones this notebook was last executed with.
%pip -q install -U langchain langchain-openai langchain-text-splitters langchain-chroma chromadb tiktoken langchain-community
%pip -q install pypdf python-docx bs4 chardet
%pip -q install -U rank_bm25 sudachipy sudachidict_full sudachidict_core
%pip -q install -U langchain-classic
# 2) Export the OpenAI API key (read from Colab user secrets) as an environment variable.
import os
from google.colab import userdata
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')
# LangChain/LangSmith settings: tracing flag, API endpoint, API key (also from Colab
# secrets) and the project name the runs are filed under.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"] = userdata.get("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = "agent-book"

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.8/20.8 MB[0m [31m97.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m91.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00

In [26]:
# == Mount Google Drive ==
from google.colab import drive
drive.mount('/content/drive')

# == Data locations (all derived from BASE_DIR) ==
BASE_DIR = "/content/drive/MyDrive/Colab Notebooks/rag_week1"
DOC_DIR = os.path.join(BASE_DIR, "docs")             # folder holding the .md documents
METADATA_CSV = os.path.join(BASE_DIR, "metadata.csv")
QA_JSONL = os.path.join(BASE_DIR, "qa.jsonl")
INDEX_DIR = os.path.join(BASE_DIR, "chroma_index")   # persisted Chroma index
RESULTS_DIR = os.path.join(BASE_DIR, "results")      # evaluation CSV output

# Only the results folder is created here; the other paths are inputs.
os.makedirs(RESULTS_DIR, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [70]:
from langsmith import Client
import json, datetime
import pandas as pd

client = Client()

# == Load metadata.csv ==
# Required columns; "id" is the lookup key, the rest become the per-document metadata.
expected_cols = ["id","path","title","category","effective_date","confidentiality","department","product_type"]

# The separator is a regex: a tab (with optional surrounding whitespace), a run of
# two or more whitespace characters, or a comma. Missing cells become "".
df_meta = pd.read_csv(
    METADATA_CSV,
    sep=r"\s*\t\s*|\s{2,}|,",
    engine="python",
)
df_meta = df_meta.fillna("")

missing = [col for col in expected_cols if col not in df_meta.columns]
if len(missing) > 0:
    raise ValueError(f"metadata.csv に必要カラムが不足しています: {missing}\n 取得カラム: {list(df_meta.columns)}")

# id -> {field: value}; every value is stringified and stripped.
meta_by_id = {
    str(row["id"]).strip(): {
        field: str(row[field]).strip()
        for field in expected_cols
        if field != "id"
    }
    for _, row in df_meta.iterrows()
}

print(f"metadata 読込: {len(meta_by_id)} 件")

# == Load qa.jsonl ==
# One JSON object per non-empty line with the keys qid / question / answers /
# must_have_source_id. Any malformed line aborts with a ValueError that names the line.
qa_items = []
# utf-8-sig also accepts plain UTF-8 and drops a leading BOM, which would otherwise
# make the first line fail to parse as JSON.
with open(QA_JSONL, "r", encoding="utf-8-sig") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError as e:
            # Chain explicitly so the original parse error stays visible as the cause.
            raise ValueError(f"qa.jsonl {line_no} 行目がJSONとして読めません: {e}") from e
        # A valid JSON scalar/array is still not a usable record.
        if not isinstance(obj, dict):
            raise ValueError(f"qa.jsonl {line_no} 行目がJSONオブジェクトではありません: {obj}")
        # Required keys
        for k in ["qid","question","answers","must_have_source_id"]:
            if k not in obj:
                raise ValueError(f"qa.jsonl {line_no} 行目に {k} がありません: {obj}")
        # Type coercion / normalisation
        qid = str(obj["qid"]).strip()
        question = str(obj["question"]).strip()
        answers = obj["answers"]
        if answers is None:
            answers = []
        elif not isinstance(answers, (list, tuple)):
            # A single answer given as a string or number (e.g. 0.18) becomes a one-item list
            # instead of crashing on iteration.
            answers = [answers]
        answers = [str(a).strip() for a in answers if str(a).strip()]
        must_id = str(obj["must_have_source_id"]).strip()

        qa_items.append({
            "qid": qid,
            "question": question,
            "answers": answers,
            "must_id": must_id
        })

print(f"qa 読込: {len(qa_items)} 件")

# == Referential integrity check ==
# IDs referenced by the QA items but absent from metadata.csv only trigger a warning;
# the items are still uploaded (they can be corrected later in LangSmith).
unknown_ids = sorted({qa["must_id"] for qa in qa_items}.difference(meta_by_id))
if len(unknown_ids) > 0:
    print("WARNING: metadata.csv に存在しない must_have_source_id があります:", unknown_ids)

# == Dataset name (a timestamp suffix keeps names from colliding) ==
timestamp = f"{datetime.datetime.now():%Y%m%d-%H%M%S}"
dataset_name = "agent-book_rag_week1_v0_" + timestamp
print("作成する Dataset:", dataset_name)

# There is no existence check and no try/except around the creation call: the
# timestamp suffix above is the only thing guarding against a duplicate name.
dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="RAG Week1 QA dataset (agent-book) from /rag_week1",
)
print("Dataset 作成 OK:", dataset.id)

# == Upload every QA item as a LangSmith Example ==
# inputs   : the question only
# outputs  : all accepted answers (used by the exact/partial-match evaluators)
# metadata : qid, the required source id, and a copy of that document's metadata
#            (empty strings when the id is unknown) for slicing and debugging
created = 0
for qa in qa_items:
    doc_meta = meta_by_id.get(qa["must_id"], {})

    inputs = {"question": qa["question"]}
    outputs = {"answers": qa["answers"]}
    metadata = {
        "qid": qa["qid"],
        "must_have_source_id": qa["must_id"],
        "project": "agent-book",
        "doc": {
            field: doc_meta.get(field, "")
            for field in (
                "path",
                "title",
                "category",
                "effective_date",
                "confidentiality",
                "department",
                "product_type",
            )
        },
    }

    client.create_example(
        dataset_id=dataset.id,
        inputs=inputs,
        outputs=outputs,
        metadata=metadata,
    )
    created += 1

print(f"登録完了: {created} 件を Dataset[{dataset_name}] に投入しました。")

# Next steps, printed for the notebook user.
print("\n== 次の手順 ==")
print("1) LangSmith → Datasets → 上記 dataset_name を開き、Examples が期待通りか確認")
print("2) フェーズ4: 評価実行（RAGパイプラインに対して当データセットを一括評価）へ進む")


metadata 読込: 12 件
qa 読込: 5 件
作成する Dataset: agent-book_rag_week1_v0_20251106-084752
Dataset 作成 OK: 2b71b55b-c4c1-4506-b8b8-91808b2bb8f8
登録完了: 5 件を Dataset[agent-book_rag_week1_v0_20251106-084752] に投入しました。

== 次の手順 ==
1) LangSmith → Datasets → 上記 dataset_name を開き、Examples が期待通りか確認
2) フェーズ4: 評価実行（RAGパイプラインに対して当データセットを一括評価）へ進む


In [41]:
# DOCID→実ファイルパスの解決（フォルダ内を探索）
# 大文字・小文字どちらでも認識するよう (?i) を追加

import os, re, glob, pandas as pd

# Map each DOCID to the markdown file that carries it as a filename prefix.
doc_path_by_id = {}

# Filenames must start with "DOC" + 3 or more digits + "_" (case-insensitive),
# e.g. DOC001_..., doc005_...
pattern = re.compile(r"(?i)(doc\d{3,})_")

# glob order is filesystem-dependent; sort so repeated runs behave the same
# (including which file wins when two files share a DOCID).
paths = sorted(glob.glob(f"{DOC_DIR}/*.md"))
print(f"[INFO] 読み込み対象 .md ファイル: {len(paths)} 件 @ {DOC_DIR}")

for md_path in paths:
    fname = os.path.basename(md_path)
    m = pattern.match(fname)
    if not m:
        print(f"[WARN] DOCID判定できず → {fname}")
        continue
    # Normalise to upper case: "doc006" -> "DOC006"
    doc_id = m.group(1).upper()
    if doc_id in doc_path_by_id:
        # Previously a duplicate silently replaced the earlier file.
        print(f"[WARN] DOCID重複 → {doc_id}: {os.path.basename(doc_path_by_id[doc_id])} を {fname} で上書き")
    doc_path_by_id[doc_id] = md_path

print(f"[INFO] DOCIDを解決できたファイル: {len(doc_path_by_id)} 件")

# Explicit columns keep sort_values from raising KeyError when nothing was resolved.
df_doc_paths = (
    pd.DataFrame(
        [{"doc_id": k, "doc_path": v} for k, v in doc_path_by_id.items()],
        columns=["doc_id", "doc_path"],
    )
    .sort_values("doc_id")
)

display(df_doc_paths)

[INFO] 読み込み対象 .md ファイル: 12 件 @ /content/drive/MyDrive/Colab Notebooks/rag_week1/docs
[INFO] DOCIDを解決できたファイル: 12 件


Unnamed: 0,doc_id,doc_path
6,DOC001,/content/drive/MyDrive/Colab Notebooks/rag_wee...
10,DOC002,/content/drive/MyDrive/Colab Notebooks/rag_wee...
7,DOC003,/content/drive/MyDrive/Colab Notebooks/rag_wee...
1,DOC004,/content/drive/MyDrive/Colab Notebooks/rag_wee...
5,DOC005,/content/drive/MyDrive/Colab Notebooks/rag_wee...
9,DOC006,/content/drive/MyDrive/Colab Notebooks/rag_wee...
4,DOC007,/content/drive/MyDrive/Colab Notebooks/rag_wee...
11,DOC008,/content/drive/MyDrive/Colab Notebooks/rag_wee...
8,DOC009,/content/drive/MyDrive/Colab Notebooks/rag_wee...
2,DOC010,/content/drive/MyDrive/Colab Notebooks/rag_wee...


In [42]:
# ==== 修正版: ドキュメント→チャンク→ベクター化 ====
import os, glob, re
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document as LCDocument
import shutil
from typing import List

def _read_text(path):
    for enc in ("utf-8", "utf-8-sig", "cp932"):
        try:
            return open(path, "r", encoding=enc, errors="ignore").read()
        except Exception:
            pass
    return open(path, "rb").read().decode("utf-8", errors="ignore")

# One LCDocument per resolved file. Metadata comes from metadata.csv (empty strings
# when the DOCID has no row there), plus the DOCID itself and the file path.
raw_docs: List[LCDocument] = [
    LCDocument(
        page_content=_read_text(path),
        metadata={
            "doc_id": doc_id,
            **{
                field: meta_by_id.get(doc_id, {}).get(field, "")
                for field in (
                    "title",
                    "category",
                    "effective_date",
                    "confidentiality",
                    "department",
                    "product_type",
                )
            },
            "path": path,
        },
    )
    for doc_id, path in doc_path_by_id.items()
]

print(f"[INFO] 読み込んだ原文書数: {len(raw_docs)}")
if not raw_docs:
    raise RuntimeError("原文書0件。docsディレクトリと拡張子(.md)を確認してください。")

# ---- Split into chunks ----
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Separators are tried in order: paragraph break, line break, Japanese full stop,
# Japanese comma, space, and finally a split between any two characters.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", "。", "、", " ", ""],
    chunk_size=900,
    chunk_overlap=150,
)

# Each chunk is created with its source document's metadata.
chunks: List[LCDocument] = [
    LCDocument(page_content=piece, metadata=d.metadata)
    for d in raw_docs
    for piece in splitter.split_text(d.page_content)
]

print(f"[INFO] チャンク数: {len(chunks)}")
if not chunks:
    raise RuntimeError("チャンク0件。分割ルールや原文の中身を確認してください。")

# ---- Chroma persistence ----
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

REBUILD_INDEX = False  # set True to delete and rebuild the persisted index
if REBUILD_INDEX and os.path.exists(INDEX_DIR):
    shutil.rmtree(INDEX_DIR)

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Calling Chroma.from_documents on a directory that already holds an index ADDS the
# chunks again, so every re-run of this cell duplicated the whole corpus and the
# top-k results filled up with copies of the same chunk. Reuse an existing index
# instead, and only embed when there is nothing to reuse.
# NOTE: an index already polluted by earlier re-runs needs one run with
# REBUILD_INDEX = True to get rid of the duplicates.
_index_exists = os.path.isdir(INDEX_DIR) and bool(os.listdir(INDEX_DIR))
if _index_exists:
    vectorstore = Chroma(
        persist_directory=INDEX_DIR,
        embedding_function=embeddings,
        collection_metadata={"hnsw:space": "cosine"},
    )
    # The directory can exist without any stored vectors (e.g. an interrupted build).
    if not vectorstore.get(limit=1)["ids"]:
        vectorstore.add_documents(chunks)
    print("[OK] Chroma index loaded from:", INDEX_DIR)
else:
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        persist_directory=INDEX_DIR,
        collection_metadata={"hnsw:space": "cosine"}
    )
    print("[OK] Chroma index built at:", INDEX_DIR)

# Not every Chroma wrapper version exposes persist(); call it only when available.
if hasattr(vectorstore, "persist"):
    vectorstore.persist()
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

print("[OK] Retriever ready (k=4)")


[INFO] 読み込んだ原文書数: 12
[INFO] チャンク数: 12
[OK] Chroma index built at: /content/drive/MyDrive/Colab Notebooks/rag_week1/chroma_index
[OK] Retriever ready (k=4)


In [54]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
import textwrap

# Answering model for the baseline pipeline (temperature 0).
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# System message (Japanese): answer in Japanese, concisely and accurately, strictly
# from the supplied context, list the supporting document IDs and titles, and reply
# "不明" when the answer is unknown.
SYSTEM_PROMPT = """あなたは投資信託と社内規程ドキュメントに基づいて事実に忠実に回答するアシスタントです。
- 回答は日本語で、簡潔・正確に。
- 必ず与えられたコンテキスト（抜粋）に基づいて回答し、根拠となった文書IDとタイトルを列挙してください。
- 不明な場合は「不明」と答えてください。
"""

# Human message template: takes {question} and {context} and spells out the expected
# output layout (an answer line followed by a list of "<DOCID>: <title>" references).
HUMAN_PROMPT = """質問:
{question}

# コンテキスト（参照抜粋）
{context}

# 出力フォーマット
- 回答: <短いテキスト>
- 参照:
  - <DOCID>: <タイトル>
  - ...
"""

# Two-message chat prompt used by run_rag.
prompt = ChatPromptTemplate.from_messages(
    [("system", SYSTEM_PROMPT), ("human", HUMAN_PROMPT)]
)

def format_ctx(docs, width: int = 600) -> str:
    """Render retrieved documents as the context section of the prompt.

    Each document becomes a header line ``[i] DOCID=... TITLE=...`` followed by
    its text with whitespace collapsed and cut to at most ``width`` characters.

    The cut is character-based. ``textwrap.shorten`` only truncates at whitespace
    word boundaries, so Japanese text (which has few or no spaces) longer than
    ``width`` was reduced to little more than the placeholder, leaving the model
    with almost no context.

    Args:
        docs: Iterable of objects exposing ``metadata`` (dict) and ``page_content`` (str);
            missing or empty attributes are treated as empty.
        width: Maximum length of each document body, including the trailing " ...".

    Returns:
        The formatted entries joined by blank lines ("" for no documents).
    """
    placeholder = " ..."
    parts = []
    for i, d in enumerate(docs, 1):
        meta = getattr(d, "metadata", {}) or {}
        # Collapse newlines/tabs/runs of spaces into single spaces.
        body = " ".join((getattr(d, "page_content", "") or "").split())
        if len(body) > width:
            body = body[: max(width - len(placeholder), 0)].rstrip() + placeholder
        parts.append(
            f"[{i}] DOCID={meta.get('doc_id','')} TITLE={meta.get('title','')}\n" + body
        )
    return "\n\n".join(parts)

def _retrieve(query: str):
    """Query the module-level ``retriever`` through whichever API it offers.

    ``invoke`` (Runnable API) is preferred; ``get_relevant_documents`` (legacy API)
    is the fallback.

    Raises:
        TypeError: If the retriever has neither method.
    """
    for method_name in ("invoke", "get_relevant_documents"):
        if hasattr(retriever, method_name):
            return getattr(retriever, method_name)(query)
    raise TypeError("Retriever has neither .invoke nor .get_relevant_documents")

def run_rag(question: str):
    """Run the baseline RAG pipeline for one question.

    Retrieves context with ``_retrieve``, formats it with ``format_ctx``, fills the
    module-level ``prompt`` and asks ``llm``.

    Returns:
        dict with
        - "answer": the stripped model output,
        - "source_ids": doc_ids of the retrieved documents, de-duplicated in retrieval order,
        - "ctx_docs": the retrieved documents.
    """
    retrieved = _retrieve(question) or []
    # Some retriever implementations hand back a generator; materialise it when possible.
    try:
        retrieved = list(retrieved)
    except TypeError:
        pass

    messages = prompt.format_messages(question=question, context=format_ctx(retrieved))
    answer = llm.invoke(messages).content.strip()

    source_ids = []
    for doc in retrieved:
        doc_id = (getattr(doc, "metadata", {}) or {}).get("doc_id")
        if not doc_id or doc_id in source_ids:
            continue
        source_ids.append(doc_id)

    return {"answer": answer, "source_ids": source_ids, "ctx_docs": retrieved}


In [55]:
from langsmith import Client

client = Client()

# Pick the most recently created dataset whose name starts with the phase-3 prefix.
prefix = "agent-book_rag_week1_v0_"
candidate = list(filter(lambda ds: ds.name.startswith(prefix), client.list_datasets()))
if len(candidate) == 0:
    raise RuntimeError("フェーズ3で作成したデータセットが見つかりません。prefixの変更 or セル末尾のdataset_nameを直接指定してください。")

# Ascending by creation time, so the last element is the newest.
dataset = sorted(candidate, key=lambda d: d.created_at)[-1]
dataset_name = dataset.name
print("使用するデータセット:", dataset_name, dataset.id)

使用するデータセット: agent-book_rag_week1_v0_20251106-074032 2239bf0a-1b38-479d-8179-5ab9244d34a5


In [59]:
import re, unicodedata

def _pad2(n: str) -> str:
    try:
        return f"{int(n):02d}"
    except:
        return n  # 想定外はそのまま

def normalize_dates(s: str) -> str:
    """Rewrite date spellings inside ``s`` as YYYY-MM-DD.

    The text is NFKC-normalised first. Two notations are then rewritten, in this order:
    Japanese ("2025年8月1日", the trailing 日 is optional) and separator-based
    ("2025/8/1", "2025-8-1"). Month and day are zero-padded via ``_pad2``.
    Returns "" for ``None``.
    """
    if s is None:
        return ""

    def _to_iso(match):
        year, month, day = match.groups()
        return f"{year}-{_pad2(month)}-{_pad2(day)}"

    date_patterns = (
        r"(\d{4})\s*年\s*(\d{1,2})\s*月\s*(\d{1,2})\s*日?",          # Japanese notation
        r"(\d{4})\s*[\/\-]\s*(\d{1,2})\s*[\/\-]\s*(\d{1,2})",       # "/" or "-" separated
    )

    text = unicodedata.normalize("NFKC", s)
    for date_pattern in date_patterns:
        text = re.sub(date_pattern, _to_iso, text)
    return text

def normalize_text(s: str) -> str:
    """Canonicalise text for comparison.

    Steps, in order: date normalisation via ``normalize_dates``, NFKC
    normalisation (full-width -> half-width etc.), full-width percent sign
    replacement, strip, lower-case, and collapsing every whitespace run into
    one space. Returns "" for ``None``.
    """
    if s is None:
        return ""
    # Dates go first so their separators are settled before the remaining steps.
    text = unicodedata.normalize("NFKC", normalize_dates(s))
    text = text.replace("％", "%").strip().lower()
    return re.sub(r"\s+", " ", text)

def any_match(pred: str, answers) -> bool:
    """Return True when the prediction matches any accepted answer.

    Both sides go through ``normalize_text`` (which absorbs date-notation
    differences). A match is containment in either direction: the answer
    inside the prediction, or the prediction inside the answer.

    An empty prediction never matches. Previously it did, because the empty
    string is contained in every answer, so a blank model output was scored
    as correct.

    Args:
        pred: Model output.
        answers: One answer string, an iterable of answers, or None.

    Returns:
        bool: Whether any non-empty normalised answer matches.
    """
    p = normalize_text(pred)
    if not p or answers is None:
        return False
    if isinstance(answers, str):
        answers = [answers]
    for a in answers:
        # Accept non-string answers (e.g. numbers) by stringifying them first.
        na = normalize_text(a if a is None or isinstance(a, str) else str(a))
        if na and (na in p or p in na):
            return True
    return False

def must_source_ok(used_source_ids, must_id: str) -> bool:
    """Tell whether ``must_id`` is among ``used_source_ids``.

    A falsy ``must_id`` is looked up as "" and a falsy ``used_source_ids`` is
    treated as an empty list.
    """
    wanted = must_id if must_id else ""
    pool = used_source_ids if used_source_ids else []
    return wanted in pool

In [60]:
from tqdm.auto import tqdm

examples = list(client.list_examples(dataset_id=dataset.id))
print("評価対象 Examples:", len(examples))

# Run the baseline pipeline on every example and score it with the rule-based metrics.
records = []
for ex in tqdm(examples):
    ex_meta = ex.metadata or {}
    q = ex.inputs.get("question", "")
    gt_answers = ex.outputs.get("answers", [])
    must_id = ex_meta.get("must_have_source_id", "")

    out = run_rag(q)
    pred, used_ids = out["answer"], out["source_ids"]

    records.append({
        "qid": ex_meta.get("qid", ""),
        "question": q,
        "prediction": pred,
        "answers": "; ".join(gt_answers),
        "used_source_ids": ",".join(used_ids),
        "must_have_source_id": must_id,
        # Metrics
        "is_correct": int(any_match(pred, gt_answers)),
        "has_must_source": int(must_source_ok(used_ids, must_id)),
    })

df_eval = pd.DataFrame.from_records(records)
if len(df_eval):
    acc = df_eval["is_correct"].mean()
    src_hit = df_eval["has_must_source"].mean()
else:
    acc = 0.0
    src_hit = 0.0

print("\n=== 集計 ===")
print(f"正答率 (Exact/部分一致ベース): {acc:.3f}")
print(f"必須ソース一致率: {src_hit:.3f}")

# Save the per-example results.
timestamp = f"{datetime.datetime.now():%Y%m%d-%H%M%S}"
out_csv = f"{RESULTS_DIR}/offline_eval_{timestamp}.csv"
df_eval.to_csv(out_csv, index=False, encoding="utf-8")
print("結果CSV:", out_csv)

# Show up to ten examples that failed either metric (cell output).
failed_mask = df_eval["is_correct"].eq(0) | df_eval["has_must_source"].eq(0)
df_bad = df_eval.loc[failed_mask].copy()
display_cols = ["qid","question","prediction","answers","used_source_ids","must_have_source_id","is_correct","has_must_source"]
df_bad[display_cols].head(10)


評価対象 Examples: 5


  0%|          | 0/5 [00:00<?, ?it/s]


=== 集計 ===
正答率 (Exact/部分一致ベース): 1.000
必須ソース一致率: 0.800
結果CSV: /content/drive/MyDrive/Colab Notebooks/rag_week1/results/offline_eval_20251106-081540.csv


Unnamed: 0,qid,question,prediction,answers,used_source_ids,must_have_source_id,is_correct,has_must_source
4,Q001,サクラ短期国債ファンドの信託報酬は年率いくら？,- 回答: サクラ短期国債ファンドの信託報酬は年率0.18%です。\n- 参照:\n - ...,0.18%; 年率0.18%,"DOC001,DOC007",DOC006,1,0


In [61]:
# === LLM-as-a-Judge (OpenAI) を用いたカスタム評価器 ===
import re, json, textwrap
from typing import Dict, Any, List, Tuple
from langchain_openai import ChatOpenAI

# Judge model: temperature 0 and a 256-token response cap to keep verdicts short and stable.
judge_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=256)

def extract_docids_from_answer(answer: str) -> List[str]:
    """Extract document references such as "DOC123" from an answer.

    ``re.ASCII`` makes ``\\b`` treat only ASCII letters/digits/underscore as word
    characters. With the default Unicode semantics, kana and kanji count as word
    characters, so an ID written directly next to Japanese text ("DOC001の規程",
    "文書DOC007") had no word boundary and was missed. Matching is case-insensitive
    and the result is upper-cased, in line with how DOCIDs are normalised when
    files are resolved.

    Args:
        answer: Answer text; ``None`` or "" yields an empty list.

    Returns:
        Sorted list of unique, upper-cased IDs.
    """
    found = re.findall(r"\bDOC\d{3,}\b", answer or "", flags=re.ASCII | re.IGNORECASE)
    return sorted({doc_id.upper() for doc_id in found})

def ctx_to_text(ctx_docs: List, width: int = 700) -> str:
    """Render context documents as text for the judge prompts.

    Each document becomes ``[i] DOCID=... TITLE=...`` followed by its text with
    whitespace collapsed and cut to at most ``width`` characters.

    The cut is character-based. ``textwrap.shorten`` truncates only at whitespace
    word boundaries, so long Japanese text without spaces was reduced to little
    more than the placeholder and the judge saw almost none of the context.

    Args:
        ctx_docs: Objects exposing ``metadata`` (dict) and ``page_content`` (str);
            missing or empty attributes are treated as empty.
        width: Maximum length of each document body, including the trailing " ...".

    Returns:
        The formatted entries joined by blank lines ("" for no documents).
    """
    placeholder = " ..."
    parts = []
    for i, d in enumerate(ctx_docs, 1):
        m = (getattr(d, "metadata", {}) or {})
        head = f"[{i}] DOCID={m.get('doc_id','')} TITLE={m.get('title','')}"
        # Collapse newlines/tabs/runs of spaces into single spaces.
        body = " ".join((getattr(d, "page_content", "") or "").split())
        if len(body) > width:
            body = body[: max(width - len(placeholder), 0)].rstrip() + placeholder
        parts.append(head + "\n" + body)
    return "\n\n".join(parts)

def _parse_score(text: str, default=0) -> Tuple[int, str]:
    """Judge応答から score を抽出（0/1）。JSON→数値→フォールバック."""
    # JSON優先
    try:
        data = json.loads(text)
        if isinstance(data, dict) and "score" in data:
            s = int(str(data["score"]).strip())
            return (1 if s == 1 else 0, text)
    except Exception:
        pass
    # 0/1 を直接含むか
    m = re.search(r"\bscore\s*[:=]\s*([01])\b", text, re.I)
    if m:
        return (int(m.group(1)), text)
    m2 = re.search(r"\b([01])\b", text)  # 最後の手段
    if m2:
        return (int(m2.group(1)), text)
    return (default, text)

# 1) Faithfulness: does the answer stay within the supplied context?
FAITHFULNESS_SYS = """あなたは審査員です。与えられたコンテキストに対して回答が忠実か判定します。
- 回答はコンテキストから正当に導けるか？（幻覚がないか）
- 過剰一般化や矛盾がないか？
出力は JSON で: {"score": 1 or 0, "reason": "<根拠>"} のみ。"""
def judge_faithfulness(answer: str, ctx_docs: List) -> Dict[str, Any]:
    """Ask the judge model whether ``answer`` is supported by ``ctx_docs``.

    Returns:
        {"faithfulness_score": 0 or 1, "faithfulness_raw": <raw judge reply>}.
        A reply that cannot be parsed scores 0.
    """
    ctx = ctx_to_text(ctx_docs)
    messages = [
        {"role": "system", "content": FAITHFULNESS_SYS},
        {"role": "user", "content": f"# 回答\n{answer}\n\n# コンテキスト\n{ctx}"},
    ]
    reply = judge_llm.invoke(messages).content.strip()
    return {
        "faithfulness_score": _parse_score(reply, default=0)[0],
        "faithfulness_raw": reply,
    }

# 2) Relevance: does the answer directly address the question?
RELEVANCE_SYS = """あなたは審査員です。回答が質問に適切に関連し、質問へ直接的に答えているか判定します。
出力は {"score": 1 or 0, "reason": "<根拠>"} のJSONのみ。"""
def judge_relevance(question: str, answer: str) -> Dict[str, Any]:
    """Ask the judge model whether ``answer`` responds to ``question``.

    Returns:
        {"relevance_score": 0 or 1, "relevance_raw": <raw judge reply>}.
        A reply that cannot be parsed scores 0.
    """
    messages = [
        {"role": "system", "content": RELEVANCE_SYS},
        {"role": "user", "content": f"# 質問\n{question}\n\n# 回答\n{answer}"},
    ]
    reply = judge_llm.invoke(messages).content.strip()
    return {
        "relevance_score": _parse_score(reply, default=0)[0],
        "relevance_raw": reply,
    }

# 3) Semantic accuracy: tolerant of formatting differences (dates, times, number notation)
SEMANTIC_ACC_SYS = """あなたは審査員です。回答が期待される答え（複数候補のいずれか）と
意味的に同じ内容かを判定してください。表記揺れ（例: 2025年8月1日 / 2025/8/1 / 2025-08-01）や
%の全角半角、小数点や単位の軽微な表現差は正解として扱ってください。
出力は {"score": 1 or 0, "reason": "<根拠>"} のJSONのみ。"""
def judge_semantic_accuracy(question: str, answer: str, answers: List[str]) -> Dict[str, Any]:
    """Ask the judge model whether ``answer`` means the same as any expected answer.

    The expected answers are sent to the judge as a bulleted list.

    Returns:
        {"semantic_acc": 0 or 1, "semantic_acc_raw": <raw judge reply>}.
        A reply that cannot be parsed scores 0.
    """
    gold = "\n".join([f"- {expected}" for expected in answers])
    messages = [
        {"role": "system", "content": SEMANTIC_ACC_SYS},
        {"role": "user", "content": f"# 質問\n{question}\n\n# 回答\n{answer}\n\n# 期待される答え候補\n{gold}"},
    ]
    reply = judge_llm.invoke(messages).content.strip()
    return {
        "semantic_acc": _parse_score(reply, default=0)[0],
        "semantic_acc_raw": reply,
    }

# 4) Citation validity: is must_have_source_id among the retrieved sources or cited in the answer?
def judge_citation(answer: str, used_source_ids: List[str], must_id: str) -> Dict[str, Any]:
    """Check that the required source was retrieved or is cited in the answer text.

    Returns:
        {"citation_ok": 0 or 1, "cited_in_text": IDs extracted from ``answer``}.
        ``citation_ok`` is 0 whenever ``must_id`` is empty.
    """
    cited = extract_docids_from_answer(answer)
    if must_id:
        found = must_id in (used_source_ids or []) or must_id in cited
    else:
        found = False
    return {"citation_ok": 1 if found else 0, "cited_in_text": cited}

In [62]:
# === フェーズ4のセル7を拡張：Judgeスコアも集計 ===
import datetime
from tqdm.auto import tqdm

# Re-run the baseline pipeline on every example and add the LLM-judge scores
# to the rule-based metrics.
examples = list(client.list_examples(dataset_id=dataset.id))
print("評価対象 Examples:", len(examples))

records = []
for ex in tqdm(examples):
    q = ex.inputs.get("question", "")
    gt_answers = ex.outputs.get("answers", [])
    must_id = (ex.metadata or {}).get("must_have_source_id", "")

    # 1) Run RAG (the existing run_rag)
    out = run_rag(q)
    pred = out["answer"]
    used_ids = out["source_ids"]
    ctx_docs = out["ctx_docs"]

    # 2) Existing rule-based metrics
    correct_rule = any_match(pred, gt_answers)
    source_ok = must_source_ok(used_ids, must_id)

    # 3) LLM-as-a-Judge
    jf = judge_faithfulness(pred, ctx_docs)
    jr = judge_relevance(q, pred)
    js = judge_semantic_accuracy(q, pred, gt_answers)
    jc = judge_citation(pred, used_ids, must_id)

    # 4) Overall verdict: passes only when semantic accuracy, faithfulness,
    #    citation validity and relevance are all 1 (a plain AND, no weighting).
    overall = int(js["semantic_acc"] == 1 and jf["faithfulness_score"] == 1 and jc["citation_ok"] == 1 and jr["relevance_score"] == 1)

    records.append({
        "qid": (ex.metadata or {}).get("qid", ""),
        "question": q,
        "prediction": pred,
        "answers": "; ".join(gt_answers),
        "used_source_ids": ",".join(used_ids),
        "must_have_source_id": must_id,

        # Rule-based
        "is_correct_rule": int(correct_rule),
        "has_must_source": int(source_ok),

        # Judge
        "faithfulness": jf["faithfulness_score"],
        "relevance": jr["relevance_score"],
        "semantic_acc": js["semantic_acc"],
        "citation_ok": jc["citation_ok"],

        # Overall
        "overall_pass": overall,

        # Optional: raw judge replies (left disabled; long replies bloat the CSV)
        # "faithfulness_raw": jf["faithfulness_raw"],
        # "relevance_raw": jr["relevance_raw"],
        # "semantic_acc_raw": js["semantic_acc_raw"],  # NOTE(review): original note warns the returned key name differs; keep it in sync with judge_semantic_accuracy
        "cited_in_text": ";".join(jc["cited_in_text"]),
    })

df_eval_judge = pd.DataFrame.from_records(records)

# Aggregation
def _mean(col):
    # Column mean, or 0.0 when there are no rows.
    return df_eval_judge[col].mean() if len(df_eval_judge) else 0.0

print("\n=== Judge集計 ===")
print(f"Rule-based 正答率           : {_mean('is_correct_rule'):.3f}")
print(f"Rule-based 必須ソース一致率 : {_mean('has_must_source'):.3f}")
print(f"Semantic Accuracy (LLM)      : {_mean('semantic_acc'):.3f}")
print(f"Faithfulness (LLM)           : {_mean('faithfulness'):.3f}")
print(f"Relevance (LLM)              : {_mean('relevance'):.3f}")
print(f"Citation OK                  : {_mean('citation_ok'):.3f}")
print(f"Overall Pass                 : {_mean('overall_pass'):.3f}")

# Show the failures (rows where the overall verdict is 0), at most ten
df_bad = df_eval_judge[df_eval_judge["overall_pass"]==0].copy()
display_cols = [
    "qid","question","prediction","answers",
    "used_source_ids","must_have_source_id",
    "is_correct_rule","semantic_acc","faithfulness","relevance","citation_ok","overall_pass",
    "cited_in_text"
]
print("\n=== Overall NG（上位10件）===")
display(df_bad[display_cols].head(10))

# Save all rows to a timestamped CSV
timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
out_csv = f"{RESULTS_DIR}/offline_eval_with_judge_{timestamp}.csv"
df_eval_judge.to_csv(out_csv, index=False, encoding="utf-8")
print("結果CSV:", out_csv)

評価対象 Examples: 5


  0%|          | 0/5 [00:00<?, ?it/s]


=== Judge集計 ===
Rule-based 正答率           : 1.000
Rule-based 必須ソース一致率 : 0.800
Semantic Accuracy (LLM)      : 1.000
Faithfulness (LLM)           : 1.000
Relevance (LLM)              : 1.000
Citation OK                  : 0.800
Overall Pass                 : 0.800

=== Overall NG（上位10件）===


Unnamed: 0,qid,question,prediction,answers,used_source_ids,must_have_source_id,is_correct_rule,semantic_acc,faithfulness,relevance,citation_ok,overall_pass,cited_in_text
4,Q001,サクラ短期国債ファンドの信託報酬は年率いくら？,- 回答: サクラ短期国債ファンドの信託報酬は年率0.18%です。\n- 参照:\n - ...,0.18%; 年率0.18%,"DOC001,DOC007",DOC006,1,1,1,1,0,0,DOC001


結果CSV: /content/drive/MyDrive/Colab Notebooks/rag_week1/results/offline_eval_with_judge_20251106-082414.csv


In [63]:
from langchain_classic.retrievers.multi_query import MultiQueryRetriever
from langchain_classic.retrievers import ContextualCompressionRetriever
from langchain_classic.retrievers.document_compressors import EmbeddingsFilter
from langchain_openai import ChatOpenAI

# 1) Base retriever using MMR search (for result diversity)
retriever_mmr = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 8, "lambda_mult": 0.5}  # k / lambda can be tuned later
)

# 2) Multi-Query: the LLM paraphrases the question into several queries.
#    judge_llm / llm already exist, but a dedicated temperature-0 model is created explicitly.
mq_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
multiquery_retriever = MultiQueryRetriever.from_llm(
    retriever=retriever_mmr,
    llm=mq_llm,
    include_original=True
)

# 3) Context compression: drop retrieved chunks by embedding similarity (threshold 0.35)
compressor = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.35)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=multiquery_retriever
)

print("[OK] Improved retriever ready (Multi-Query + MMR + Compression)")

[OK] Improved retriever ready (Multi-Query + MMR + Compression)


In [64]:
import textwrap

# System message for the improved pipeline (Japanese): answer in Japanese, concisely
# and accurately, strictly from the supplied context, list the supporting document IDs
# and titles, and reply "不明" when the answer is unknown.
IMPROVED_SYSTEM_PROMPT = """あなたは投資信託と社内規程ドキュメントに基づいて事実に忠実に回答するアシスタントです。
- 回答は日本語で、簡潔・正確に。
- 必ず与えられたコンテキスト（抜粋）に基づいて回答し、根拠となった文書IDとタイトルを列挙してください。
- 不明な場合は「不明」と答えてください。"""

# Human message template: takes {question} and {context} and spells out the expected
# output layout (an answer line followed by a list of "<DOCID>: <title>" references).
IMPROVED_HUMAN_PROMPT = """質問:
{question}

# コンテキスト（厳選抜粋）
{context}

# 出力フォーマット
- 回答: <短いテキスト>
- 参照:
  - <DOCID>: <タイトル>
  - ...
"""

# Answering model and chat prompt used by run_rag_improved.
improved_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
improved_prompt = ChatPromptTemplate.from_messages(
    [("system", IMPROVED_SYSTEM_PROMPT), ("human", IMPROVED_HUMAN_PROMPT)]
)

def format_ctx_short(docs, width: int = 500) -> str:
    """Render retrieved documents as a compact context section for the improved prompt.

    Each document becomes ``[i] DOCID=... TITLE=...`` followed by its text with
    whitespace collapsed and cut to at most ``width`` characters.

    The cut is character-based. ``textwrap.shorten`` truncates only at whitespace
    word boundaries, so long Japanese text without spaces was reduced to little
    more than the placeholder, leaving the model with almost no context.

    Args:
        docs: Iterable of objects exposing ``metadata`` (dict) and ``page_content`` (str);
            missing or empty attributes are treated as empty.
        width: Maximum length of each document body, including the trailing " ...".

    Returns:
        The formatted entries joined by blank lines ("" for no documents).
    """
    placeholder = " ..."
    parts = []
    for i, d in enumerate(docs, 1):
        m = (getattr(d, "metadata", {}) or {})
        # Collapse newlines/tabs/runs of spaces into single spaces.
        body = " ".join((getattr(d, "page_content", "") or "").split())
        if len(body) > width:
            body = body[: max(width - len(placeholder), 0)].rstrip() + placeholder
        parts.append(
            f"[{i}] DOCID={m.get('doc_id','')} TITLE={m.get('title','')}\n" + body
        )
    return "\n\n".join(parts)

def run_rag_improved(question: str):
    """Run the improved RAG pipeline for one question.

    Retrieves through ``compression_retriever`` (``invoke`` when available, otherwise
    ``get_relevant_documents``), formats the context with ``format_ctx_short``, fills
    ``improved_prompt`` and asks ``improved_llm``.

    Returns:
        dict with
        - "answer": the stripped model output,
        - "source_ids": doc_ids of the retrieved documents, de-duplicated in retrieval order,
        - "ctx_docs": the retrieved documents.
    """
    fetch = (
        compression_retriever.invoke
        if hasattr(compression_retriever, "invoke")
        else compression_retriever.get_relevant_documents
    )
    retrieved = fetch(question)
    # Materialise iterables; leave anything non-iterable untouched.
    try:
        retrieved = list(retrieved)
    except TypeError:
        pass

    messages = improved_prompt.format_messages(
        question=question, context=format_ctx_short(retrieved)
    )
    answer = improved_llm.invoke(messages).content.strip()

    source_ids = []
    for doc in retrieved:
        doc_id = (getattr(doc, "metadata", {}) or {}).get("doc_id")
        if not doc_id or doc_id in source_ids:
            continue
        source_ids.append(doc_id)

    return {"answer": answer, "source_ids": source_ids, "ctx_docs": retrieved}

In [65]:
import datetime
from tqdm.auto import tqdm
import pandas as pd

def evaluate_variant(run_fn, variant_name: str):
    """Evaluate one pipeline variant on every example of the selected dataset.

    Args:
        run_fn: Callable taking a question and returning a dict with
            "answer", "source_ids" and "ctx_docs".
        variant_name: Label stored in the "variant" column and shown on the progress bar.

    Returns:
        pandas.DataFrame with one row per example: identifying columns, the
        rule-based metrics, the judge scores and "overall_pass" (1 only when
        all four judge-side checks are 1).
    """

    def _score_example(ex):
        ex_meta = ex.metadata or {}
        q = ex.inputs.get("question", "")
        gold = ex.outputs.get("answers", [])
        must_id = ex_meta.get("must_have_source_id", "")

        out = run_fn(q)
        pred, used, ctx_docs = out["answer"], out["source_ids"], out["ctx_docs"]

        # Rule-based metrics
        rule_ok = any_match(pred, gold)
        src_ok = must_source_ok(used, must_id)

        # Judge scores (same call order as before: faithfulness, relevance, semantic, citation)
        faithfulness = judge_faithfulness(pred, ctx_docs)["faithfulness_score"]
        relevance = judge_relevance(q, pred)["relevance_score"]
        semantic_acc = judge_semantic_accuracy(q, pred, gold)["semantic_acc"]
        citation_ok = judge_citation(pred, used, must_id)["citation_ok"]

        overall = int(
            all(v == 1 for v in (semantic_acc, faithfulness, relevance, citation_ok))
        )

        return {
            "variant": variant_name,
            "qid": ex_meta.get("qid", ""),
            "question": q,
            "prediction": pred,
            "answers": "; ".join(gold),
            "used_source_ids": ",".join(used),
            "must_have_source_id": must_id,
            "is_correct_rule": int(rule_ok),
            "has_must_source": int(src_ok),
            "semantic_acc": semantic_acc,
            "faithfulness": faithfulness,
            "relevance": relevance,
            "citation_ok": citation_ok,
            "overall_pass": overall,
        }

    examples = list(client.list_examples(dataset_id=dataset.id))
    rows = [
        _score_example(ex)
        for ex in tqdm(examples, desc=f"Evaluating {variant_name}")
    ]
    return pd.DataFrame(rows)

# Baseline (existing run_rag) vs. Improved (run_rag_improved)
df_base = evaluate_variant(run_rag, "Baseline")
df_impr = evaluate_variant(run_rag_improved, "Improved")

# Side-by-side table: one row per example, scores suffixed _base / _impr
key_cols = ["qid","question","answers","must_have_source_id"]
score_cols = ["is_correct_rule","has_must_source","semantic_acc","faithfulness","relevance","citation_ok","overall_pass"]

df_merged = (df_base[key_cols + score_cols]
             .merge(df_impr[key_cols + score_cols], on=key_cols, suffixes=("_base","_impr")))

# Per-example improvement (Improved - Baseline)
for c in score_cols:
    df_merged[f"delta_{c}"] = df_merged[f"{c}_impr"] - df_merged[f"{c}_base"]

# Summary of means. Values are converted to plain floats so the printed dicts
# show 0.8 rather than np.float64(0.8).
summary = {
    "Baseline": {c: float(df_base[c].mean()) for c in score_cols},
    "Improved": {c: float(df_impr[c].mean()) for c in score_cols},
    "Delta(Imp-Base)": {c: float(df_impr[c].mean() - df_base[c].mean()) for c in score_cols},
}
print("\n=== A/B Summary (mean) ===")
for k,v in summary.items():
    print(k, {m: round(x,3) for m,x in v.items()})

# Save the merged table and both per-variant tables
ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
out_ab_csv = f"{RESULTS_DIR}/ab_compare_{ts}.csv"
out_base_csv = f"{RESULTS_DIR}/baseline_{ts}.csv"
out_impr_csv = f"{RESULTS_DIR}/improved_{ts}.csv"
df_merged.to_csv(out_ab_csv, index=False, encoding="utf-8")
df_base.to_csv(out_base_csv, index=False, encoding="utf-8")
df_impr.to_csv(out_impr_csv, index=False, encoding="utf-8")
print("CSV出力:")
print(" -", out_ab_csv)
print(" -", out_base_csv)
print(" -", out_impr_csv)

# Cases the Improved variant still fails
bad_cols = ["qid","question","prediction_base","prediction_impr","answers","must_have_source_id",
            "is_correct_rule_base","is_correct_rule_impr","semantic_acc_base","semantic_acc_impr",
            "faithfulness_base","faithfulness_impr","relevance_base","relevance_impr",
            "citation_ok_base","citation_ok_impr","overall_pass_base","overall_pass_impr"]
# Attach both prediction texts. Join on the same key columns as the main merge;
# joining on qid alone multiplies rows whenever a qid is missing or repeated.
tmp = (df_merged
       .merge(df_base[key_cols + ["prediction"]].rename(columns={"prediction":"prediction_base"}), on=key_cols)
       .merge(df_impr[key_cols + ["prediction"]].rename(columns={"prediction":"prediction_impr"}), on=key_cols))
bad = tmp[tmp["overall_pass_impr"]==0]
print("\n=== ImprovedでNGのケース（上位10）===")
display(bad[bad_cols].head(10))

Evaluating Baseline:   0%|          | 0/5 [00:00<?, ?it/s]

Evaluating Improved:   0%|          | 0/5 [00:00<?, ?it/s]


=== A/B Summary (mean) ===
Baseline {'is_correct_rule': np.float64(1.0), 'has_must_source': np.float64(0.8), 'semantic_acc': np.float64(1.0), 'faithfulness': np.float64(1.0), 'relevance': np.float64(1.0), 'citation_ok': np.float64(0.8), 'overall_pass': np.float64(0.8)}
Improved {'is_correct_rule': np.float64(1.0), 'has_must_source': np.float64(1.0), 'semantic_acc': np.float64(1.0), 'faithfulness': np.float64(0.8), 'relevance': np.float64(1.0), 'citation_ok': np.float64(1.0), 'overall_pass': np.float64(0.8)}
Delta(Imp-Base) {'is_correct_rule': np.float64(0.0), 'has_must_source': np.float64(0.2), 'semantic_acc': np.float64(0.0), 'faithfulness': np.float64(-0.2), 'relevance': np.float64(0.0), 'citation_ok': np.float64(0.2), 'overall_pass': np.float64(0.0)}
CSV出力:
 - /content/drive/MyDrive/Colab Notebooks/rag_week1/results/ab_compare_20251106-083324.csv
 - /content/drive/MyDrive/Colab Notebooks/rag_week1/results/baseline_20251106-083324.csv
 - /content/drive/MyDrive/Colab Notebooks/rag_we

Unnamed: 0,qid,question,prediction_base,prediction_impr,answers,must_have_source_id,is_correct_rule_base,is_correct_rule_impr,semantic_acc_base,semantic_acc_impr,faithfulness_base,faithfulness_impr,relevance_base,relevance_impr,citation_ok_base,citation_ok_impr,overall_pass_base,overall_pass_impr
1,Q004,TOPIXに連動するインデックスファンドの信託財産留保額は？,回答: 信託財産留保額は0.1%です。\n参照:\n- DOC002: 投資信託_株式インデ...,- 回答: 0.1%\n- 参照:\n - DOC002: 投資信託_株式インデックス_商...,0.1%; 0.1％,DOC002,1,1,1,1,1,0,1,1,1,1,1,0


In [66]:
from langsmith.evaluation import evaluate
from langsmith.schemas import Run, Example

# --- あなたの既存RAG関数を評価用に薄くラップ ---
def predict_baseline(inputs: dict) -> dict:
    """Evaluation target for the baseline pipeline.

    Passes ``inputs["question"]`` to ``run_rag`` (the existing phase 4/5
    pipeline) and returns its ``"answer"`` under the key ``"output"`` together
    with its ``"source_ids"``.
    """
    result = run_rag(inputs["question"])
    return dict(output=result["answer"], source_ids=result["source_ids"])

def predict_improved(inputs: dict) -> dict:
    """Evaluation target for the improved pipeline.

    Passes ``inputs["question"]`` to ``run_rag_improved`` (built in phase 6)
    and returns its ``"answer"`` under the key ``"output"`` together with its
    ``"source_ids"``.
    """
    result = run_rag_improved(inputs["question"])
    return dict(output=result["answer"], source_ids=result["source_ids"])

In [67]:
# すでにフェーズ4で作った正規化関数を使う
def rule_accuracy(run: Run, example: Example):
    """Rule-based accuracy evaluator.

    Scores 1 when ``any_match`` accepts the run's ``"output"`` text against the
    example's reference ``"answers"``, otherwise 0.

    Args:
        run: The evaluated run; its ``outputs`` dict is read for ``"output"``.
        example: The dataset example; its ``outputs`` dict is read for ``"answers"``.

    Returns:
        A feedback dict ``{"key": "rule_accuracy", "score": 0 or 1}``.
    """
    # LangSmith types `outputs` as optional on both Run and Example, so fall
    # back to empty values instead of raising AttributeError on None. This
    # mirrors the `example.metadata or {}` guard used by `must_source`.
    pred = (run.outputs or {}).get("output") or ""
    gold = (example.outputs or {}).get("answers") or []
    return {"key": "rule_accuracy", "score": 1 if any_match(pred, gold) else 0}

def must_source(run: Run, example: Example):
    """Evaluator checking that the required source document was used.

    Passes the run's ``"source_ids"`` and the example metadata's
    ``"must_have_source_id"`` to ``must_source_ok`` and converts the result
    into a 0/1 score.

    Args:
        run: The evaluated run; its ``outputs`` dict is read for ``"source_ids"``.
        example: The dataset example; its ``metadata`` is read for
            ``"must_have_source_id"`` (empty string when absent).

    Returns:
        A feedback dict ``{"key": "must_source", "score": 0 or 1}``.
    """
    # Guard `run.outputs` the same way `example.metadata` is guarded below:
    # LangSmith types it as optional, and `None.get(...)` would raise.
    used = (run.outputs or {}).get("source_ids") or []
    must_id = (example.metadata or {}).get("must_have_source_id", "")
    ok = 1 if must_source_ok(used, must_id) else 0
    return {"key": "must_source", "score": ok}

# LLM-as-a-Judge（フェーズ5の関数をrunに適用できる形に）
def judge_semantic(run: Run, example: Example):
    """LLM-as-a-judge semantic accuracy evaluator.

    Forwards the question, the run's answer text and the reference answers to
    ``judge_semantic_accuracy`` (the phase 5 judge) and scores 1 only when the
    judge's ``"semantic_acc"`` equals 1.

    Args:
        run: The evaluated run; its ``outputs`` dict is read for ``"output"``.
        example: The dataset example; ``inputs["question"]`` and
            ``outputs["answers"]`` are read.

    Returns:
        A feedback dict ``{"key": "semantic_acc_llm", "score": 0 or 1}``.
    """
    # `inputs` / `outputs` are typed as optional by LangSmith; use empty
    # defaults rather than raising AttributeError on None.
    q = (example.inputs or {}).get("question", "")
    pred = (run.outputs or {}).get("output") or ""
    gold = (example.outputs or {}).get("answers") or []
    r = judge_semantic_accuracy(q, pred, gold)
    return {"key": "semantic_acc_llm", "score": int(r["semantic_acc"] == 1)}

def judge_faithful(run: Run, example: Example):
    """Weak faithfulness proxy: does the answer text cite a document ID?

    The retrieved context is not part of ``run.outputs``, so this evaluator
    only inspects the answer body. For a stricter check, have the predict_*
    targets include the context in their outputs, e.g.:
    ``{"output": out["answer"], "source_ids": out["source_ids"],
    "ctx": ctx_to_text(out["ctx_docs"])}``.

    Args:
        run: The evaluated run; its ``outputs`` dict is read for ``"output"``.
        example: Unused; present to satisfy the evaluator signature.

    Returns:
        ``{"key": "faithfulness_proxy", "score": 1}`` when the answer contains
        a token like ``DOC`` followed by three or more digits, otherwise score 0.
    """
    import re  # function-scope import replaces the inline __import__("re") hack

    # A missing or None output counts as "no citation" instead of raising.
    pred = (run.outputs or {}).get("output") or ""
    # Context-free variant: a DOC-id appearing in the answer is used as a weak
    # faithfulness signal.
    has_docid = 1 if re.search(r"\bDOC\d{3,}\b", pred) else 0
    return {"key": "faithfulness_proxy", "score": has_docid}

In [71]:
client = Client()

# Both variants must be scored by exactly the same evaluators on the same
# dataset for the A/B comparison to be meaningful, so the evaluate() settings
# live in one place instead of being copy-pasted per variant.
_AB_EVALUATORS = [rule_accuracy, must_source, judge_semantic, judge_faithful]


def _run_experiment(target, variant: str):
    """Run one LangSmith experiment for ``target`` on ``dataset_name``.

    Args:
        target: Prediction function taking the example inputs dict.
        variant: Variant label; used as the ``agent-book_<variant>`` experiment
            name prefix and stored in the experiment metadata.

    Returns:
        Whatever ``evaluate`` returns for the experiment.
    """
    return evaluate(
        target,
        data=dataset_name,
        evaluators=_AB_EVALUATORS,
        experiment_prefix=f"agent-book_{variant}",  # experiment name prefix
        metadata={"variant": variant},
        client=client,
    )


# Baseline
res_base = _run_experiment(predict_baseline, "baseline")
print("Baseline Experiment:", res_base.experiment_name)

# Improved
res_impr = _run_experiment(predict_improved, "improved")
print("Improved Experiment:", res_impr.experiment_name)

View the evaluation results for experiment: 'agent-book_baseline-b77ef914' at:
https://smith.langchain.com/o/6f56e7f7-72dd-49b5-8b34-8cb545c1a816/datasets/2b71b55b-c4c1-4506-b8b8-91808b2bb8f8/compare?selectedSessions=210f5140-b32c-4146-8ff2-2ea541bee38f




0it [00:00, ?it/s]

Baseline Experiment: agent-book_baseline-b77ef914
View the evaluation results for experiment: 'agent-book_improved-0b2e2d94' at:
https://smith.langchain.com/o/6f56e7f7-72dd-49b5-8b34-8cb545c1a816/datasets/2b71b55b-c4c1-4506-b8b8-91808b2bb8f8/compare?selectedSessions=0372aa39-87be-4c8a-8dfb-4331d52e37c0




0it [00:00, ?it/s]

Improved Experiment: agent-book_improved-0b2e2d94


In [72]:
# Fetch each experiment's aggregated results through the SDK
# (the same metrics the UI shows), baseline first and then improved.
exp_base, exp_impr = (
    client.get_experiment_results(result.experiment_name)
    for result in (res_base, res_impr)
)
# (There are also utilities for retrieving these as a DataFrame.)

In [73]:
from langsmith import Client, trace

client = Client()

# Example: trace an arbitrary piece of work to obtain its run_id / trace_id.
with trace(name="rag_inference", inputs={"question": "..."}) as root:
    # ... run the RAG pipeline here and store its result on root.outputs ...
    root.outputs = {"answer": "...", "source_ids": ["DOC008"]}

# ID of the root trace.
trace_id = root.id
# To attach feedback to a child run instead, pick up that child run's id here.
some_run_id = root.id

# 👍 (Good)
client.create_feedback(
    trace_id=trace_id,
    key="thumbs_up",
    score=1,
    comment="ユーザーが満足",
)

# 👎 (Bad) — kept as the cell's final expression so the returned Feedback is displayed.
client.create_feedback(
    trace_id=trace_id,
    key="thumbs_down",
    score=0,
    comment="要改善：根拠が弱い",
)

Feedback(id=UUID('a6dce08e-6ef0-4a4c-b6fe-aedb9b6c8ea3'), created_at=datetime.datetime(2025, 11, 6, 9, 0, 42, 577043, tzinfo=datetime.timezone.utc), modified_at=datetime.datetime(2025, 11, 6, 9, 0, 42, 577049, tzinfo=datetime.timezone.utc), run_id=UUID('02746967-7ce0-46de-a95f-005a1ee92663'), trace_id=UUID('02746967-7ce0-46de-a95f-005a1ee92663'), key='thumbs_down', score=0, value=None, comment='要改善：根拠が弱い', correction=None, feedback_source=FeedbackSourceBase(type='api', metadata={}, user_id=None, user_name=None), session_id=None, comparative_experiment_id=None, feedback_group_id=None, extra=None)