## 검증 테스트 ##
LLM RAG 답변 생성 이후에 검증을 진행하는 것은 현실적으로 어려움이 있어 가능한 수준의 테스트를 진행했습니다. LLM RAG 답변 생성 시 하드웨어 이슈로 질문 한 개당 몇십 분이 소요되어 100개 이상의 데이터셋에서 실행하는 것이 불가능했습니다. 이에 이번 연구의 파이프라인을 참고하여 (1) 사용자 질문에서 PubMed query에 들어갈 단어를 잘 추출하는지(PubMed는 논문 DB로, 검색어에 실제 논문과 일치하는 문구가 포함되어야 결과를 반환하므로 일반 검색 엔진처럼 전체 질문을 쿼리로 삽입할 수 없습니다. 이에 사용자 질문에서 PubMed용 쿼리(NER 했을 때의 disease 혹은 gene)를 뽑아내고, 이를 통해 PubMed 논문을 검색한 뒤 전체 질문과 함께 이용해 semantic search하는 방식으로 구현하였습니다.), (2) PubMed용 쿼리를 토대로 검색했을 때 top-k 문장이 타당한지를 검토한 테스트입니다.

In [2]:
import json
import sys
from pathlib import Path
from time import time

# Put the parent directory (presumably the project root — confirm) on the
# import path so that the `src` package imported below can be resolved.
ROOT = Path("..").resolve()
sys.path.append(str(ROOT))

from src.retriever import retrieve_top_k_sentences, build_pubmed_query_from_question

[nltk_data] Downloading package punkt to /Users/eunseo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [8]:
# JSON Lines file holding the test cases; the path is relative to the
# current working directory.
TEST_FILE = Path("../data/test_queries.jsonl")

def load_tests(path: Path) -> list:
    """Load test cases from a JSON Lines file.

    Every non-blank line is parsed as one JSON document; blank and
    whitespace-only lines are skipped.

    Args:
        path: Location of the JSONL file to read.

    Returns:
        The parsed JSON values, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, so non-ASCII text in the file is read the same everywhere.
    with path.open(encoding="utf-8") as f:
        stripped_lines = (raw.strip() for raw in f)
        return [json.loads(entry) for entry in stripped_lines if entry]

def check_expect_in_query(pubmed_query: str, expect_tokens):
    """Tell whether the query contains every expected token, ignoring case.

    Matching is a plain substring test on the lower-cased query. An empty
    token collection yields True.
    """
    haystack = pubmed_query.lower()
    for token in expect_tokens:
        if token.lower() not in haystack:
            return False
    return True


def check_expect_in_results(rag_results, expect_keywords):
    """Tell whether any expected keyword appears in the retrieved texts.

    The "text" field of every result is joined with single spaces and
    lower-cased; each keyword is then checked as a case-insensitive
    substring of that combined string. An empty keyword collection
    yields False.
    """
    combined = " ".join(hit["text"] for hit in rag_results).lower()
    for keyword in expect_keywords:
        if keyword.lower() in combined:
            return True
    return False

In [9]:
def run_tests(retmax=20, k=5, max_tests=None, verbose=True):
    """Run the retrieval test cases and print aggregate pass rates.

    For each test case loaded from TEST_FILE, two checks are made:
    1. the PubMed query built from the question contains every token listed
       under "expect_in_query";
    2. the top-k retrieved sentences contain at least one keyword listed
       under "expect_in_results_any".

    Args:
        retmax: Forwarded to retrieve_top_k_sentences as ``retmax``
            (presumably the maximum number of PubMed records to fetch —
            confirm against the retriever).
        k: Forwarded to retrieve_top_k_sentences as ``k``; also used in the
            printed labels.
        max_tests: If not None, only the first ``max_tests`` cases are run.
        verbose: When True, print per-question details in addition to the
            final summary.

    Returns:
        None. The summary is written to stdout.
    """
    tests = load_tests(TEST_FILE)
    if max_tests is not None:
        tests = tests[:max_tests]

    summary = []
    t0 = time()

    for t in tests:
        qid = t["id"]
        question = t["question"]
        expect_q = t.get("expect_in_query", [])
        expect_r = t.get("expect_in_results_any", [])

        if verbose:
            print("=" * 80)
            print(f"[{qid}] Q: {question}")

        # 1) Check the generated PubMed query.
        pubmed_query = build_pubmed_query_from_question(question)
        ok_query = check_expect_in_query(pubmed_query, expect_q)
        if verbose:
            print("PubMed query:", pubmed_query)
            print("  - query contains expected tokens:", ok_query, "| expected:", expect_q)

        # 2) Run the RAG retrieval; only the ranked results are needed here.
        rag_results, _all_sentences, _used_query = retrieve_top_k_sentences(
            question,
            retmax=retmax,
            k=k,
        )

        if not rag_results:
            ok_results = False
            if verbose:
                print("  - NO RESULTS from PubMed")
        else:
            ok_results = check_expect_in_results(rag_results, expect_r)
            if verbose:
                print(f"Top-{k} sample sentences:")
                for r in rag_results:
                    print(f"   [{r['rank']}] score={r['score']:.4f} | {r['text'][:200]}")
                print("  - results contain expected keywords:", ok_results, "| expected:", expect_r)

        summary.append(
            {
                "id": qid,
                "question": question,
                "query_ok": ok_query,
                "results_ok": ok_results,
            }
        )

    elapsed = time() - t0

    # ----- Aggregate statistics -----
    total = len(summary)
    q_ok = sum(1 for s in summary if s["query_ok"])
    r_ok = sum(1 for s in summary if s["results_ok"])
    both_ok = sum(1 for s in summary if s["query_ok"] and s["results_ok"])

    # Every ratio is guarded so an empty test set (empty file or max_tests=0)
    # prints a zeroed summary instead of raising ZeroDivisionError.
    query_acc = q_ok / total if total else 0.0
    results_acc = r_ok / total if total else 0.0
    pipeline_acc = both_ok / total if total else 0.0
    avg_elapsed = elapsed / total if total else 0.0

    print("=" * 80)
    print(f"Total tests          : {total}")
    print(f"PubMed query OK      : {q_ok}/{total} ({query_acc*100:.1f}%)")
    print(f"Top-{k} results OK    : {r_ok}/{total} ({results_acc*100:.1f}%)")
    print(f"Both query+results OK: {both_ok}/{total} ({pipeline_acc*100:.1f}%)")
    print(f"Elapsed time         : {elapsed:.1f} s "
          f"(avg {avg_elapsed:.2f} s/query)")

In [11]:
# Run at most the first 100 test cases; verbose=False turns off run_tests'
# own per-question prints, leaving its aggregate summary.
run_tests(retmax=20, k=5, max_tests=100, verbose=False)

pubmed query: ("BRCA1"[Title/Abstract])
pubmed query: ("BRCA1"[Title/Abstract]) AND ("hereditary breast cancer"[Title/Abstract])
pubmed query: ("of"[Title/Abstract] OR "BRCA1"[Title/Abstract]) AND ("ovarian cancer"[Title/Abstract])
pubmed query: ("BRCA1"[Title/Abstract]) AND ("cancers"[Title/Abstract])
pubmed query: ("BRCA1"[Title/Abstract]) AND ("prostate cancer"[Title/Abstract])
pubmed query: ("BRCA1"[Title/Abstract] OR "in"[Title/Abstract]) AND ("pancreatic cancer"[Title/Abstract])
pubmed query: ("BRCA1"[Title/Abstract] OR "tube"[Title/Abstract]) AND ("fallopian tube cancer"[Title/Abstract])
pubmed query: ("BRCA1"[Title/Abstract]) AND ("peritoneal cancer"[Title/Abstract])
pubmed query: ("Do"[Title/Abstract] OR "BRCA1"[Title/Abstract]) AND ("breast cancer"[Title/Abstract])
pubmed query: ("BRCA1"[Title/Abstract]) AND ("non-breast cancers"[Title/Abstract])
pubmed query: ("BRCA2"[Title/Abstract])
pubmed query: ("BRCA2"[Title/Abstract]) AND ("ovarian cancer"[Title/Abstract] OR "hereditar