### NER-augmented RAG

ollama llm 모델 클래스 정의

In [5]:
# ==========================
# Step 0.5: OllamaLLM 정의
# ==========================
import requests
from langchain.llms.base import LLM
from typing import Optional, List, Any

class OllamaLLM(LLM):
    """Minimal LangChain LLM wrapper around an Ollama server.

    Sends one non-streaming request to ``{base_url}/api/generate`` and
    returns the ``response`` field of the JSON reply.
    """

    model_name: str = "gpt-oss"
    base_url: str = "http://localhost:11434"
    timeout: int = 300  # generous on purpose: the model is large and slow to answer
    # Sampling temperature forwarded in the request "options".
    # None means "do not send one" and leaves the server default in place.
    temperature: Optional[float] = None

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Full prompt text sent to the model.
            stop: Optional stop sequences, forwarded to the server.
            run_manager: Accepted for LangChain compatibility; unused.
            **kwargs: Extra arguments LangChain may pass through; ignored.

        Returns:
            The generated text.

        Raises:
            RuntimeError: On timeout, HTTP error, or an unexpected reply.
                The original exception is chained as ``__cause__``.
        """
        url = f"{self.base_url}/api/generate"
        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": False
        }

        # Only send "options" when there is something to set, so a default
        # instance produces exactly the same request as before.
        options = {}
        if self.temperature is not None:
            options["temperature"] = self.temperature
        if stop:
            options["stop"] = list(stop)
        if options:
            payload["options"] = options

        try:
            response = requests.post(url, json=payload, timeout=self.timeout)
            response.raise_for_status()
            return response.json()['response']
        except requests.Timeout as e:
            raise RuntimeError(f"Ollama timed out after {self.timeout}s") from e
        except Exception as e:
            # Covers HTTP errors, connection failures, invalid JSON and a
            # reply without a 'response' key.
            raise RuntimeError(f"Ollama API error: {str(e)}") from e

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses for this LLM implementation."""
        return "ollama"

# Confirmation message printed once the class definition above has run.
print("Step 0.5: OllamaLLM 클래스 정의 완료")


Step 0.5: OllamaLLM 클래스 정의 완료


PDF → RAG → BioNER → 후처리 → 최종 drug list → 정답셋 대비 precision 계산

In [7]:
# ==========================
# Step 1: 라이브러리 로드
# ==========================
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import re

# ==========================
# Step 2: Load the PDF
# ==========================
pdf_path = "/data1/workspace/pdfs/5.pdf"  # path to the PDF
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()

# ==========================
# Step 3: Split the text into chunks
# ==========================
# Chunk size 500 with an overlap of 50 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_documents = text_splitter.split_documents(docs)

# ==========================
# Step 4: Embeddings + FAISS vector store
# ==========================
# Embeddings come from the same local Ollama server the LLM uses.
embeddings = OllamaEmbeddings(model="nomic-embed-text", base_url="http://localhost:11434")
vectorstore = FAISS.from_documents(documents=split_documents, embedding=embeddings)
# Retriever with the vector store's default search settings.
retriever = vectorstore.as_retriever()



In [8]:
# ==========================
# Step 5: Load the BioNER model
# ==========================
ner_model_name = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(ner_model_name)
model = AutoModelForTokenClassification.from_pretrained(
    ner_model_name,
    device_map="auto"  # let the library pick the device placement
)

# Token-classification pipeline. With aggregation_strategy="simple" the
# results are grouped entities; extract_drugs reads their 'entity_group',
# 'score' and 'word' keys.
ner_pipe = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple"
)

# ==========================
# Step 6: NER extraction + WordPiece post-processing
# ==========================
def extract_drugs(text, ner_pipe, min_score=0.9):
    """Return the unique drug names the NER pipeline finds in ``text``.

    Only entities whose ``entity_group`` is ``'Medication'`` and whose
    ``score`` is at least ``min_score`` are kept. A WordPiece continuation
    (``word`` starting with ``"##"``) is merged into the preceding kept
    entity only when it really continues it. Otherwise it starts its own
    entry instead of being glued onto an unrelated earlier drug.

    Args:
        text: Text to analyse.
        ner_pipe: Callable returning a list of entity dicts with the keys
            ``entity_group``, ``score`` and ``word``. ``start`` / ``end``
            character offsets are used when present.
        min_score: Minimum score for an entity to be kept.

    Returns:
        List of unique drug names in order of first appearance.
    """
    drugs = []
    current_word = ""
    current_end = None  # end offset of current_word, when offsets are available

    for token in ner_pipe(text):
        is_drug = (
            token['entity_group'] == 'Medication'
            and token['score'] >= min_score
        )
        if not is_drug:
            # Any other entity breaks the chain, so a later "##" fragment
            # cannot be appended to the drug seen before it.
            if current_word:
                drugs.append(current_word)
            current_word, current_end = "", None
            continue

        word = token['word']
        start = token.get('start')
        end = token.get('end')

        if word.startswith("##"):
            piece = word[2:]
            # Without offsets, fall back to list adjacency.
            adjacent = start is None or current_end is None or start == current_end
            if current_word and adjacent:
                current_word += piece
                current_end = end
                continue
            # Orphan fragment: keep it as its own entry.
            word = piece

        if current_word:
            drugs.append(current_word)
        current_word, current_end = word, end

    if current_word:
        drugs.append(current_word)

    # Deduplicate while keeping first-seen order, so results are reproducible.
    return list(dict.fromkeys(drugs))

# Smoke test: run the extractor on the first chunk.
test_text = split_documents[0].page_content
drugs_in_chunk = extract_drugs(test_text, ner_pipe)
# print("Step 6: 첫 번째 Chunk에서 추출된 drug names:", drugs_in_chunk)

# ==========================
# Step 6: NER extraction + WordPiece post-processing
# ==========================
# NOTE: this cell defines extract_drugs twice. This later definition is the
# one bound to the name once the cell has run.
def extract_drugs(text, ner_pipe, min_score=0.9):
    """Return the unique drug names the NER pipeline finds in ``text``.

    Only entities whose ``entity_group`` is ``'Medication'`` and whose
    ``score`` is at least ``min_score`` are kept. A WordPiece continuation
    (``word`` starting with ``"##"``) is merged into the preceding kept
    entity only when it really continues it. Otherwise it starts its own
    entry instead of being glued onto an unrelated earlier drug.

    Args:
        text: Text to analyse.
        ner_pipe: Callable returning a list of entity dicts with the keys
            ``entity_group``, ``score`` and ``word``. ``start`` / ``end``
            character offsets are used when present.
        min_score: Minimum score for an entity to be kept.

    Returns:
        List of unique drug names in order of first appearance.
    """
    drugs = []
    current_word = ""
    current_end = None  # end offset of current_word, when offsets are available

    for token in ner_pipe(text):
        is_drug = (
            token['entity_group'] == 'Medication'
            and token['score'] >= min_score
        )
        if not is_drug:
            # Any other entity breaks the chain, so a later "##" fragment
            # cannot be appended to the drug seen before it.
            if current_word:
                drugs.append(current_word)
            current_word, current_end = "", None
            continue

        word = token['word']
        start = token.get('start')
        end = token.get('end')

        if word.startswith("##"):
            piece = word[2:]
            # Without offsets, fall back to list adjacency.
            adjacent = start is None or current_end is None or start == current_end
            if current_word and adjacent:
                current_word += piece
                current_end = end
                continue
            # Orphan fragment: keep it as its own entry.
            word = piece

        if current_word:
            drugs.append(current_word)
        current_word, current_end = word, end

    if current_word:
        drugs.append(current_word)

    # Deduplicate while keeping first-seen order, so results are reproducible.
    return list(dict.fromkeys(drugs))

# Smoke test: run the extractor on the first chunk.
test_text = split_documents[0].page_content
drugs_in_chunk = extract_drugs(test_text, ner_pipe)
# print("Step 6: 첫 번째 Chunk에서 추출된 drug names:", drugs_in_chunk)


Device set to use cuda:0


In [None]:
# ==========================
# Step 7: Extract drugs from the whole PDF
# ==========================
# Run the NER extractor on every chunk and pool the names. The set drops
# names repeated across chunks; sorting makes the final list reproducible
# between runs (plain set order is not).
all_drugs = set()
for chunk in split_documents:
    all_drugs.update(extract_drugs(chunk.page_content, ner_pipe))
all_drugs = sorted(all_drugs)

# ==========================
# Step 8: RAG + OllamaLLM 연동 (정리)
# ==========================
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Create the LLM object.
# temperature=0 asks for deterministic decoding; it only takes effect if
# OllamaLLM forwards the value to the server.
llm = OllamaLLM(model_name="gpt-oss", temperature=0)
# print("Step 8: OllamaLLM 객체 생성 완료")

# Prompt definition.
# The retrieved chunks are substituted for {context}. The template has no
# placeholder for the query text itself.
prompt_template = """
You are a biomedical text analysis assistant.

Your task is to extract **only the drugs that were actually tested, administered, or part of the experiments** in the provided document chunks.
Do not include drugs that are mentioned only in the background, references, or comparison sections.

==== Document Excerpt Start ====
{context}
==== Document Excerpt End ====

Guidelines:
- Extract drugs mentioned in the 'Results' or 'Methods' sections preferably.
- Include drugs mentioned in Figures and Tables if they were part of the experiments.
- Exclude gene names, proteins, pathways, and drugs only cited from literature.
- Merge WordPiece fragments into full drug names.
- Remove duplicates.
- List up to 3 most relevant drugs if more are found.
- Output as a semicolon-separated list.

Answer:
"""

# Wrap the template; "context" is its only input variable.
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context"]
)

# Build the RetrievalQA chain.
# chain_type="stuff" with the custom PROMPT; the chain also returns the
# source documents it retrieved.
# NOTE(review): the recorded output of this cell shows an
# "Error in StdOutCallbackHandler.on_chain_start callback" message —
# presumably triggered by verbose=True; confirm and consider disabling it.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
    verbose=True
)
# print("Step 8: RetrievalQA 체인 생성 완료")

# Concatenate the text of every chunk in the PDF.
all_text = " ".join(chunk.page_content for chunk in split_documents)
# print("Step 8: 모든 chunk 합치기 완료, 길이:", len(all_text))

# Run RAG + LLM.
# invoke() replaces the deprecated direct call chain({...}); the result is
# the same dict, with the answer under 'result'.
# NOTE(review): the whole document text is used as the retrieval query.
# PROMPT only consumes {context}, so the query only decides which chunks
# are retrieved — confirm whether a short, focused question would retrieve
# better chunks.
response = chain.invoke({"query": all_text})
rag_drugs = response['result']
print("Step 8: RAG + LLM로 추출된 Drug Names:\n", rag_drugs)


Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")



[1m> Finished chain.[0m
Step 8: RAG + LLM로 추출된 Drug Names:
 mAb MDR1-modified chitosan nanoparticles; liensinine; Acetyl-bufalin


프롬프트 수정 예시

프롬프트 1: 꽤 괜찮음. 다만 약물 3개도 많아서 1개로 개선함


In [None]:
# Prompt variant 1: extract experimentally used drugs, at most 3, as a
# semicolon-separated list. Rebinding prompt_template does not change a
# PromptTemplate that was already built from an earlier string.
prompt_template = """
You are a biomedical text analysis assistant.

Your task is to extract **only the drugs that were actually tested, administered, or part of the experiments** in the provided document chunks.
Do not include drugs that are mentioned only in the background, references, or comparison sections.

==== Document Excerpt Start ====
{context}
==== Document Excerpt End ====

Guidelines:
- Extract drugs mentioned in the 'Results' or 'Methods' sections preferably.
- Include drugs mentioned in Figures and Tables if they were part of the experiments.
- Exclude gene names, proteins, pathways, and drugs only cited from literature.
- Merge WordPiece fragments into full drug names.
- Remove duplicates.
- List up to 3 most relevant drugs if more are found.
- Output as a semicolon-separated list.

Answer:
"""

프롬프트 2: 실제 약물 위주, 사용하지 않는 프롬프트

In [None]:
# Prompt variant 2: stricter exclusions (assay reagents, targets, herbal
# components), includes co-administered drugs, at most 3 names. The
# notebook note above marks this prompt as not in use.
prompt_template = """
You are a biomedical text analysis assistant.

Your task is to extract **only the drugs that were actually tested, administered, or part of the experiments** in the provided document chunks.
Do not include drugs that are mentioned only in the background, discussion, references, or comparison sections.

==== Document Excerpt Start ====
{context}
==== Document Excerpt End ====

Guidelines:
- Extract drugs mentioned in the 'Results' or 'Methods' sections preferably.
- Include drugs mentioned in Figures and Tables only if they were used in the main experiments.
- Exclude drugs mentioned only as examples, background information, or in cited papers.
- Exclude gene names, proteins, signaling pathways, or assay reagents (e.g., DMSO, PBS, MTT).
- If the text describes a combination therapy, include all drugs that were co-administered.
- Merge WordPiece fragments into complete drug names.
- Remove duplicates.
- List up to **3 most relevant drugs** directly used in experiments.
- Verify that each extracted term is an **actual drug name or formulation**, not a biological target or herbal component.
- Output as a **semicolon-separated list**, without any extra words or explanations.

Answer:
"""


프롬프트 3: 약물 최소 1개, 최대 3개까지

In [None]:
# Prompt variant 3: precision-first, between 1 and 3 drug names.
#
# Disabled guideline, kept here for reference. It used to sit inside the
# string with a leading "#", which does not comment anything out inside a
# string literal: the line was still sent to the model and contradicted
# the "at least 1" rule.
#   - If no clear experimental drugs are found, return "None".
prompt_template = """
You are a biomedical text analysis assistant.

Your goal is to extract **only the drugs that were experimentally tested, administered, or directly used in the study**.
Focus on precision — if uncertain, do not include the drug.

==== Document Excerpt Start ====
{context}
==== Document Excerpt End ====

Guidelines:
- Prefer drugs explicitly described as being *tested*, *treated*, *administered*, or *used* in the experiments.
- Exclude drugs mentioned only in background, discussion, or references.
- Ignore drugs that appear as examples, related compounds, or comparative mentions unless they were actually used.
- Merge WordPiece fragments into full drug names.
- Remove duplicates.
- Extract **at least 1** and **at most 3** drug names.
- Output only the extracted drug names, separated by semicolons (;), with no extra text or explanation.

Answer:
"""


프롬프트 4: None이 나올 경우(어차피 precision이 0이므로) 하나만 추출: vision 계열

In [None]:
# Prompt variant 4: when no explicit drug is found, the model is asked to
# infer the single most likely one rather than return nothing; up to 3
# names, preferably 1.
prompt_template = """
You are a biomedical text analysis assistant.

Extract drugs that were actually tested or administered in experiments from the following document chunks.
Prefer drugs mentioned in 'Results' or 'Methods' sections and in Figures/Tables.
Exclude drugs mentioned only in background, references, or literature.

==== Document Excerpt Start ====
{context}
==== Document Excerpt End ====

- If no explicit experimental drug names are found, infer the **most likely single drug** that was experimentally tested, based on context clues such as "treatment", "administration", or "dose".
- Merge WordPiece fragments into full drug names
- Remove duplicates
- Output up to 3 most relevant drugs (preferably 1 strong candidate)
- Output as a semicolon-separated list (e.g., cisplatin; gefitinib)

Answer:
"""


완전 한개만.. (오탐 줄임)

In [None]:
# Prompt variant: exactly one drug name, to cut false positives (per the
# notebook note above).
prompt_template = """
You are a biomedical text analysis assistant.

Extract **only the single most relevant experimental drug** that was actually tested or administered in the study.
Prefer drugs mentioned in 'Results' or 'Methods' sections and in Figures/Tables.
Exclude drugs mentioned only in background, references, or literature.

==== Document Excerpt Start ====
{context}
==== Document Excerpt End ====

Guidelines:
- If multiple drugs are mentioned, choose **the one most central to the experiment**.
- If no explicit experimental drug name is found, infer **one likely candidate** based on context clues like "treatment", "administration", or "dose".
- Merge WordPiece fragments into a complete drug name.
- Output exactly **one drug name** (no lists, no punctuation).

Answer: 
 """

1~3개인데, 멀티모달 극단적으로 1개로 가야함. 정답률 낮음

In [None]:
# Prompt variant: evidence-based rules (dosage, treatment phrases,
# comparison groups); 1–3 names, or "None" when nothing qualifies.
prompt_template = """
You are a biomedical text analysis assistant specialized in identifying **experimentally validated drug names**.

Your task is to extract only drug names that were **directly tested, administered, or dosed** in the experiments described below.

==== Document Excerpt Start ====
{context}
==== Document Excerpt End ====

Extraction Rules:
1. Include a drug **only if** there is experimental evidence such as:
   - Dosage (e.g., mg/kg, µM, concentration)
   - Treatment phrases (e.g., "treated with", "administered", "injected", "given", "dose of", "exposed to")
   - Comparative groups (e.g., "control vs. cisplatin-treated")
   - Drug combination explicitly tested together
2. Exclude drugs mentioned only as:
   - Background, literature, hypotheses, or reference drugs
   - Future directions or discussion
3. Prefer extracting **only one** main experimental drug name.
4. Add **up to 2 additional drugs** only if they are clearly and quantitatively involved in the same experiment (e.g., combination therapy).
5. Merge fragmented tokens into full drug names (e.g., "gefi" + "tinib" → "gefitinib").
6. Remove duplicates and unrelated items.
7. If no experimentally verified drug names are found, output “None”.

Output Format:
- 1–3 drug names, separated by semicolons (`;`)
- Preferably 1 strong candidate if the study focuses on a single agent
- Example: cisplatin; gefitinib

Answer:
"""

비전 모델: 3개를 뽑길래 우선 최대 1개, 그다음 최대 3개로 조정

In [None]:
# Prompt variant (vision model, per the notebook note above): one primary
# drug by default, up to 3 only with explicit evidence of several tested drugs.
prompt_template = """
You are a biomedical text analysis assistant.

Extract the **single most likely experimental drug** that was actually tested or administered in the following document chunks.
Prefer drugs mentioned in 'Results' or 'Methods' sections, or in Figures/Tables.
Exclude drugs mentioned only in background, references, or literature.

==== Document Excerpt Start ====
{context}
==== Document Excerpt End ====

Instructions:
- Prioritize identifying **one primary experimental drug**.
- Output **only one drug** unless there is clear and explicit evidence of **multiple (up to 3) different experimental drugs** tested.
- Merge WordPiece fragments into full drug names.
- Remove duplicates and spurious terms (e.g., buffer, saline, vehicle).
- If uncertain, choose the **most likely single candidate** based on treatment, administration, or dosage context.
- Output as a semicolon-separated list (e.g., cisplatin; gefitinib)

Answer:
"""


비전 모델 프롬프트: 완전히 1개만

In [None]:
# Prompt variant (vision model, per the notebook note above): exactly one
# drug name, best single guess, no lists or explanations.
prompt_template = """
You are a biomedical text analysis assistant.

Your task is to extract **exactly one** experimental drug name that was actually tested or administered in the following document excerpt.

==== Document Excerpt Start ====
{context}
==== Document Excerpt End ====

Instructions:
- Extract **exactly one** drug name — not two, not three, only one.
- Choose the **single most likely experimental drug** based on the context of treatment, administration, or dosage.
- If multiple drugs are mentioned, select the **main or most central one** (e.g., the primary treatment or intervention).
- Exclude drugs mentioned only in background, literature review, or control conditions.
- Do not include vehicles, buffers, or solutions (e.g., saline, DMSO, PBS).
- Merge WordPiece fragments into full drug names.
- If uncertain, make your **best single guess** rather than listing multiple possibilities.
- Output **only the drug name** — no lists, no punctuation, no explanations.

Answer:
"""
