### RAG-LangChain llm

jpeg 파일 따로 안 만들고 cache에 저장, 다른 모델 재현시 갖다씀

In [6]:
import os
import requests
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.llms.base import LLM
from typing import Optional, List, Any

# ==============================
# 0️⃣ 설정
# ==============================
OLLAMA_URL = "http://localhost:11434"
VISION_MODEL = "llava"
LLM_MODEL = "gpt-oss"
EMBED_MODEL = "nomic-embed-text"
pdf_path = "/data1/workspace/pdfs/1.pdf"

# ==============================
# 1️⃣ PDF → 문서 로드
# ==============================
print("[1] PDF 로드 중...")
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()

# ==============================
# 2️⃣ 텍스트 분할
# ==============================
print("[2] 문서 분할 중...")
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
splits = splitter.split_documents(docs)

# ==============================
# 3️⃣ 임베딩 생성
# ==============================
print("[3] 임베딩 생성 중...")
embeddings = OllamaEmbeddings(model=EMBED_MODEL, base_url=OLLAMA_URL)
vectorstore = FAISS.from_documents(splits, embedding=embeddings)
retriever = vectorstore.as_retriever()

# ==============================
# 4️⃣ Vision 요약 함수
# ==============================
def extract_and_summarize_figures(pdf_path, model=VISION_MODEL):
    print("[4] Vision 요약 중...")
    import fitz  # PyMuPDF
    from PIL import Image
    import io, base64

    summaries = []
    doc = fitz.open(pdf_path)

    for page_idx, page in enumerate(doc):
        image_list = page.get_images(full=True)
        for img_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image = Image.open(io.BytesIO(image_bytes))

            # 이미지 base64로 변환
            buffered = io.BytesIO()
            image.save(buffered, format="JPEG")
            img_b64 = base64.b64encode(buffered.getvalue()).decode()

            # Vision 모델 호출
            payload = {
                "model": model,
                "prompt": "Describe any figures, tables, or data shown in this image briefly.",
                "images": [img_b64],
                "stream": False
            }
            try:
                response = requests.post(f"{OLLAMA_URL}/api/generate", json=payload, timeout=120)
                response.raise_for_status()
                result = response.json().get("response", "")
                if result:
                    summaries.append(f"Page {page_idx+1}-{img_index+1}: {result}")
            except Exception as e:
                print(f"⚠️ Vision 분석 오류 (page {page_idx+1}): {e}")
    doc.close()
    return "\n".join(summaries) if summaries else ""

# ==============================
# 5️⃣ LLM 정의 (Ollama API)
# ==============================
class OllamaLLM(LLM):
    model_name: str = LLM_MODEL
    base_url: str = OLLAMA_URL
    timeout: int = 300

    def _call(self, prompt: str, stop: Optional[List[str]] = None, run_manager: Optional[Any] = None) -> str:
        payload = {"model": self.model_name, "prompt": prompt, "stream": False}
        try:
            r = requests.post(f"{self.base_url}/api/generate", json=payload, timeout=self.timeout)
            r.raise_for_status()
            return r.json()["response"]
        except Exception as e:
            raise RuntimeError(f"Ollama LLM 오류: {e}")

    @property
    def _llm_type(self) -> str:
        return "ollama"

# ==============================
# 6️⃣ 프롬프트 생성
# ==============================
prompt_template = """
You are a biomedical text analysis assistant.

Your goal is to extract **only the drugs that were experimentally tested, administered, or directly used in the study**.
Focus on precision — if uncertain, do not include the drug.

==== Document Excerpt Start ====
{context}
==== Document Excerpt End ====

Guidelines:
- Prefer drugs explicitly described as being *tested*, *treated*, *administered*, or *used* in the experiments.
- Exclude drugs mentioned only in background, discussion, or references.
- Ignore drugs that appear as examples, related compounds, or comparative mentions unless they were actually used.
- Merge WordPiece fragments into full drug names.
- Remove duplicates.
- Extract **at least 1** and **at most 3** drug names.
- If no clear experimental drugs are found, return "None".
- Output only the extracted drug names, separated by semicolons (;), with no extra text or explanation.

{vision_context}
==== End ====
Answer:
"""

PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "vision_context", "question"]
)

# ==============================
# 7️⃣ 체인 구성
# ==============================
print("[5] 체인 구성 중...")
llm = OllamaLLM()
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
    verbose=False
)

# ==============================
# 8️⃣ 실행
# ==============================
print("[6] Vision 모델 실행 중...")
vision_summary = extract_and_summarize_figures(pdf_path, model=VISION_MODEL)

print("[7] 질의 실행 중...")
question = "Extract generic drug names from this biomedical paper."

inputs = {
    "query": question,
    "vision_context": vision_summary if vision_summary else ""
}
response = chain(inputs)


try:
    response = chain(inputs)
    print("\n✅ 결과:")
    print(response["result"])
except Exception as e:
    print(f"⚠️ LLM 실행 오류: {e}")


[1] PDF 로드 중...
[2] 문서 분할 중...
[3] 임베딩 생성 중...
[5] 체인 구성 중...
[6] Vision 모델 실행 중...
[4] Vision 요약 중...


KeyboardInterrupt: 