RAG-LangChain llm

In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.embeddings import OllamaEmbeddings
from langchain.chains import RetrievalQA

# 단계 1: 문서 로드(Load Documents)
loader = PyMuPDFLoader("/data1/workspace/pdfs/0008-5472_can-12-4183v1.pdf")
docs = loader.load()

# 단계 2: 문서 분할(Split Documents)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_documents = text_splitter.split_documents(docs)

# 단계 3: 임베딩(Embedding) 생성
# embeddings = OpenAIEmbeddings()
embeddings = OllamaEmbeddings(
    model="nomic-embed-text",
    base_url="http://localhost:11434"  # 기본 Ollama 서버 주소
)

# 단계 4: DB 생성(Create DB) 및 저장
# 벡터스토어를 생성합니다.
vectorstore = FAISS.from_documents(documents=split_documents, embedding=embeddings)

# 단계 5: 검색기(Retriever) 생성
# 문서에 포함되어 있는 정보를 검색하고 생성합니다.
retriever = vectorstore.as_retriever()

# 단계 6: 프롬프트 생성(Create Prompt)
# 프롬프트를 생성합니다.



In [2]:
from langchain.llms.base import LLM
from typing import Optional, List, Any
import subprocess

class OllamaLLM(LLM):
    model_name: str = "gpt-oss"
    temperature: float = 0.0

    def _call(
        self, 
        prompt: str, 
        stop: Optional[List[str]] = None,
        run_manager: Optional[Any] = None
    ) -> str:
        result = subprocess.run(
            ['ollama', 'run', self.model_name, prompt],
            capture_output=True, 
            text=True
        )
        
        if result.returncode != 0:
            raise RuntimeError(f"Ollama LLM failed: {result.stderr}")
        return result.stdout.strip()

    @property
    def _identifying_params(self):
        return {"model_name": self.model_name, "temperature": self.temperature}

    @property
    def _llm_type(self):
        return "ollama"
    
# 단계 7: 언어모델(LLM) 생성
# 모델(LLM) 을 생성합니다.
llm = OllamaLLM(model_name="gpt-oss", temperature=0)

# 단계 8: 체인(Chain) 생성
from langchain.chains import RetrievalQA

chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff"  # 또는 "map_reduce", "refine", "map_rerank"
)

In [3]:
import requests
from typing import Optional, List, Any
from langchain.llms.base import LLM

class OllamaLLM(LLM):
    model_name: str = "gpt-oss"
    base_url: str = "http://localhost:11434"
    timeout: int = 300  # 13GB 모델이라 넉넉히 설정
    
    def _call(
        self, 
        prompt: str, 
        stop: Optional[List[str]] = None,
        run_manager: Optional[Any] = None
    ) -> str:
        url = f"{self.base_url}/api/generate"
        payload = {
            "model": self.model_name,
            "prompt": prompt,
            "stream": False
        }
        
        try:
            response = requests.post(url, json=payload, timeout=self.timeout)
            response.raise_for_status()
            return response.json()['response']
        except requests.Timeout:
            raise RuntimeError(f"Ollama timed out after {self.timeout}s")
        except Exception as e:
            raise RuntimeError(f"Ollama API error: {str(e)}")
    
    @property
    def _llm_type(self) -> str:
        return "ollama"

# 사용
llm = OllamaLLM(model_name="gpt-oss")

In [4]:
# # 1. LLM 직접 테스트
llm = OllamaLLM(model_name="gpt-oss")
print(llm("What is 2+2?"))

# 2. 체인 실행
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    verbose=True
)

response = chain({"query": "논문에서 주된 신약명은?"})
print(response['result'])

  print(llm("What is 2+2?"))
  response = chain({"query": "논문에서 주된 신약명은?"})
Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")


4

[1m> Finished chain.[0m
논문에서 주된 신약명은 **스위티니부(Sunitinib)** 입니다.


In [5]:
from langchain.prompts import PromptTemplate

# 커스텀 프롬프트 정의
prompt_template = """
You are a biomedical literature analysis assistant specialized in pharmacology and oncology.

==== Document Excerpt Start ====
{context}
==== Document Excerpt End ====

Task:
Extract all **drug names** (both generic and brand names) that are experimentally **tested, administered, or evaluated** in the study.

Follow these expert-level rules:

1. Include drugs explicitly mentioned in the **Results** or **Methods** sections as being:
   - administered, treated, exposed, injected, or tested,  
   - or associated with measurable outcomes such as tumor volume, cancer size, toxicity, response rate, or doses tested.
2. Include drugs that appear in **figures or tables**, even if not mentioned in the main text.
   - e.g., drugs listed in figure captions, chart labels, or legends.
3. Exclude drugs that appear **only** in the 'References' section.
4. Include both **generic names** (e.g., cisplatin) and **brand names** (e.g., Platinol).
5. Exclude gene names, protein names, molecular targets, and pathway components (e.g., EGFR, HER2, mTOR).
6. Use logical reasoning to determine whether each drug was **actually used or tested** in the described experiments — not just mentioned conceptually or mechanistically.
7. Extract only unique names (no duplicates).

---

Think step by step:
1. Identify all candidate drug names (generic or brand).  
2. Check whether the text or figures indicate that these drugs were **experimentally used, tested, or administered**.  
3. Exclude any names mentioned solely in the References or in mechanistic/pathway discussions.  
4. Combine both text-based and figure-based mentions into one list.  
5. Output the final unique list of drug names that were experimentally involved in the study.

Output format:
List all unique drug names separated by semicolons (;), with no explanation or numbering.

Example output:
cisplatin; afatinib; Gilotrif; erlotinib

Question: {question}

Answer:
"""


PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# RetrievalQA에 적용
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
    verbose=True
)

# 실행
question = "Extract generic drug names from this paper"
response = chain({"query": question})
print(response['result'])

Error in StdOutCallbackHandler.on_chain_start callback: AttributeError("'NoneType' object has no attribute 'get'")



[1m> Finished chain.[0m
sunitinib; pazopanib; DC101; paclitaxel; cyclophosphamide; tegafur; uracil; 5-fluorouracil
