In [1]:
import torch
# Environment check, one value per line:
#   1. installed PyTorch version
#   2. whether the MPS backend is available
#   3. whether this PyTorch build was compiled with MPS support
print(
    torch.__version__,
    torch.backends.mps.is_available(),
    torch.backends.mps.is_built(),
    sep="\n",
)

2.6.0
True
True


In [2]:
import warnings

# Install a blanket "ignore" filter so warnings are not shown in later cells.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
from langchain_ollama import ChatOllama

# LLM used for the reasoning step. "</think>" is registered as a stop sequence,
# presumably so generation ends once the model closes its thinking section.
reasoning_llm = ChatOllama(model="deepseek-r1:7b", stop=["</think>"])

# LLM used to write the final answer; temperature is pinned to 0.
answer_llm = ChatOllama(model="exaone3.5", temperature=0)

In [4]:
from typing import Annotated, List, TypedDict, Literal
from langgraph.graph.message import add_messages
from langchain_core.documents import Document

class RAGState(TypedDict):
    """State schema shared by the nodes of the RAG graph.

    Each node receives this mapping and returns a partial update whose keys
    must match the field names declared here.
    """

    # User question; read by classify_node, retrieve, reasoning and generate.
    query: str
    # Reasoning trace written by reasoning() and read by generate().
    thinking: str
    # Retrieved documents. The key has to be spelled `documents`: retrieve()
    # writes it and reasoning()/generate() read it under exactly that name
    # (it was previously declared as `Document`, which no node uses).
    documents: List[Document]
    # Final answer text written by generate().
    answer: str
    # Message history; updates are combined through the add_messages reducer.
    messages: Annotated[List, add_messages]
    # Routing decision written by classify_node and returned by route_by_mode.
    mode: str

In [5]:
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType

# Location of the PDF to ingest (an arXiv URL passed straight to the loader).
FILE_PATH = "https://arxiv.org/pdf/2106.09685"

# Request Markdown export from the loader; the next cell splits the result
# on Markdown headers.
loader = DoclingLoader(file_path=FILE_PATH, export_type=ExportType.MARKDOWN)

docs = loader.load()

In [6]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Split on the first three Markdown heading levels:
# ("#", "Header_1"), ("##", "Header_2"), ("###", "Header_3").
splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#" * level, f"Header_{level}") for level in (1, 2, 3)]
)

# Flatten the per-document chunk lists into a single list of chunks.
splits = [
    chunk
    for source_doc in docs
    for chunk in splitter.split_text(source_doc.page_content)
]

# Preview the first three chunks. The `=` specifier echoes the expression text,
# so the loop variable name is part of the printed output.
for d in splits[:3]:
    print(f"-{d.page_content=}")
print("...")

-d.page_content='Edward Hu  \n∗  \nYelong Shen  \n∗  \nPhillip Wallis Zeyuan Allen-Zhu Lu Wang Weizhu Chen  \nYuanzhi Li  \nShean Wang  \nMicrosoft Corporation  \n{ edwardhu, yeshe, phwallis, zeyuana, yuanzhil, swang, luw, wzchen @microsoft.com } yuanzhil@andrew.cmu.edu  \n(Version 2)'
-d.page_content='An important paradigm of natural language processing consists of large-scale pretraining on general domain data and adaptation to particular tasks or domains. As we pre-train larger models, full fine-tuning, which retrains all model parameters, becomes less feasible. Using GPT-3 175B as an example - deploying independent instances of fine-tuned models, each with 175B parameters, is prohibitively expensive. We propose Lo wR ank A daptation, or LoRA, which freezes the pretrained model weights and injects trainable rank decomposition matrices into each layer of the Transformer architecture, greatly reducing the number of trainable parameters for downstream tasks. Compared to GPT-3 175B fine

In [13]:
from IPython.display import Markdown

# Render the chunk at index 12 as Markdown to inspect it (the captured output
# shows this chunk contains a table).
display(Markdown(splits[12].page_content))

Having shown that LoRA can be a competitive alternative to full fine-tuning on NLU, we hope to answer if LoRA still prevails on NLG models, such as GPT-2 medium and large (Radford et al., b). We keep our setup as close as possible to Li &amp; Liang (2021) for a direct comparison. Due to space constraint, we only present our result on E2E NLG Challenge (Table 3) in this section. See Section F.1 for results on WebNLG (Gardent et al., 2017) and DART (Nan et al., 2020). We include a list of the hyperparameters used in Section D.3.  
Table 4: Performance of different adaptation methods on GPT-3 175B. We report the logical form validation accuracy on WikiSQL, validation accuracy on MultiNLI-matched, and Rouge-1/2/L on SAMSum. LoRA performs better than prior approaches, including full fine-tuning. The results on WikiSQL have a fluctuation around ± 0 5% . , MNLI-m around ± 0 1% . , and SAMSum around ± 0 2 . / ± 0 2 . / ± 0 1 . for the three metrics.  
| Model&Method       | # Trainable Parameters   |   WikiSQL Acc. (%) |   MNLI-m Acc. (%) | SAMSum R1/R2/RL   |
|--------------------|--------------------------|--------------------|-------------------|-------------------|
| GPT-3 (FT)         | 175,255.8M               |               73.8 |              89.5 | 52.0/28.0/44.5    |
| GPT-3 (BitFit)     | 14.2M                    |               71.3 |              91   | 51.3/27.4/43.5    |
| GPT-3 (PreEmbed)   | 3.2M                     |               63.1 |              88.6 | 48.3/24.2/40.5    |
| GPT-3 (PreLayer)   | 20.2M                    |               70.1 |              89.5 | 50.8/27.3/43.5    |
| GPT-3 (Adapter H ) | 7.1M                     |               71.9 |              89.8 | 53.0/28.9/44.8    |
| GPT-3 (Adapter H ) | 40.1M                    |               73.2 |              91.5 | 53.2/29.0/45.1    |
| GPT-3 (LoRA)       | 4.7M                     |               73.4 |              91.7 | 53.8/29.8/45.9    |
| GPT-3 (LoRA)       | 37.7M                    |               74   |              91.6 | 53.4/29.2/45.1    |

In [14]:
from langchain_ollama import OllamaEmbeddings

# Embedding model served through Ollama; used below to index the chunks.
embeddings = OllamaEmbeddings(model="bge-m3:latest")

In [15]:
from langchain_qdrant import QdrantVectorStore
from langchain_qdrant import RetrievalMode

# Embed every chunk and index it in an in-memory Qdrant collection using
# dense retrieval.
vector_store = QdrantVectorStore.from_documents(
    documents=splits,
    embedding=embeddings,
    location=":memory:",
    # Fixed keyword: this was misspelled `collectio_name`, so the intended
    # name was never passed as the `collection_name` argument.
    collection_name="rag_collection_0228",
    retrieval_mode=RetrievalMode.DENSE,
)

# First-stage retriever returning 10 candidates per query.
# NOTE: the variable name `retriver` (sic) is kept because the reranking cell
# refers to it by this name.
retriver = vector_store.as_retriever(search_kwargs={"k": 10})

In [16]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

# Cross-encoder model used to rerank the retrieved candidates.
model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base")

# Reranker configured with top_n=5 (the base retriever is set up with k=10).
compressor = CrossEncoderReranker(model=model, top_n=5)

# Retriever that runs the base retriever, then applies the reranker.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriver,
    base_compressor=compressor,
)

In [17]:
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

from langgraph.graph import START, StateGraph, END

def classify_node(state: RAGState):
    """Classify the question and choose the processing mode.

    A query containing the substring "Docling" is routed to retrieval;
    every other query goes directly to generation.

    Returns:
        A partial state update ``{"mode": "retrieve"}`` or ``{"mode": "generate"}``.
    """
    needs_retrieval = "Docling" in state["query"]
    if not needs_retrieval:
        print("=====생성 시작=====")
        return {"mode": "generate"}
    print("====검색 시작=====")
    return {"mode": "retrieve"}

def route_by_mode(state: RAGState) -> Literal["retrieve", "generate"]:
    """Return the mode stored in the state so the graph can pick the next step."""
    selected_mode = state["mode"]
    return selected_mode

def retrieve(state: RAGState):
    """Look up documents relevant to the query.

    Invokes ``compression_retriever`` with the query, prints each returned
    document followed by a separator line, and returns the documents as a
    partial state update under ``documents``.
    """
    query = state["query"]
    print("===검색 시작===")
    documents = compression_retriever.invoke(query)

    separator = "-" * 100
    for hit in documents:
        print(hit.page_content)
        print(separator)

    print("====검색 완료====")
    return {"documents": documents}

def reasoning(state: RAGState):
    """Analyze the query against the retrieved documents and produce a reasoning trace.

    Args:
        state: Graph state; ``query`` and ``documents`` are read.

    Returns:
        A partial state update ``{"thinking": <output of the reasoning chain>}``.
    """
    query = state["query"]
    documents = state["documents"]
    # Concatenate document bodies, separated by a blank line, as the prompt context.
    context = "\n\n".join(doc.page_content for doc in documents)
    reasoning_prompt = ChatPromptTemplate.from_template(
        """주어진 문서를 활용하여 사용자의 질문에 가장 적절한 답변을 작성해주세요.
        
        질문: {query}
        문서 내용:
        {context}

        상세 추론:"""
    )

    reasoning_chain = reasoning_prompt | reasoning_llm | StrOutputParser()
    print("====추론 시작====")
    thinking = reasoning_chain.invoke({"query": query, "context": context})

    # The key must be "thinking": that is the field RAGState declares and the
    # one generate() reads. It was previously misspelled "thinkinh".
    return {"thinking": thinking}

def generate(state: RAGState):
    """Produce the final answer from the query plus any reasoning trace and documents.

    The graph can route from ``classify`` straight to this node, in which case
    neither ``thinking`` nor ``documents`` has been written yet; both are
    therefore treated as optional and default to empty.

    Args:
        state: Graph state; ``query`` is required, ``thinking`` and
            ``documents`` are used when present.

    Returns:
        A partial state update with ``answer`` (the chain output) and
        ``messages`` (a one-element list wrapping the answer).
    """
    query = state["query"]
    # Optional inputs: absent on the direct classify -> generate route.
    thinking = state.get("thinking") or ""
    documents = state.get("documents") or []

    context = "\n\n".join(doc.page_content for doc in documents)

    answer_prompt = ChatPromptTemplate.from_template(
        """사용자의 질문에 한글로 답변하세요. 제공된 문서와 추론 과정이 있다면, 최대한 활용하세요.
        
        질문: 
        {query}

        추론 과정:
        {thinking}

        문서 내용:
        {context}

        답변:"""
    )
    # StrOutputParser must be instantiated; the class itself was piped before.
    answer_chain = answer_prompt | answer_llm | StrOutputParser()
    # The variable name has to match the {thinking} placeholder in the template
    # (it was previously passed as "thingking").
    answer = answer_chain.invoke({
        "query": query,
        "thinking": thinking,
        "context": context
    })
    # Report completion only after the answer has actually been generated.
    print("=====답변 생성 완료====")
    return {
        "answer": answer,
        # NOTE(review): the model's answer is stored as a HumanMessage; an AI
        # message type may be the intended one. Confirm before changing.
        "messages": [HumanMessage(content = answer)]
    }



In [18]:
from langgraph.checkpoint.memory import MemorySaver

# Graph whose shared state follows the RAGState schema.
workflow = StateGraph(RAGState)

# Register each node under the name the edges below refer to.
workflow.add_node("classify", classify_node)
workflow.add_node("reasoning", reasoning)
workflow.add_node("retrieve", retrieve)
workflow.add_node("generate", generate)

# Classification is always the first step.
workflow.add_edge(START, "classify")

# route_by_mode returns the stored mode; each mode name doubles as the name of
# the node to run next, so the path map is an identity mapping.
workflow.add_conditional_edges(
    "classify",
    route_by_mode,
    {mode: mode for mode in ("retrieve", "generate")},
)

# Retrieval route: retrieve -> reasoning -> generate, then finish.
workflow.add_edge("retrieve", "reasoning")
workflow.add_edge("reasoning", "generate")
workflow.add_edge("generate", END)

<langgraph.graph.state.StateGraph at 0x31a3f0e90>