## env 로드

In [1]:
from dotenv import load_dotenv

# Load environment variables from a .env file into the process environment.
# Presumably this supplies the API keys for the OpenAI / Pinecone clients
# created later in this file — confirm against the actual .env contents.
load_dotenv()

True

## state 정의

In [None]:
from langgraph.graph import MessagesState
from typing_extensions import Any

class State(MessagesState):
    """State schema that extends ``MessagesState`` with question/answer fields."""
    question: str  # user question
    dataset: Any   # arbitrary dataset (later this should be fetched from a DB or similar)
    status: bool   # (router) e.g. whether the requested information can be found in the DB
    result: str    # result received from the DB
    answer: str    # final answer

## llm, Search 클래스(TypedDict)

In [None]:
from langchain_core.documents import Document
from typing_extensions import TypedDict, List
from typing import Literal  # 말그대로
from typing_extensions import Annotated  # 할말이 좀 더 있다
# LLM
from langchain_openai import ChatOpenAI
# Module-level chat model instance; analyze_query wraps it with
# with_structured_output(Search) to obtain structured search queries.
llm = ChatOpenAI(model='gpt-4.1', temperature=0)

# Search is declared separately, ahead of MyState, so that the state's `query` field can be typed more strictly.

class Search(TypedDict):  # schema passed to llm.with_structured_output
    """Vectorstore Search Query"""
    # Each field is Annotated[<type>, <default>, <description>]:
    #   1. the value type,
    #   2. `...` as the default, marking the field as required (NOT NULL),
    #   3. a description intended for the model.
    query: Annotated[str, ..., 'Search query to run']
    section: Annotated[
        # Closed set of section labels, written as string literals. They
        # mirror the `category` values assigned to document metadata in the
        # indexing step of this file.
        Literal['work', 'leave', 'equipment', 'welfare', 'etc'],
        ...,
        'Section to query'
    ]


class MyState(TypedDict):
    """State dictionary whose keys are read and written by the nodes in this file."""
    # Raw user question; analyze_query passes it to the LLM as input.
    question: str
    # Structured search request; analyze_query returns it under this key.
    query: Search
    # Presumably the documents retrieved for the query — confirm against the retrieval step.
    context: List[Document]
    # Presumably the final generated answer — TODO confirm against the answering step.
    answer: str

## analyze_query 노드

In [None]:
def analyze_query(state: MyState):
    """Turn the user's question into a structured ``Search`` request.

    Args:
        state: Current state; only ``state['question']`` is read.

    Returns:
        A dict with the single key ``'query'`` holding the structured output
        produced by the LLM for the ``Search`` schema.
    """
    # Bind the Search schema to the model so its reply is shaped as
    # {query, section} instead of free text.
    structured_llm = llm.with_structured_output(Search)
    return {'query': structured_llm.invoke(state['question'])}

## DB(문서) 불러오기 → 임베딩 → Pinecone 색인 파이프라인

In [None]:
from langchain_community.document_loaders import WebBaseLoader
from bs4.filter import SoupStrainer  # pip install beautifulsoup4
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_community.document_loaders import TextLoader

# Load the dataset file from disk, decoding it as UTF-8.
loader = TextLoader("데이터셋.md", encoding="utf-8")  # or: autodetect_encoding=True
docs = loader.load()

# Split the loaded documents into overlapping chunks
# (chunk_size=1000, chunk_overlap=200) before embedding.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splitted_docs = splitter.split_documents(docs)

# Ordered keyword groups used to tag each chunk with a category.
# Order matters: the first group with any keyword present in the chunk wins,
# so a chunk mentioning both "근무" and "휴가" is tagged 'work'.
_CATEGORY_KEYWORDS = (
    ('work', ("근무", "출근", "업무", "근로")),
    ('leave', ("휴가", "연차", "병가", "휴직")),
    ('equipment', ("장비", "보안", "시설", "컴퓨터")),
    ('welfare', ("복지", "혜택", "지원", "보험")),
)

# Tag every chunk in place with metadata['category'].
for doc in splitted_docs:
    content = doc.page_content.lower()

    for category, keywords in _CATEGORY_KEYWORDS:
        if any(keyword in content for keyword in keywords):
            doc.metadata['category'] = category
            break
    else:
        # No keyword group matched this chunk.
        doc.metadata['category'] = 'etc'

# Embedding model used to vectorize the chunks; per the inline note, the
# small and large model variants can be swapped here.
embedding = OpenAIEmbeddings(model='text-embedding-3-small')  # small <-> large

# NOTE(review): Pinecone index names are normally restricted to lowercase
# alphanumeric characters and hyphens; this name contains spaces and Hangul.
# Confirm it matches an index that actually exists.
index_name = 'gaida 회사 내규'

# Running this once uploads the real data to the index, where it is stored persistently.
vectorstore = PineconeVectorStore.from_documents(
    splitted_docs, 
    index_name=index_name, 
    embedding=embedding
)



## retrieve 노드

In [None]:
def retrieve(query: str):
    """Retrieve information related to a query.

    Args:
        query: Query to search.

    Returns:
        A ``(text, documents)`` pair: the formatted text of the hits and the
        list of documents returned by the vector store.
    """
    # Raw hits from the vector store (the "artifact"); top 3 by similarity.
    hits = vectorstore.similarity_search(query, k=3)

    # Readable rendering of the same hits (the "content"): one
    # "Source / Content" entry per document, separated by blank lines.
    rendered = []
    for hit in hits:
        rendered.append(f'Source: {hit.metadata}\nContent: {hit.page_content}')

    return '\n\n'.join(rendered), hits