In [1]:
import gc
import torch

from pydantic import BaseModel, Field
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter, HTMLHeaderTextSplitter
from langchain_core.prompts import ChatPromptTemplate

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# Embedding model used to vectorise the chunks (BAAI/bge-m3, encoded in batches of 8).
# Use the GPU when one is usable and fall back to the CPU otherwise, so the notebook
# still runs end to end on a machine without CUDA instead of failing in this cell.
embedding_device = 'cuda' if torch.cuda.is_available() else 'cpu'
embeddings = HuggingFaceEmbeddings(
    model='BAAI/bge-m3',
    model_kwargs={'device': embedding_device},
    encode_kwargs={'batch_size': 8},
)

# Pages to index: the Google style guides for Python, Java and JavaScript.
urls = [
    "https://google.github.io/styleguide/pyguide.html",
    "https://google.github.io/styleguide/javaguide.html",
    "https://google.github.io/styleguide/jsguide.html",
]

# (HTML tag, metadata key) pairs: the text found under each heading is tagged
# with that heading under the given metadata key.
headers_to_split_on = [("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")]
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Stage 1: split every page by its headings and pool the sections of all pages.
html_header_splits = []
for url in urls:
    splits = html_splitter.split_text_from_url(url)
    html_header_splits += splits

# Stage 2: cut the heading-level sections into chunks of size 250 with no overlap
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250,
    chunk_overlap=0,
)
doc_splits = text_splitter.split_documents(html_header_splits)

In [3]:
# Index the chunks in the 'rag-chroma' collection and expose it as a retriever.
#
# Explicit, position-based ids make this cell safe to re-run: without them every
# run generates fresh random ids, so executing the cell again in the same kernel
# adds a second copy of every chunk to the collection and the retriever starts
# returning duplicates. With stable ids a re-run overwrites the existing entries.
# NOTE: if `doc_splits` gets shorter between runs, entries with higher positions
# from the earlier run remain in the collection until the kernel is restarted.
chunk_ids = [f'rag-chroma-{position}' for position in range(len(doc_splits))]

vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name='rag-chroma',
    embedding=embeddings,
    ids=chunk_ids,
)
retriever = vectorstore.as_retriever()

In [4]:
# Housekeeping after indexing: run Python's garbage collector first, then ask
# PyTorch to release the CUDA memory it is holding in its cache.
# NOTE(review): `embeddings` is still referenced above, so presumably the model
# itself stays loaded and only temporary buffers are reclaimed here — confirm.
gc.collect()
# Kept as the last statement so the cell shows no output (gc.collect() returns a count).
torch.cuda.empty_cache()

In [5]:
# Sanity check: look at the first three chunks only.
# Each chunk prints the first 50 characters of its text, its metadata, and a separator line.
for chunk in doc_splits[:3]:
    print(
        f"Content Preview: {chunk.page_content[:50]}...",
        f"Metadata: {chunk.metadata}",
        "-" * 20,
        sep="\n",
    )

Content Preview: AUTHORS:
Prefer only GitHub-flavored Markdown in e...
Metadata: {}
--------------------
Content Preview: Google Python Style Guide...
Metadata: {'Header 1': 'Google Python Style Guide'}
--------------------
Content Preview: Table of Contents  
1 Background  
2 Python Langua...
Metadata: {'Header 1': 'Google Python Style Guide'}
--------------------


In [8]:
# Structured-output schema for the relevance grader: a single "yes"/"no" verdict.
# Documented with comments rather than a class docstring so that the schema
# handed to `with_structured_output` stays exactly as it was.
class GradeDocuments(BaseModel):
    # The field description (Korean) asks the model to state, as "yes" or "no",
    # whether the document is related to the question.
    binary_score: str = Field(
        description='문서와 질문의 연관성 여부를 "yes" 또는 "no"로 알려주세요.',
    )

# Grader model; its replies are parsed into GradeDocuments instances.
llm_eval = ChatOpenAI(
    model='gpt-5-nano',
    temperature=0,
)
structured_llm_grader = llm_eval.with_structured_output(GradeDocuments)

# System prompt (Korean). It casts the model as an expert judging how relevant a
# retrieved document is to the user's question, tells it to treat a document as
# relevant when it contains keywords or meaning related to the question, and
# asks for a "yes"/"no" answer. The text starts with a newline.
system = (
    '\n'
    '당신은 사용자의 질문에 대해 검색된 문서의 관련성을 평가하는 전문가입니다.\n'
    '문서에 질문과 관련된 키워드나 의미가 담겨 있으면, 해당 문서를 "관련 있음"으로 평가하세요.\n'
    '문서가 질문과 관련이 있는지 여부를 "yes" 또는 "no"로 표시해주세요.'
)

# The human turn carries the two template variables: {document} and {question}.
grade_prompt = ChatPromptTemplate.from_messages([
    ('system', system),
    ('human', '검색된 문서: \n\n {document} \n\n 사용자 질문: {question}'),
])

# Final chain: fill in the prompt, then call the structured grader.
retrieval_grader = grade_prompt | structured_llm_grader

In [12]:
# Smoke test of the whole pipeline: retrieve documents for a sample question
# (Korean; roughly "Python coding guide") and grade the top hit for relevance.
question = '파이썬 코드 작성 가이드'

test = retriever.invoke(question)
test_txt = test[0].page_content  # text of the first retrieved document

grader_input = {'question': question, 'document': test_txt}
grade = retrieval_grader.invoke(grader_input)
print(grade)

binary_score='yes'
