In [None]:
import torch
import bs4
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFacePipeline
from langchain.schema import StrOutputParser
from langchain.chains import LLMChain


# Step 1: Load the document (Load Documents)
# Fetch the news article and restrict HTML parsing to the <div> elements
# whose class matches one of the values listed below.
url = "https://n.news.naver.com/article/437/0000378416"

article_strainer = bs4.SoupStrainer(
    "div",
    attrs={"class": ["newsct_article _article_body", "media_end_head_title"]},
)
loader = WebBaseLoader(
    web_paths=(url,),
    bs_kwargs={"parse_only": article_strainer},
)
docs = loader.load()


# Step 2: Split the documents (Split Documents)
# Cut the loaded documents into chunks (chunk_size=1000, chunk_overlap=50).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=1000,
)
splits = text_splitter.split_documents(docs)

# Step 3: Embeddings & vector store (Create Vectorstore)
# Pick the device the embedding model will run on: prefer a CUDA GPU,
# then Apple-silicon MPS, and fall back to the CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

# Configuration for the BGE-M3-Korean embedding model.
model_name = "upskyy/bge-m3-korean"
model_kwargs = dict(device=device)
encode_kwargs = dict(normalize_embeddings=True)

# Instantiate the embedding model with the settings above.
embeddings = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# Build the FAISS vector store from the split chunks, using `embeddings`
# as the embedding function.
vectorstore = FAISS.from_documents(splits, embeddings)

# Step 4: Search (Search)
# Expose the vector store as a retriever; it is queried later to look up
# the parts of the news article relevant to a question.
retriever = vectorstore.as_retriever()

# Step 5: Define the prompt template
# The text has two placeholders, {context} and {question}, and ends with the
# answer cue so the model continues right after it.
template = (
    "다음 컨텍스트를 사용하여 질문에 답변해주세요:\n"
    "\n"
    "컨텍스트: {context}\n"
    "\n"
    "질문: {question}\n"
    "\n"
    "답변:"
)

# Wrap the template text in a PromptTemplate that declares its two inputs.
prompt = PromptTemplate(input_variables=["context", "question"], template=template)


# Step 6: Create the language model (Create LLM)
# Load the tokenizer and the causal-LM weights from the same checkpoint.
checkpoint = "beomi/Llama-3-Open-Ko-8B"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# Build the text-generation pipeline.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # Bound only the generated answer. `max_length` would also count the
    # prompt tokens, and the prompt here carries the retrieved context, so a
    # total budget of 512 could leave little or no room for the answer.
    max_new_tokens=512,
    # Enable sampling explicitly so temperature/top_p below are applied
    # regardless of the checkpoint's default generation settings.
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15,
)

# Wrap the pipeline so LangChain can use it as an LLM. `llm` is bound only to
# this wrapper; the raw model stays available as `model`.
llm = HuggingFacePipeline(pipeline=pipe)

# Step 7: Create the chain
# Combine the prompt template and the LLM into a single chain.
llm_chain = LLMChain(prompt=prompt, llm=llm)

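# Disabled alternative: an LCEL version of the chain. Enabling it also needs
# RunnablePassthrough, which is not among the imports visible in this cell.
# def format_docs(docs):
#     # Merge the retrieved documents into a single block of text.
#     return "\n\n".join(doc.page_content for doc in docs)


# # Step 7: Create the chain (Create Chain)
# rag_chain = (
#     {"context": retriever | format_docs, "question": RunnablePassthrough()}
#     | prompt
#     | llm
#     | StrOutputParser()
# )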

# Step 8: Run the chain (Run Chain)
# Ask a question about the article and print the answer.
question = "부영그룹의 출산 장려 정책에 대해 설명해주세요"

# Keep the retrieval result under its own name so that `docs` (the documents
# loaded from the URL) is not overwritten and the count printed below still
# refers to the loaded documents.
retrieved_docs = retriever.get_relevant_documents(question)
context = "\n\n".join(doc.page_content for doc in retrieved_docs)

response = llm_chain.run(context=context, question=question)

# Print the result.
print(f"URL: {url}")
print(f"문서의 수: {len(docs)}")
print("===" * 20)
print(f"[HUMAN]\n{question}\n")
print(f"[AI]\n{response}")

  from .autonotebook import tqdm as notebook_tqdm
