# Retrieval

In [19]:
from dotenv import load_dotenv
# Read key/value pairs from a .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key needed by the
# embedding and chat-model cells further down — confirm.
load_dotenv()

True

### Document Loader

In [None]:
from langchain_community.document_loaders import WebBaseLoader

# Percent-encoded address of a Korean Wikipedia page
# (decodes to the "Wikipedia:Policies and guidelines" article).
url = "https://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EC%A0%95%EC%B1%85%EA%B3%BC_%EC%A7%80%EC%B9%A8"

# Fetch the page and wrap it in Document objects.
loader = WebBaseLoader(url)
documents = loader.load()

# Preview: the page title on one line, then the first 1000 characters of text.
print(
    documents[0].metadata["title"],
    documents[0].page_content[:1000],
    sep="\n",
)


In [None]:
!pip install pypdf

In [17]:
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF from the local ./data directory.
loader = PyPDFLoader('./data/The_Adventures_of_Tom_Sawyer.pdf')
documents = loader.load()   # each page becomes one Document; returned as a list
print(len(documents))
print(documents[0].metadata)
print()
# Preview only the beginning of the first page. Printing the whole
# `documents` list would dump every page's full text and metadata into
# the cell output.
print(documents[0].page_content[:1000])

35
{'producer': '3-Heights(TM) PDF Optimization Shell 5.9.1.5 (http://www.pdf-tools.com)', 'creator': 'Acrobat PDFMaker 7.0 dla programu Word', 'creationdate': '2006-08-26T00:50:00+02:00', 'author': 'GOLDEN', 'company': 'c', 'title': 'Microsoft Word - 1', 'moddate': '2021-01-27T15:00:11+01:00', 'source': './data/The_Adventures_of_Tom_Sawyer.pdf', 'total_pages': 35, 'page': 0, 'page_label': '1'}
==
The Adventures of                 
Tom Sawyer 
 
MARK TWAIN 
Level 1 
 
Retold by Jacqueline Kehl                                                    
Series Editors: Andy Hopkins and Jocelyn Potter


### Embedding Model

In [20]:
from langchain_openai import OpenAIEmbeddings

# Sample sentence used to sanity-check the embedding model.
text = "The quick brown fox jumps over the lazy dog."

# Embedding client backed by OpenAI's "text-embedding-3-small" model.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Embed the single sentence and report the length of the resulting vector.
vector = embedding_model.embed_query(text)
print(len(vector))

1536


In [21]:
# Raw text of every loaded Document, in the same order as `documents`.
docs = [doc.page_content for doc in documents]

# Embed all texts in one call; one vector is returned per input text.
vects = embedding_model.embed_documents(docs)

# Report the number of vectors, then the length of the first vector.
print(len(vects), len(vects[0]), sep="\n")


35
1536


### Vector Store

In [24]:
# Import FAISS from langchain_community: the `langchain.vectorstores` path
# is a deprecated re-export, and the retriever repr printed later in this
# notebook shows the class living in langchain_community.vectorstores.faiss.
from langchain_community.vectorstores import FAISS

# Embed every Document with `embedding_model` and index the vectors in FAISS.
# The variable name `vector_score` (likely meant `vector_store`) is kept
# as-is because later cells reference it.
vector_score = FAISS.from_documents(documents, embedding_model)

In [25]:
# Show the 3 indexed Documents most similar to the query string.
vector_score.similarity_search(query="Tom Sawyer", k=3)

[Document(id='bd7579bf-7c20-4f7f-a25f-d3f3d605d077', metadata={'producer': '3-Heights(TM) PDF Optimization Shell 5.9.1.5 (http://www.pdf-tools.com)', 'creator': 'Acrobat PDFMaker 7.0 dla programu Word', 'creationdate': '2006-08-26T00:50:00+02:00', 'author': 'GOLDEN', 'company': 'c', 'title': 'Microsoft Word - 1', 'moddate': '2021-01-27T15:00:11+01:00', 'source': './data/The_Adventures_of_Tom_Sawyer.pdf', 'total_pages': 35, 'page': 4, 'page_label': '5'}, page_content='Introduction \n \n \nOne Saturday afternoon Tom wanted to have an adventure                    \nbecause he didn’t want to think about Injun Joe. He went \nto Huck and said, “I’m going to look for treasure. Do you \nwant to come with me?” \n \nTom Sawyer loves adventures. He has a lot of adventures \nat home, at school, and with his friends. He has one \nadventure in a cave. But why is he there? What does he \nsee in the cave? And why is he afraid? \n \nMark Twain (1835-1910) is a famous American writer. \nHis name was Sam

### Retriever

In [26]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Chat model that writes the final answer; temperature is set to 0.
model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Expose the FAISS store through the retriever interface. No search
# options are passed, so the retriever uses its defaults.
retriever = vector_score.as_retriever()
print(retriever)

# Question-answering chain that combines the retriever with the chat model.
# NOTE(review): RetrievalQA is marked deprecated in recent LangChain
# releases; migrating would change the response keys used by later cells,
# so confirm before switching.
retrieval_qa = RetrievalQA.from_chain_type(
    chain_type="stuff",
    retriever=retriever,
    llm=model,
)


tags=['FAISS', 'OpenAIEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x00000253349844F0> search_kwargs={}


In [29]:
# Ask the chain a question in Korean
# ("Who killed the man who was in the village graveyard?").
response = retrieval_qa.invoke(
    "마을 무덤에 있던 남자를 누가 죽였나요?"
)

In [30]:
# Display the chain's response as the cell output.
response

{'query': '마을 무덤에 있던 남자를 누가 죽였나요?', 'result': '인전 조(Injun Joe)가 의사를 칼로 죽였습니다.'}