# Retrieval

### **RAG의 5단계**
1. **Document Loader**  : 문서를 불러오기
2. **Document Transformer** : 문서를 쪼개기
3. **Embedding** : 텍스트를 숫자로 바꾸기
4. **Vector Store** : 저장소에 넣기
5. **Retrieval** : 검색해서 LLM에 전달

In [2]:
%pip install langchain-community pypdf faiss-cpu sentence-transformers

Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting pypdf
  Downloading pypdf-6.5.0-py3-none-any.whl.metadata (7.1 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.1-cp312-cp312-win_amd64.whl.metadata (7.6 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.2.0-py3-none-any.whl.metadata (16 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain-community)
  Downloading aiohttp-3.13.2-cp312-cp312-win_amd64.whl.metadata (8.4 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.10.1 (from langchain-community)
  Downloading pydantic_settings-2.12.0-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.3-py3-none-any.whl.metadata (9.7 kB)
Collecting aiohappyeyeballs>=2.5.0 (from aiohttp<4.0.0,>=3.8.3->la

In [1]:
# Load key=value pairs from a local .env file into the process environment.
# NOTE(review): presumably this is what supplies the OpenAI API key used by
# the embedding cells further down — confirm the .env contents.
from dotenv import load_dotenv
load_dotenv()

True

## Document Loader(문서 불러오기)

In [4]:
%pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting beautifulsoup4 (from bs4)
  Downloading beautifulsoup4-4.14.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>=1.6.1 (from beautifulsoup4->bs4)
  Downloading soupsieve-2.8.1-py3-none-any.whl.metadata (4.6 kB)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Downloading beautifulsoup4-4.14.3-py3-none-any.whl (107 kB)
Downloading soupsieve-2.8.1-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4, bs4

   ------------- -------------------------- 1/3 [beautifulsoup4]
   ---------------------------------------- 3/3 [bs4]

Successfully installed beautifulsoup4-4.14.3 bs4-0.0.2 soupsieve-2.8.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Step 1 (Document Loader), web variant: fetch one web page and turn it into
# LangChain Document objects.
from langchain_community.document_loaders import WebBaseLoader  # tool that scrapes text from a web page URL

# Target page; the last path segment is a percent-encoded (UTF-8) page title.
url = "https://species.wikimedia.org/wiki/%EB%8C%80%EB%AC%B8"

# Create the loader instance for that URL
loader = WebBaseLoader(url)

# Connect to the URL, parse the HTML, extract only the text,
# and return it as a list of Document objects
documents = loader.load()

# How many Document objects were produced, and the metadata of the first one
print(len(documents))
print(documents[0].metadata)

# Inspect the body text (first 500 characters only, to keep the output short)
print(documents[0].page_content[:500])

In [None]:
# Step 1 (Document Loader), PDF variant. This rebinds `loader` and `documents`,
# so the cells below operate on the PDF pages rather than the web page.
from langchain_community.document_loaders import PyPDFLoader    # tool that loads a PDF file and converts it to text

# Create the loader instance (file path given relative to the working directory)
loader = PyPDFLoader("The_Adventures_of_Tom_Sawyer.pdf")

# Run the load: each PDF page becomes one Document object, returned as a list
documents = loader.load()

# Number of Document objects loaded, and the metadata of the first one
print(len(documents))
print(documents[0].metadata)

# Print the full text of the Document at index 3 (the fourth one).
# NOTE(review): hard-coded index — raises IndexError if fewer than 4 documents are loaded.
print(documents[3].page_content)

## Embedding Model(임베딩 : 텍스트를 숫자로)

In [23]:
# Step 3 (Embedding): convert a single string into a numeric vector.
from langchain_openai import OpenAIEmbeddings
import pandas as pd

# Create the embedding model
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

text = "The quick brown fox jumps over the lazy dog"
vector = embeddings.embed_query(text)   # .embed_query() converts one string into one vector

# Vector length (the recorded run printed 1536), then the first five
# components shown through a pandas Series for readable formatting
print(len(vector))
print(pd.Series(vector).head())

1536
0   -0.020838
1   -0.016896
2   -0.004536
3   -0.050858
4   -0.025975
dtype: float64


In [31]:
# Embed every loaded document. Relies on `documents`, `embeddings` and `pd`
# defined by earlier cells.

# Extract only the text content of each document
docs = [document.page_content for document in documents]
print(len(docs))

# embed_documents(): takes a list of strings, converts each one into a vector,
# and returns a list of vectors (one per input string)
vects = embeddings.embed_documents(docs)

# Number of vectors, then the length of the first vector
print(len(vects))
print(len(vects[0]))
# Bare last expression so the notebook renders the table:
# one row per document, one column per vector component
pd.DataFrame(vects)

35
35
1536


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,0.015368,-0.034811,-0.009329,0.014481,0.007343,0.014409,-0.052248,0.049236,-0.013593,0.015107,...,-0.008608,0.020671,0.002576,-0.002818,-0.021685,0.024887,0.02503,-0.013593,0.017691,0.019022
1,0.022011,-0.019454,0.014152,-0.002708,0.000608,-0.046838,-0.046292,0.089768,-0.024037,-0.00898,...,0.010905,0.003195,0.030516,-0.008757,-0.015445,0.05382,-0.006613,-0.019051,0.023476,-0.003281
2,-0.011807,-0.009602,0.013972,-0.021397,-0.01735,0.00583,-0.002324,0.046666,-0.026301,-0.010646,...,-0.008962,-0.025526,-0.00449,0.011479,-0.050302,0.033152,0.011408,-0.005062,0.044319,-0.007091
3,0.020616,-0.024838,0.009873,-0.010289,-0.007548,-0.001772,-0.006372,0.012802,-0.050067,-0.016342,...,0.017797,-0.027229,0.007444,-0.018057,-0.047676,0.04765,-0.022136,-0.008723,0.034893,0.00912
4,0.00683,-0.015726,0.023837,-0.000479,-0.017511,-0.030482,0.023493,0.009891,-0.029308,-0.01306,...,-0.041119,-0.024577,-0.022676,0.00741,-0.048949,0.042547,0.010675,-0.0015,0.031349,0.002658
5,0.01852,0.015934,-0.040698,0.035223,0.023566,-0.031564,0.015606,0.020437,-0.047031,0.021068,...,-0.014306,-0.028663,-0.018494,0.009172,-0.024828,0.029193,-0.00072,0.014041,0.023415,-0.016413
6,0.030211,0.047995,-0.02408,-0.001267,0.027627,0.00646,0.030587,-0.01796,-0.039984,-0.00905,...,0.034134,0.037799,-0.021073,-0.015822,-0.008299,0.002166,0.04776,-0.023034,0.033077,-0.025583
7,-0.005975,0.019737,-0.022504,0.019725,0.012417,-0.041627,0.022788,-0.013162,-0.014475,-0.003731,...,-0.017478,-0.034957,-0.031953,0.001972,-0.036258,0.023888,0.00664,0.001101,0.026797,-0.010466
8,-0.012464,-6.5e-05,-0.045397,0.002162,-0.014583,-0.043415,0.025573,-0.014893,-0.015711,-0.011671,...,-0.027382,-0.014224,-0.040045,-0.010451,-0.024198,0.036179,-0.005359,-0.004996,0.02115,0.009255
9,-0.018818,0.050801,-0.04557,0.008179,0.010877,0.004139,0.041448,0.045238,-0.009337,-0.085023,...,-0.003701,0.0126,-0.030454,-0.012312,0.020247,0.007481,0.039674,-0.032537,0.008511,0.002462
