### Installation

In [None]:
!pip install chromadb
!pip install -U langsmith
!pip install -U langchain

### API Key Setup

In [1]:
from dotenv import load_dotenv
import os

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY") # "sk-..."

### 1. Indexing
- 문서 로드
- splitter를 이용하여 문서 청킹
- chunk를 chromaDB에 저장

In [2]:
# dataset/Garbage in, Garbage out.md 파일을 열어서 f라는 파일 객체를 생성
with open("./dataset/Garbage in, Garbage out.md") as f:
    file = f.read()  # 파일의 내용을 읽어서 file 변수에 저장

# 파일 확인하기
print(file[:500])

# 16 Garbage in, Garbage out

We throw out so much rubbish or garbage every day from our homes, schools, shops, and offices. The grains, pulses, biscuits, milk or oil purchased in shops, are packed in plastic bags or tins. All these wrapping materials go out as garbage. We sometimes buy things that are rarely used and often thrown into the garbage.

We generate so much garbage in our day-to-day activities! We often throw groundnut shells on public places, in buses or trains, after eating the nut


In [3]:
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_chroma import Chroma

# splitter 정의
text_splitter = SemanticChunker(OpenAIEmbeddings())

# 문서 분할
chunks = text_splitter.create_documents([file])
print(len(chunks)) # 나눠진 청크 수 확인

17


In [4]:
chunks[3]

Document(metadata={}, page_content='Take care not to use excess of water. Do not press the layer of waste. Keep this layer loose so that it has sufficient air and moisture. Now, your pit is ready to welcome the redworms. Buy some redworms and put them in your pit (Fig. 16.4).')

In [5]:
DB_PATH = "./chromaDB"

# 청크를 디스크에 저장. 저장시 persist_directory에 저장할 경로 지정
# VectorDB에 저장할 때 임베딩 모델도 지정 (이 경우는 OpenAIEmbeddings)
db = Chroma.from_documents(
    chunks, OpenAIEmbeddings(), persist_directory=DB_PATH, collection_name="my_db"
)

db.get() # 저장된 데이터 확인

{'ids': ['0fbfa3d9-261d-40ab-b662-f2067cde5587',
  '646bf738-e375-4f67-9717-0b7f67215a02',
  '5c5df815-8c23-46b7-a1d6-3b371ad9859f',
  '4a85b1dc-4636-4c7a-aeb7-c2cfd3b39443',
  '5b4eaff1-67a3-4f32-91cd-67311e0f1ed5',
  '5e6811f4-13f8-4e26-acc1-ef68d3d77ecf',
  '3dce101c-3b18-48f4-8045-34b3dfd4adae',
  'd73b62c5-c9e1-4a26-bb52-6b616f4c9eff',
  '43d352f0-2f0f-4595-a50a-4b1dca0a2832',
  '4604b863-f37e-4ed8-8662-4ce13cefa32a',
  '63d45715-e195-4055-8968-3650608e63c4',
  '74a888a8-f2e0-430f-826f-697fa93f14cb',
  'cc985636-7d0a-464f-96c7-d3b73871cd1c',
  'a25ed9e7-198d-47c5-a0a1-99a51dc38896',
  'fa2c1dfa-d7e5-40f9-bb6e-d3c258f4ec8e',
  '451b16cf-119d-4fcc-ae00-80649d8d5d7e',
  '4d43c3b9-9bae-4af8-bd96-1a4204f8898b'],
 'embeddings': None,
 'documents': ['# 16 Garbage in, Garbage out\n\nWe throw out so much rubbish or garbage every day from our homes, schools, shops, and offices. The grains, pulses, biscuits, milk or oil purchased in shops, are packed in plastic bags or tins. All these wrappi

### 2. Retrieval
- 어떤 content가 검색되었는지 확인해보기

In [6]:
question = "In what environments do earthworms thrive?"

retriever = db.as_retriever() # retrieval 진행
retriever.invoke(question) # retriever 검색기에 직접 invoke를 넣어주면 어떤 content가 검색되었는지 볼 수 있음

[Document(metadata={}, page_content='Take care not to use excess of water. Do not press the layer of waste. Keep this layer loose so that it has sufficient air and moisture. Now, your pit is ready to welcome the redworms. Buy some redworms and put them in your pit (Fig. 16.4).'),
 Document(metadata={}, page_content='It might be a good idea to bury this food about 2-3 cm inside the pit. Do not use wastes\n![Food for redworms](path/of/image)  \n*A close-up image showing various types of food suitable for redworms, including some organic waste.*\n\nthat may contain salt, pickles, oil, vinegar, meat and milk preparations as food for your redworms. If you put these things in the pit, disease-causing small organisms start growing in the pit. Once in a few days, gently mix and move the top layers of your pit. Redworms do not have teeth. They have a structure called \'gizzard\', which helps them in grinding their food. Powdered egg shells or sea shells could be mixed with the wastes. This woul

### 3. Generation
- langchain의 invoke를 이용해 간단한 쿼리에 대해 답변 생성

In [7]:
from langchain_openai import ChatOpenAI

question = "In what environments do earthworms thrive?"

# RAG를 사용하지 않고 일단 기본 llm에 쿼리만 줬을때 답변 얻는 것 확인하기
llm = ChatOpenAI(temperature=0.1, max_tokens=2048, model="gpt-3.5-turbo")

print(llm.invoke(question))

content='Earthworms thrive in moist, well-drained soil that is rich in organic matter. They prefer soil with a neutral pH level and do well in environments with plenty of decaying plant material. Earthworms are also sensitive to temperature and do best in moderate climates where the soil is not too hot or too cold. They can be found in a variety of habitats, including gardens, forests, grasslands, and agricultural fields.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 86, 'prompt_tokens': 16, 'total_tokens': 102, 'completion_tokens_details': {'audio_tokens': 0, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-b54ea344-ab6c-471b-b613-98b30c936d4b-0' usage_metadata={'input_tokens': 16, 'output_tokens': 86, 'total_tokens': 1

### Chain으로 구성해서 invoke해보기
- chain 구성
    - retriever로 부터 검색된 청크 내용만 가져와 이어 붙여서 context에 전달
    - input으로 들어오는 쿼리를 question에 전달
    - prompt로 question과 context가 변수에 들어와 전체 prompt를 완성
    - llm에서 답변 생성
    - StrOutputParser를 통해 content만 가져오기
- invoke를 호출하여 chain 작업 시작

In [8]:
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_chroma import Chroma

DB_PATH = "./chromaDB"

# 디스크에서 문서를 로드
persist_db = Chroma(
    persist_directory=DB_PATH,
    embedding_function=OpenAIEmbeddings(),
    collection_name="my_db",
)

retriever = persist_db.as_retriever()

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

template = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

Question: {question} 

Context: {context} 

Answer:
"""

prompt = ChatPromptTemplate.from_template(template)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

question = "In what environments do earthworms thrive?"
rag_chain.invoke(question)

'(a) Garbage that may contain salt, pickles, oil, vinegar, meat, and milk preparations is not converted into compost by redworms. (b) If there are other organisms besides redworms in the pit, their names should be identified through observation and research.'