### Installation

In [None]:
!pip install llama-index
!pip install chromadb
!pip install llama-index-vector-stores-chroma

Collecting llama-index
  Downloading llama_index-0.11.22-py3-none-any.whl.metadata (11 kB)
Collecting llama-index-agent-openai<0.4.0,>=0.3.4 (from llama-index)
  Using cached llama_index_agent_openai-0.3.4-py3-none-any.whl.metadata (728 bytes)
Collecting llama-index-cli<0.4.0,>=0.3.1 (from llama-index)
  Using cached llama_index_cli-0.3.1-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.12.0,>=0.11.22 (from llama-index)
  Downloading llama_index_core-0.11.22-py3-none-any.whl.metadata (2.4 kB)
Collecting llama-index-embeddings-openai<0.3.0,>=0.2.4 (from llama-index)
  Using cached llama_index_embeddings_openai-0.2.5-py3-none-any.whl.metadata (686 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.3.0 (from llama-index)
  Using cached llama_index_indices_managed_llama_cloud-0.4.0-py3-none-any.whl.metadata (3.8 kB)
Collecting llama-index-legacy<0.10.0,>=0.9.48 (from llama-index)
  Downloading llama_index_legacy-0.9.48.post4-py3-none-any.whl.metadata (8.5 kB)
Coll

### API Key Setup

In [25]:
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment.
load_dotenv()

# Fail fast with an explicit message: without this check a missing key would
# leave api_key as None and only surface as an error in a later cell.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file or export it "
        "before running this notebook."
    )

### 1. Indexing

In [26]:
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

# Pass the folder that contains the files, not the path of a single file.
chunks = SimpleDirectoryReader(input_dir='./dataset/').load_data()

# Embedding model shared by the indexing and retrieval cells.
embed_model = OpenAIEmbedding()

# Number of items returned by the reader.
print(len(chunks))

11


In [27]:
# Display the first loaded item to inspect its id, file metadata and text.
chunks[0]

Document(id_='0980de93-f869-43ca-9980-0a524376a5bc', embedding=None, metadata={'file_path': '/root/dev/langchain_practice/dataset/Garbage in, Garbage out.md', 'file_name': 'Garbage in, Garbage out.md', 'file_type': 'text/markdown', 'file_size': 22675, 'creation_date': '2024-10-25', 'last_modified_date': '2024-10-25'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='\n\n16 Garbage in, Garbage out\n\nWe throw out so much rubbish or garbage every day from our homes, schools, shops, and offices. The grains, pulses, biscuits, milk or oil purchased in shops, are packed in plastic bags or tins. All these wrapping materials go out as garbage. We sometimes buy things that are rarely used and often thrown into the garbage.\n\nWe generate so much garbage in our day

In [28]:
import chromadb

# Initialise the Chroma client with the directory where the data is stored.
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("my_chromadb")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# The collection lives on disk, so embedding the documents on every run of this
# cell would append the same content again (and call the embedding API again).
# Only build when the collection is empty; otherwise reuse the stored vectors.
# Delete ./chroma_db to force a rebuild after the dataset changes.
if chroma_collection.count() == 0:
    index = VectorStoreIndex.from_documents(
        chunks, storage_context=storage_context, embed_model=embed_model
    )
else:
    index = VectorStoreIndex.from_vector_store(
        vector_store, embed_model=embed_model
    )


### 2. Retrieval

In [29]:
# Reopen the persisted Chroma collection from disk.
db2 = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db2.get_or_create_collection(name="my_chromadb")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Build the index object on top of the existing vector store.
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=embed_model,
)

# Retrieve with the default retriever settings and display the matches.
retriever = index.as_retriever()
retriever.retrieve("In what environments do earthworms thrive?")

[NodeWithScore(node=TextNode(id_='644f7c65-0803-4a89-9525-c2d307466009', embedding=None, metadata={'file_path': '/root/dev/langchain_practice/dataset/Garbage in, Garbage out.md', 'file_name': 'Garbage in, Garbage out.md', 'file_type': 'text/markdown', 'file_size': 22675, 'creation_date': '2024-10-25', 'last_modified_date': '2024-10-25'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='b4c0211a-3241-4a11-bfba-5400f4b39989', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/root/dev/langchain_practice/dataset/Garbage in, Garbage out.md', 'file_name': 'Garbage in, Garbage out.md', 'file_type': 'text/markdown', 'file_size': 22675, 'creation_date': '2024-10-25', 'last_modified_date': '2024-10-25'}, 

### 3. Generation

In [30]:
from llama_index.llms.openai import OpenAI

# Chat model used to write the final answer.
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo")

# Query the indexed data: the query engine retrieves context and asks the LLM.
query_engine = index.as_query_engine(llm=llm)
response = query_engine.query(
    "In what environments do earthworms thrive?"
)
print(response)

Earthworms thrive in environments that are neither too hot nor too cold, with sufficient moisture and air. They do not survive well in very hot or very cold surroundings.


### 3. Generation - RAG를 사용하지 않고 llm만 이용하여 답변 생성한 코드

In [31]:
from llama_index.llms.openai import OpenAI

# Initialise the LLM (no index or retriever is involved in this cell).
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo")

# Ask the question directly, without any retrieved context.
response = llm.complete(
    prompt="In what environments do earthworms thrive?",
)
print(response.text)

Earthworms thrive in moist, well-drained soil that is rich in organic matter. They are commonly found in gardens, forests, grasslands, and agricultural fields. Earthworms prefer soil with a neutral pH and moderate temperatures. They are sensitive to extreme heat, cold, and dry conditions, so they are most active in spring and fall when the soil is moist and temperatures are moderate.


### 전체 코드

In [None]:
from llama_index.llms.openai import OpenAI  # llama_index's OpenAI wrapper

question = "In what environments do earthworms thrive?"
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo")

# Load the Chroma database from disk.
db2 = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db2.get_or_create_collection(name="my_chromadb")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    embed_model=embed_model,
)

# Generate the answer with as_query_engine.
query_engine = index.as_query_engine(llm=llm)
response = query_engine.query(question)
print(response)

Earthworms thrive in environments that are neither too hot nor too cold, with sufficient air and moisture. They do not survive well in very hot or very cold surroundings.


### query engine을 사용하지 않고 OpenAI API 직접 호출

In [33]:
from llama_index.core.prompts.base import PromptTemplate
# Import the raw OpenAI SDK client under an alias. Importing it as `OpenAI`
# would rebind the name that other cells use for the llama_index LLM wrapper,
# so re-running those cells out of order would pick up the wrong class.
from openai import OpenAI as OpenAIClient

question = "In what environments do earthworms thrive?"

# Fetch only the 3 most relevant chunks.
vector_retriever = index.as_retriever(similarity_top_k=3)
retrieved_contents = vector_retriever.retrieve(question)  # search for the contents closest to the question

# Keep only the text of each retrieved result and join them into one context string.
retrieved_text = "\n\n".join([doc.text for doc in retrieved_contents])

template = PromptTemplate("""
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

Question: {question} 

Context: {context} 

Answer:
""")

final_prompt = template.format(context=retrieved_text, question=question)

# Initialise the OpenAI client.
client = OpenAIClient(api_key=api_key)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": final_prompt}
    ],
    temperature=0.1
)

# message.content may be None; fall back to an empty string so the cell does
# not print the literal text "None".
answer = response.choices[0].message.content or ""
print(answer)

Earthworms thrive in environments that are neither too hot nor too cold, with sufficient air and moisture. They prefer places that do not receive direct sunlight. Earthworms are used for vermicomposting, a method of preparing compost with the help of redworms.
