# 인덱스 만들기
- Case 1: Document Object를 바로 Index화하기
- Case 2: Node Object 단위로 Index화하기

In [1]:
# Notebook setup: imports, nested event-loop support, and the embedding model.
from llama_index.core import Document, VectorStoreIndex
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
import nest_asyncio

# Patch asyncio so event loops can be nested inside the running notebook loop.
nest_asyncio.apply()

# Register the embedding model on the global LlamaIndex settings object.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")


In [2]:
# Case 1: index Document objects directly.
# Start by loading the source dataset from disk.
from llama_index.core import VectorStoreIndex
from llama_index.core import SimpleDirectoryReader

# Read everything under ./data/source_files and bind the result to `documents`.
documents = SimpleDirectoryReader(
    input_dir="./data/source_files",
).load_data()


In [3]:
# The full list is too large to display, so look at only the first two items.
documents[:2]

 Document(id_='b73fb57a-d5b9-450c-b51f-2a49e143eefc', embedding=None, metadata={'file_path': '/Users/hyeonjinho/Desktop/dev/Learned/llm/fastcampus-llamaindex-rag-design/2/data/source_files/-7473333379586622353.html', 'file_name': '-7473333379586622353.html', 'file_type': 'text/html', 'file_size': 228954, 'creation_date': '2025-04-27', 'last_modified_date': '2025-04-27'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='<!doctype html><html lang=en-US class=gnt__njs data-u-s=anon><head><meta charset=utf-8 /><meta name=viewport content="width=device-width,initial-scale=1,minimum-scale=1"/><meta name=theme-color content=#0098FE /><title>Your c

In [5]:
# Build a vector index directly from Document objects (first two only).
default_index = VectorStoreIndex.from_documents(documents[:2])

- IngestionPipeline으로 노드오브젝트 커스터마이즈
- 커스텀 노드를 벡터인덱스화

In [6]:
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.ingestion import IngestionPipeline, IngestionCache

# Customise node creation: try a chunk size of 2000 and LlamaIndex's
# LLM-powered TitleExtractor. Transformations are listed in the order given.
pipeline = IngestionPipeline(
    transformations=[
        # Split each document into chunks (chunk_size=2000).
        SentenceSplitter(chunk_size=2000),
        # Title extraction backed by the gpt-4o-mini model.
        TitleExtractor(llm=OpenAI(model="gpt-4o-mini")),
        # Embedding step using text-embedding-3-small.
        OpenAIEmbedding(model="text-embedding-3-small"),
    ],
)



In [7]:
# Display the first two documents.
documents[:2]

 Document(id_='b73fb57a-d5b9-450c-b51f-2a49e143eefc', embedding=None, metadata={'file_path': '/Users/hyeonjinho/Desktop/dev/Learned/llm/fastcampus-llamaindex-rag-design/2/data/source_files/-7473333379586622353.html', 'file_name': '-7473333379586622353.html', 'file_type': 'text/html', 'file_size': 228954, 'creation_date': '2025-04-27', 'last_modified_date': '2025-04-27'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='<!doctype html><html lang=en-US class=gnt__njs data-u-s=anon><head><meta charset=utf-8 /><meta name=viewport content="width=device-width,initial-scale=1,minimum-scale=1"/><meta name=theme-color content=#0098FE /><title>Your c

In [8]:
# Feed the first two documents through the pipeline built above; the result
# is bound to `nodes`.
nodes = pipeline.run(documents=documents[:2])

100%|██████████| 5/5 [00:01<00:00,  2.87it/s]
100%|██████████| 5/5 [00:01<00:00,  4.28it/s]


In [9]:
# Inspect the metadata of one node.
# NOTE(review): the hard-coded index 100 assumes `nodes` holds more than 100
# items; a shorter list raises IndexError — confirm against the dataset used.
nodes[100].metadata

{'file_path': '/Users/hyeonjinho/Desktop/dev/Learned/llm/fastcampus-llamaindex-rag-design/2/data/source_files/-7473333379586622353.html',
 'file_name': '-7473333379586622353.html',
 'file_type': 'text/html',
 'file_size': 228954,
 'creation_date': '2025-04-27',
 'last_modified_date': '2025-04-27',
 'document_title': '"Navigating Complex Challenges: A Comprehensive Guide to Health, Safety, and Security in the Context of COVID-19 and Beyond"'}

In [10]:
# Case 2: build a vector index from the nodes produced by the pipeline.
node_index = VectorStoreIndex(nodes)

In [11]:
# Check the resulting index object.
node_index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x34ad51a90>

# VectorStoreIndex에 써드파티 VectorDB(Qdrant)를 백엔드 엔진으로 사용하기

In [16]:
import os

from llama_index.core.indices.vector_store.base import VectorStoreIndex
from llama_index.vector_stores.qdrant import QdrantVectorStore

import qdrant_client
from qdrant_client import models

# Create the Qdrant client. The endpoint and API key are read from the
# QDRANT_URL / QDRANT_API_KEY environment variables so real credentials never
# have to be typed into (and committed with) the notebook. When a variable is
# unset, the value falls back to the empty string this cell previously
# hard-coded, so behaviour without the variables is unchanged.
client = qdrant_client.QdrantClient(
    url=os.environ.get("QDRANT_URL", ""),
    api_key=os.environ.get("QDRANT_API_KEY", ""),
)


In [None]:
# Display `nodes` (this cell has no execution count in the export).
nodes

In [None]:
# Display `documents` (this cell has no execution count in the export).
documents

In [17]:
# Import StorageContext, used to connect the index to a storage backend.
from llama_index.core import StorageContext

# Create the Qdrant vector store on the "corona" collection.
vector_store = QdrantVectorStore(
    client=client,
    collection_name="corona",
)

# Attach the Qdrant vector store through a storage context.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

# Final hookup: build the index from the first two documents, passing the
# Qdrant-backed storage context.
index = VectorStoreIndex.from_documents(
    documents[:2],
    storage_context=storage_context,
)



# 생성된 Index 활용해서 Retriever 생성하기

In [18]:
# Derive a retriever from the index and run a retrieval for the query.
# NOTE(review): this rebinds `nodes`, replacing the list produced earlier by
# `pipeline.run(...)`.
retriever = index.as_retriever()
nodes = retriever.retrieve("what is corona?")

In [19]:
# Inspect the NodeWithScore objects returned by the retriever.
nodes

[NodeWithScore(node=TextNode(id_='25fa3619-046c-4416-91ce-69f9b1d647a9', embedding=None, metadata={'file_path': '/Users/hyeonjinho/Desktop/dev/Learned/llm/fastcampus-llamaindex-rag-design/2/data/source_files/-6401693851386763087.html', 'file_name': '-6401693851386763087.html', 'file_type': 'text/html', 'file_size': 336173, 'creation_date': '2025-04-27', 'last_modified_date': '2025-04-27'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='ca26fb13-dfaa-4d69-9161-ebcde46c0cec', node_type='4', metadata={'file_path': '/Users/hyeonjinho/Desktop/dev/Learned/llm/fastcampus-llamaindex-rag-design/2/data/source_files/-6401693851386763087.html', 'file_name': '-6401693851386763087.html', 'file_type': 'text/html', 'file_

In [20]:
# Go straight to a query engine built from the index and run the same query.
query_engine = index.as_query_engine()
response = query_engine.query("what is corona?")

In [21]:
# Display the query engine's response object.
response

Response(response='Corona is a highly contagious and deadly virus that can cause a respiratory illness known as Covid-19.', source_nodes=[NodeWithScore(node=TextNode(id_='25fa3619-046c-4416-91ce-69f9b1d647a9', embedding=None, metadata={'file_path': '/Users/hyeonjinho/Desktop/dev/Learned/llm/fastcampus-llamaindex-rag-design/2/data/source_files/-6401693851386763087.html', 'file_name': '-6401693851386763087.html', 'file_type': 'text/html', 'file_size': 336173, 'creation_date': '2025-04-27', 'last_modified_date': '2025-04-27'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='ca26fb13-dfaa-4d69-9161-ebcde46c0cec', node_type='4', metadata={'file_path': '/Users/hyeonjinho/Desktop/dev/Learned/llm/fastcampus-llamain