In [None]:
import os

import faiss
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.faiss import FaissVectorStore

# Output vector size of each supported embedding model, keyed by model name.
# The FAISS index must be created with exactly this dimension, so every model
# that can be selected via `model_name` needs an entry here.
EMBEDDING_DIMENSIONS = {
    "sentence-transformers/all-MiniLM-L6-v2": 384,
    "sentence-transformers/all-mpnet-base-v2": 768,
    "text-embedding-3-small": 1536,
    # The embedding-model cell accepts any "text-embedding-3*" name, so the
    # large variant needs a dimension as well (default output size: 3072).
    "text-embedding-3-large": 3072,
}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Configuration -----------------------------------------------------------
# Root of the project checkout. Set the INTERACTIVE_RAG_ROOT environment
# variable to run this notebook on another machine; when it is unset (or empty)
# the original author's location is used, so existing runs behave as before.
project_root = (
    os.environ.get("INTERACTIVE_RAG_ROOT") or "/Users/goeksu/dev/interactive_rag"
)
# Directory the source documents are read from.
input_dir = os.path.join(project_root, "data", "landing")
# Directory under which persisted indexes are written (one sub-path per model).
persist_root = os.path.join(project_root, "data", "index")
# Chunk size handed to the node parser; the chunk overlap is derived from it.
chunk_size = 256
# Embedding model to use; must be a key of EMBEDDING_DIMENSIONS.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

# --- Ingestion ---------------------------------------------------------------
# Load every file found in `input_dir`; the stock directory reader is enough
# for this task.
reader = SimpleDirectoryReader(input_dir=input_dir)
documents = reader.load_data()

# --- Chunking ----------------------------------------------------------------
# Plain sliding-window split: fixed-size chunks where consecutive chunks
# overlap by 20% of the chunk size.
chunk_overlap = int(chunk_size * 0.2)
node_parser = SimpleNodeParser.from_defaults(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
nodes = node_parser.get_nodes_from_documents(documents)

: 

In [None]:
# Initialize embedding model
# Dispatch on the model name: names starting with "text-embedding-3" are
# handed to OpenAIEmbedding, every other name is passed to HuggingFaceEmbedding
# as its `model_name`. Both classes are imported in the first cell; without
# those imports this cell fails with NameError on a fresh kernel.
if model_name.startswith("text-embedding-3"):
    embedding_model = OpenAIEmbedding(model=model_name)
else:
    embedding_model = HuggingFaceEmbedding(model_name=model_name)

In [None]:
# Target location of the persisted index: <persist_root>/<model_name>.
persist_dir = os.path.join(persist_root, model_name)

# Set up a flat L2 FAISS index sized to the embedding model's output dimension
# and wrap it as the vector store of a fresh storage context.
d = EMBEDDING_DIMENSIONS[model_name]
faiss_index = faiss.IndexFlatL2(d)
vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the vector index over the parsed nodes using the chosen embedding model.
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    embed_model=embedding_model,
)

# Write the index's storage context to disk.
index.storage_context.persist(persist_dir=persist_dir)

In [None]:
# Interact with the index
# Build a query engine on top of the index, using gpt-4o-mini as the LLM.
# The engine is bound to a name so that it can actually be used afterwards
# (e.g. `query_engine.query("...")`); previously it was created and discarded.
query_engine = index.as_query_engine(llm=OpenAI(model="gpt-4o-mini"))
# Keep the engine as the cell's last expression so it is still displayed.
query_engine