In [1]:
# file reader resources
from llama_index.readers.file import FlatReader
from pathlib import Path

In [2]:
# parser resources
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.core.schema import MetadataMode

In [None]:
# vector db resources
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext

In [4]:
# llm resources
import os
import openai
from dotenv import load_dotenv
# Load variables from a .env file (if one is found) into the process environment,
# so the API key never has to be hard-coded in the notebook.
load_dotenv()
# Indexing os.environ directly raises KeyError right here if the key is not configured.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [5]:
# embedding resources
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

In [19]:
# advanced metadata augmentation
from llama_index.core.extractors import SummaryExtractor
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter

In [7]:
# custom retriever resources
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

In [8]:
# Setting up the LLM and embedding models on the global llama_index `Settings` object.
# Functions below read `Settings.llm` / `Settings.embed_model` instead of taking models as arguments.
Settings.llm = OpenAI(model="gpt-4.1-nano")
#Settings.llm = OpenAI(model="gpt-4o-mini")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

In [9]:
# Demo questions (written in Spanish) about the Fakesoft and Ficticia contracts.
# The first entry deliberately asks about both companies in a single query.
queries = [
    "¿Cómo es el reajuste por IPC en Fakesoft? ¿Cómo es el reajuste por IPC en Ficticia?",
    "¿Cuál es el cronograma para pagar establecido en el contrato con Fakesoft?",
    "¿Qué tipo de almacenamiento utiliza Ficticia?"
]

#### reading files from source

In [10]:
def read_files(base_path: str = "../data/", files: "list[str] | None" = None) -> list:
    """Load markdown contract files from disk as llama_index documents.

    Args:
        base_path: Directory that contains the files. Defaults to ``"../data/"``.
        files: File names (relative to ``base_path``) to read. When ``None``,
            the three demo contract files are used. An explicit empty list
            reads nothing.

    Returns:
        A flat list with every document returned by ``FlatReader.load_data``
        for each file, in the order the files were given.
    """
    if files is None:
        files = [
            "contrato_latam_fakesoft.md",
            "contrato_latam_ficticia_anexo_b_especificaciones_tecnicas.md",
            "contrato_latam_ficticia.md",
        ]
    # Joining with Path works whether or not base_path ends with a separator.
    base_dir = Path(base_path)
    # One reader instance is enough; it is reused for every file.
    reader = FlatReader()
    md_docs = []
    for file in files:
        rel_path = base_dir / file
        print(f"reading file {rel_path}...")
        md_docs.extend(reader.load_data(rel_path))
    return md_docs

#### parsing / chunking

In [11]:
def chunk_documents(documents: list) -> list:
    """Split documents into nodes with a `MarkdownNodeParser`.

    The source documents are well-structured markdown, so the markdown-aware
    parser is used to turn them into nodes.
    """
    return MarkdownNodeParser().get_nodes_from_documents(documents)

#### vector db management

In [12]:
def get_or_create_chroma_index(nodes: list, collection: str, mode: str = "load"):
    """Build or load a `VectorStoreIndex` backed by a persistent Chroma collection.

    Args:
        nodes: Nodes to index. Only used when ``mode == "create"``.
        collection: Name of the Chroma collection; it is fetched, or created
            if it does not exist yet.
        mode: ``"create"`` builds the index from ``nodes`` using
            ``Settings.embed_model``; ``"load"`` (default) wraps the existing
            vector store without indexing anything new.

    Returns:
        The resulting ``VectorStoreIndex``.

    Raises:
        ValueError: If ``mode`` is neither ``"create"`` nor ``"load"``.
    """
    # Validate first: an unsupported mode used to return None silently, and only
    # after the Chroma client and collection had already been created.
    if mode not in ("create", "load"):
        raise ValueError(f"unsupported mode {mode!r}; expected 'create' or 'load'")

    persist_dir = "../chroma_index"
    db = chromadb.PersistentClient(path=persist_dir)
    chroma_collection = db.get_or_create_collection(collection)
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    if mode == "create":
        # Announce the build before it starts, not after it has finished.
        print(f"building vector index: {persist_dir}, collection: {collection}")
        return VectorStoreIndex(
            nodes=nodes,
            embed_model=Settings.embed_model,
            storage_context=storage_context,
        )

    index = VectorStoreIndex.from_vector_store(
        vector_store, storage_context=storage_context
    )
    print(f"vector index loaded from storage: {persist_dir}, collection: {collection}")
    return index

#### metadata management

In [13]:
def organize_node_structure(nodes: list) -> list:
    """Give every node a text template that separates metadata from content.

    Printing a node shows no clear boundary between its metadata and the text
    extracted from the document. Overwriting ``text_template`` fixes how both
    parts are laid out in the text that is sent to the embedding model.

    The nodes are modified in place; a new list holding the same node objects
    is returned.
    """
    template = "METADATA:\n{metadata_str}\n---\nCONTENT:\n{content}"

    def _apply_template(item):
        # overwriting property:
        item.text_template = template
        return item

    return [_apply_template(item) for item in nodes]

def agument_metadata(nodes: list) -> list:
    """Run the metadata augmentation steps over ``nodes`` and return the result.

    The steps could be merged into a single pass; they are kept as separate
    functions only to make each one explicit.
    """
    with_company = company_augmentation(nodes)
    return exclude_metadata(with_company)

def company_augmentation(nodes: list) -> list:
    """Add a ``company`` metadata entry to each node, derived from its ``filename``.

    In a real-world pipeline this could be computed or extracted during
    ingestion; here it is a hard-coded mapping for simplicity. File names that
    are not in the mapping get ``None`` as their company.

    The nodes are modified in place and the same list is returned.
    """
    company_by_source = {
        'contrato_latam_fakesoft.md': 'Fakesoft',
        'contrato_latam_ficticia_anexo_b_especificaciones_tecnicas.md': 'Ficticia',
        'contrato_latam_ficticia.md': 'Ficticia'
    }
    for item in nodes:
        item.metadata["company"] = company_by_source.get(item.metadata["filename"])
    return nodes

def exclude_metadata(nodes: list) -> list:
    """Mark the ``extension`` metadata key as excluded for both embedding and LLM use.

    The file extension carries no useful signal for this experiment; leaving
    it out avoids unnecessary noise and fake relationships between nodes.

    The nodes are modified in place and the same list is returned.
    """
    exclusion_attrs = ("excluded_embed_metadata_keys", "excluded_llm_metadata_keys")
    for item in nodes:
        for attr_name in exclusion_attrs:
            excluded_keys = getattr(item, attr_name)
            # Only append when missing so repeated calls do not add duplicates.
            if "extension" not in excluded_keys:
                excluded_keys.append("extension")
    return nodes

#### chunking & advanced metadata augmentation

In [None]:

def chunk_advanced_augmented_metadata(nodes: list) -> list:
    """Re-chunk ``nodes`` by sentence and run an LLM summary extractor over the chunks.

    An `IngestionPipeline` applies two transformations in order: a
    `SentenceSplitter` (chunk size 200, overlap 25) and a `SummaryExtractor`
    driven by ``Settings.llm`` with a prompt asking for a summary in Spanish.

    Returns whatever the pipeline run produces for ``nodes``.
    """
    # Same text as a triple-quoted block, spelled out so the whitespace is visible.
    prompt_template = (
        "\n"
        "    Here is the content of the section:\n{context_str}\n\n"
        "Summarize the key topics and entities of the section using Spanish. \n: \n"
        "    "
    )
    transformations = [
        SentenceSplitter(chunk_size=200, chunk_overlap=25, paragraph_separator="\n\n"),
        SummaryExtractor(llm=Settings.llm, prompt_template=prompt_template),
    ]
    return IngestionPipeline(transformations=transformations).run(
        nodes=nodes, show_progress=True
    )

#### retrieval function

In [15]:
def retrieve(index: VectorStoreIndex, query: str) -> dict:
    """Answer ``query`` with the default query engine built from ``index``.

    Args:
        index: Vector index to query. It is a parameter so that different
            indexes can be swapped in for demo purposes.
        query: Natural-language question.

    Returns:
        A dict with two keys: ``"answer"`` (the response object returned by
        the query engine) and ``"relevant_nodes"`` (a list of the response's
        source nodes). Both values are ``None`` if the engine returns a falsy
        response.
    """
    query_engine = index.as_query_engine(
        # The LLM is passed as `llm=`. The previous `model=` keyword is not a
        # parameter of `as_query_engine` and was only absorbed by **kwargs.
        llm=Settings.llm,
        # Adding metadata filters can be a great idea when multiple companies are involved in the same index.
        # Left commented out for now (the filter classes would need to be imported first):
        # filters=MetadataFilters(
        # filters=[
        #     MetadataFilter(key="company", value="Ficticia", operator=FilterOperator.EQ),
        # ]
        # ),
    )
    response = query_engine.query(query)
    response_dict = {"answer": None, "relevant_nodes": None}
    if response:
        response_dict["answer"] = response
        response_dict["relevant_nodes"] = list(response.source_nodes)
    return response_dict


#### Improved retrieval function with custom retriever

In [16]:
def custom_retriever(index: VectorStoreIndex, query: str, similarity_top_k: int = 5) -> dict:
    """Answer ``query`` with a query engine assembled step by step.

    Building the retriever and the response synthesizer explicitly gives more
    granular control than ``index.as_query_engine``.

    Args:
        index: Vector index to retrieve from.
        query: Natural-language question.
        similarity_top_k: Number of most similar nodes to retrieve. Defaults to 5.

    Returns:
        A dict with two keys: ``"answer"`` (the response object returned by
        the query engine) and ``"relevant_nodes"`` (a list of the response's
        source nodes). Both values are ``None`` if the engine returns a falsy
        response.
    """
    retriever = VectorIndexRetriever(
        index=index,
        # Adding metadata filters can be a great idea when multiple companies are involved in the same index.
        # Left commented out for now (the filter classes would need to be imported first):
        # filters=MetadataFilters(
        # filters=[
        #     MetadataFilter(key="company", value="Ficticia", operator=FilterOperator.EQ),
        # ]
        # ),
        similarity_top_k=similarity_top_k,
    )
    response_synthesizer = get_response_synthesizer(
        llm=Settings.llm,
        response_mode="tree_summarize",
    )

    query_engine = RetrieverQueryEngine(
        retriever=retriever,
        response_synthesizer=response_synthesizer,
    )

    response = query_engine.query(query)
    response_dict = {"answer": None, "relevant_nodes": None}
    if response:
        response_dict["answer"] = response
        response_dict["relevant_nodes"] = list(response.source_nodes)
    return response_dict

#### helper functions

In [17]:
def print_node(node) -> None:
    """Print ``node``'s content as rendered with ``MetadataMode.EMBED``."""
    rendered = node.get_content(metadata_mode=MetadataMode.EMBED)
    print(rendered)

#### Main function

In [18]:
def execute_queries(index: VectorStoreIndex, queries: list, custom_retr: bool = False) -> None:
    """Main entrypoint: run every query against ``index`` and print the results.

    Args:
        index: Vector index to query.
        queries: Questions to run, one at a time.
        custom_retr: When True, use ``custom_retriever``; otherwise use
            ``retrieve`` (default).

    For each query this prints the query, the answer, and the content of each
    relevant node (via ``print_node``).
    """
    separator = "=================================================="
    run_query = custom_retriever if custom_retr else retrieve
    for query in queries:
        print(separator)
        print(f"query: {query}")
        print(separator)
        response = run_query(index=index, query=query)
        print("answer:")
        print(response["answer"])
        print(separator)
        print("relevant_nodes:")
        # Both retrievers leave "relevant_nodes" as None when there is no
        # response; treat that as "no nodes" instead of failing on iteration.
        for r in response["relevant_nodes"] or []:
            print_node(r)
        print(separator)
        print("\n\n")