# **Installation & Setup**

In [1]:
# Install core packages
!pip install -qU langchain langchain-community langchain-experimental transformers sentence-transformers faiss-cpu rank_bm25 llama-cpp-python

# Install additional packages for GraphRAG and data handling
!pip install -qU networkx scipy pandas

# Download the quantized LLM model
!wget -q -O llama-2-7b-chat.Q4_0.gguf https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_0.gguf

# **1. Data Preparation & Common Setup**

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Sample financial news corpus
corpus = [
    "Apple Inc. reported record quarterly revenue of $123.9 billion for Q1 2024, driven by strong iPhone sales. Net profit was $34.6 billion.",
    "The Federal Reserve is expected to hold interest rates steady at 5.5% in its upcoming meeting, according to economists.",
    "Tesla's stock price fell by 6% after the company missed delivery estimates for Q4 2023. The company delivered 485,000 vehicles.",
    "Microsoft announced a new $50 billion stock buyback program and increased its dividend by 10%. CEO Satya Nadella praised the company's cloud growth.",
    "The global semiconductor shortage is projected to ease by mid-2024, as new manufacturing capacity comes online, says a report from Gartner.",
    "Amazon Web Services (AWS) signed a $1 billion contract with a major enterprise client, highlighting the continued growth of cloud computing.",
    "The SEC approved the first spot Bitcoin ETFs, a landmark decision for cryptocurrency integration with traditional finance.",
    "Unemployment claims in the US dropped to 210,000 last week, signaling a resilient labor market amidst economic uncertainties.",
    "Oil prices surged by 4% to $85 per barrel following geopolitical tensions in the Middle East and production cuts by OPEC+.",
    "The Bank of Japan maintained its ultra-loose monetary policy, keeping interest rates negative and yield caps in place for now."
]

# Test question
test_question = "What was Apple's quarterly revenue?"

print(f"Corpus size: {len(corpus)} documents")
print(f"Test question: {test_question}")

# Create Documents
docs = [Document(page_content=text) for text in corpus]

# Split documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
split_docs = text_splitter.split_documents(docs)

# Initialize Embeddings
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Create a central FAISS vector store for most architectures
vectorstore = FAISS.from_documents(split_docs, embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3}) # Top 3 results

Corpus size: 10 documents
Test question: What was Apple's quarterly revenue?


  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Initialize the quantized LLM
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

llm = LlamaCpp(
    model_path="llama-2-7b-chat.Q4_0.gguf",
    temperature=0.1,
    max_tokens=512,
    top_p=1,
    callback_manager=callback_manager,
    verbose=True,
    n_ctx=4096,
    n_gpu_layers=40,
    n_batch=512,
    stop=["</s>", "USER:", "ASSISTANT:"]
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from llama-2-7b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 32

# **2. Define the Architectures**

In [4]:
from langchain import hub
from langchain.chains import RetrievalQA, LLMChain
from langchain.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser

## **2.1. Naive RAG**

In [5]:
def naive_rag(query):
    """Standard RAG: Retrieve relevant chunks and generate an answer."""
    prompt = PromptTemplate(
        template="""<s>[INST] <<SYS>>
        You are a helpful, respectful, and honest financial assistant. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
        <</SYS>>
        Context: {context}
        Question: {question}
        Provide a concise and accurate answer: [/INST]""",
        input_variables=["context", "question"]
    )

    # Retrieve context
    retrieved_docs = retriever.invoke(query)
    context = "\n".join([doc.page_content for doc in retrieved_docs])

    # Generate answer
    rag_chain = prompt | llm | StrOutputParser()
    answer = rag_chain.invoke({"context": context, "question": query})
    return answer, context

## **2.2. HyDE**

In [6]:
def hyde_rag(query):
    """Hypothetical Document Embeddings: Generate a hypothetical answer first, then use it for retrieval."""
    # Step 1: Generate a hypothetical answer
    hyde_prompt = PromptTemplate(
        template="""<s>[INST] <<SYS>>
        You are an expert financial analyst. Write a hypothetical paragraph that answers the following question. The paragraph should be informative and contain key entities and facts.
        <</SYS>>
        Question: {question}
        Hypothetical Answer: [/INST]""",
        input_variables=["question"]
    )
    hyde_chain = hyde_prompt | llm | StrOutputParser()
    hypothetical_doc = hyde_chain.invoke({"question": query})

    # Step 2: Use the hypothetical doc for retrieval
    hypothetical_retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    retrieved_docs = hypothetical_retriever.invoke(hypothetical_doc)
    context = "\n".join([doc.page_content for doc in retrieved_docs])

    # Step 3: Generate the final answer
    answer_prompt = PromptTemplate(
        template="""<s>[INST] <<SYS>>
        You are a helpful assistant. Use the following context to answer the question.
        <</SYS>>
        Context: {context}
        Question: {question}
        Concise Answer: [/INST]""",
        input_variables=["context", "question"]
    )
    answer_chain = answer_prompt | llm | StrOutputParser()
    answer = answer_chain.invoke({"context": context, "question": query})
    return answer, context, hypothetical_doc

## **2.3. Corrective RAG (CRAG) - Simplified**

In [7]:
def crag_rag(query):
    """Corrective RAG: Evaluate the retrieved documents before generating an answer."""
    # Retrieve context
    retrieved_docs = retriever.invoke(query)
    context_text = "\n".join([doc.page_content for doc in retrieved_docs])

    # **Simplified Corrective Step**: Use LLM to evaluate context relevance
    correction_prompt = PromptTemplate(
        template="""<s>[INST] <<SYS>>
        Evaluate if the following retrieved context is relevant and sufficient to answer the question. Answer ONLY 'YES' or 'NO'.
        <</SYS>>
        Question: {question}
        Context: {context}
        Evaluation: [/INST]""",
        input_variables=["context", "question"]
    )
    correction_chain = correction_prompt | llm | StrOutputParser()
    evaluation = correction_chain.invoke({"context": context_text, "question": query})

    if "YES" in evaluation.upper():
        # If context is good, proceed with standard RAG
        answer_prompt = PromptTemplate(
            template="""<s>[INST] <<SYS>>
            Answer the question using the context.
            <</SYS>>
            Context: {context}
            Question: {question}
            Answer: [/INST]""",
            input_variables=["context", "question"]
        )
        answer_chain = answer_prompt | llm | StrOutputParser()
        answer = answer_chain.invoke({"context": context_text, "question": query})
        corrective_action = "Proceeded with standard generation."
    else:
        # If context is bad, use web search (fallback) - Simplified here to just use LLM knowledge
        corrective_action = "Context deemed insufficient. Using LLM's internal knowledge (fallback mode)."
        fallback_prompt = PromptTemplate(
            template="""<s>[INST] <<SYS>>
            You are a knowledgeable assistant. Answer the question based on your own knowledge.
            <</SYS>>
            Question: {question}
            Answer: [/INST]""",
            input_variables=["question"]
        )
        fallback_chain = fallback_prompt | llm | StrOutputParser()
        answer = fallback_chain.invoke({"question": query})
        context_text = "N/A - Used internal knowledge"

    return answer, context_text, corrective_action

## **2.4. GraphRAG - Simplified (Using TextRank-like concept)**

In [8]:
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def build_graph_from_docs(documents, embeddings_model, similarity_threshold=0.7):
    """Build a simple knowledge graph from documents based on semantic similarity."""
    G = nx.Graph()
    text_list = [doc.page_content for doc in documents]
    # Add nodes
    for i, text in enumerate(text_list):
        G.add_node(i, text=text)

    # Get embeddings for all documents
    doc_embeddings = embeddings_model.embed_documents(text_list)
    doc_embeddings = np.array(doc_embeddings)

    # Add edges based on cosine similarity
    for i in range(len(doc_embeddings)):
        for j in range(i+1, len(doc_embeddings)):
            sim = cosine_similarity([doc_embeddings[i]], [doc_embeddings[j]])[0][0]
            if sim > similarity_threshold:
                G.add_edge(i, j, weight=sim)
    return G

def graphrag_rag(query, all_docs=split_docs):
    """GraphRAG: Retrieve information using a knowledge graph."""
    # Build graph (in a real scenario, this would be pre-built)
    G = build_graph_from_docs(all_docs, embeddings)

    # Embed the query
    query_embedding = embeddings.embed_query(query)
    query_embedding = np.array(query_embedding)

    # Find the most relevant node to the query
    doc_embeddings = embeddings.embed_documents([doc.page_content for doc in all_docs])
    doc_embeddings = np.array(doc_embeddings)
    similarities = cosine_similarity([query_embedding], doc_embeddings)[0]
    most_relevant_node_idx = np.argmax(similarities)

    # Get the context from the most relevant node and its neighbors
    context_nodes = [most_relevant_node_idx]
    context_nodes.extend(list(G.neighbors(most_relevant_node_idx)))
    context = "\n".join([all_docs[i].page_content for i in context_nodes])

    # Generate answer
    prompt = PromptTemplate(
        template="""<s>[INST] <<SYS>>
        Use the following context from a knowledge graph to answer the question.
        <</SYS>>
        Context: {context}
        Question: {question}
        Answer: [/INST]""",
        input_variables=["context", "question"]
    )
    rag_chain = prompt | llm | StrOutputParser()
    answer = rag_chain.invoke({"context": context, "question": query})
    return answer, context

## **2.5. Multimodal RAG (Basic Text + Table Understanding)**

In [9]:
table_text = "| Company | Q1 2024 Revenue | Net Profit |\n|---------|----------------|------------|\n| Apple | $123.9B | $34.6B |\n| Microsoft | $62.0B | $21.9B |\n| Tesla | $25.1B | $2.7B |"
multimodal_corpus = corpus + [table_text]
multimodal_docs = [Document(page_content=text) for text in multimodal_corpus]
multimodal_split_docs = text_splitter.split_documents(multimodal_docs)
multimodal_vectorstore = FAISS.from_documents(multimodal_split_docs, embeddings)
multimodal_retriever = multimodal_vectorstore.as_retriever(search_kwargs={"k": 4}) # Retrieve more chunks

def multimodal_rag(query):
    """Multimodal RAG: Handles both textual and tabular data."""
    retrieved_docs = multimodal_retriever.invoke(query)
    context = "\n".join([doc.page_content for doc in retrieved_docs])

    prompt = PromptTemplate(
        template="""<s>[INST] <<SYS>>
        You are an assistant skilled in analyzing both text and tables. Use the following context, which may contain text paragraphs or markdown tables, to answer the question.
        <</SYS>>
        Context: {context}
        Question: {question}
        Provide a precise answer: [/INST]""",
        input_variables=["context", "question"]
    )
    rag_chain = prompt | llm | StrOutputParser()
    answer = rag_chain.invoke({"context": context, "question": query})
    return answer, context

## **2.6. Hybrid RAG (Dense + Sparse Retrieval)**

In [10]:
from rank_bm25 import BM25Okapi
import numpy as np

class HybridRetriever:
    def __init__(self, vector_retriever, text_list):
        self.vector_retriever = vector_retriever
        self.bm25 = BM25Okapi([text.split() for text in text_list])

    def retrieve(self, query, k=3):
        # Dense Retrieval
        dense_docs = self.vector_retriever.invoke(query)
        dense_context = "\n".join([doc.page_content for doc in dense_docs])

        # Sparse Retrieval (BM25)
        tokenized_query = query.split()
        bm25_scores = self.bm25.get_scores(tokenized_query)
        top_bm25_indices = np.argsort(bm25_scores)[::-1][:k]
        sparse_context = "\n".join([corpus[i] for i in top_bm25_indices])

        # Combine contexts
        hybrid_context = f"[DENSE RETRIEVAL RESULTS]:\n{dense_context}\n\n[SPARSE RETRIEVAL RESULTS]:\n{sparse_context}"
        return hybrid_context

# Initialize
text_list_for_bm25 = [doc.page_content for doc in split_docs]
hybrid_retriever_obj = HybridRetriever(retriever, text_list_for_bm25)

def hybrid_rag(query):
    """Hybrid RAG: Combines dense (vector) and sparse (BM25) retrieval."""
    context = hybrid_retriever_obj.retrieve(query)

    prompt = PromptTemplate(
        template="""<s>[INST] <<SYS>>
        Use the following context from two different retrieval methods to answer the question.
        <</SYS>>
        Context: {context}
        Question: {question}
        Answer: [/INST]""",
        input_variables=["context", "question"]
    )
    rag_chain = prompt | llm | StrOutputParser()
    answer = rag_chain.invoke({"context": context, "question": query})
    return answer, context

## **2.7. Adaptive RAG**

In [11]:
def adaptive_rag(query):
    """Adaptive RAG: Decides whether to retrieve or not based on the query."""
    # Decision Prompt
    decision_prompt = PromptTemplate(
        template="""<s>[INST] <<SYS>>
        Decide if the following question requires retrieving external information from a database of financial news to be answered accurately. Answer ONLY 'YES' or 'NO'.
        <</SYS>>
        Question: {question}
        Decision: [/INST]""",
        input_variables=["question"]
    )
    decision_chain = decision_prompt | llm | StrOutputParser()
    decision = decision_chain.invoke({"question": query})

    if "YES" in decision.upper():
        # Use Naive RAG
        retrieved_docs = retriever.invoke(query)
        context = "\n".join([doc.page_content for doc in retrieved_docs])
        action = "Decision: Retrieval used."
    else:
        # Answer from internal knowledge
        context = "N/A"
        action = "Decision: No retrieval needed."

    # Generate answer (use the same prompt for both for fairness)
    answer_prompt = PromptTemplate(
        template="""<s>[INST] <<SYS>>
        You are a helpful assistant. Use the context if provided to answer the question. If no context is provided, use your own knowledge.
        <</SYS>>
        Context: {context}
        Question: {question}
        Answer: [/INST]""",
        input_variables=["context", "question"]
    )
    answer_chain = answer_prompt | llm | StrOutputParser()
    answer = answer_chain.invoke({"context": context, "question": query})
    return answer, context, action

## **2.8. Agentic RAG - Simplified (Single Agent with Plan-and-Act)**

In [12]:
def agentic_rag(query):
    """Agentic RAG: Uses a planning step to break down the query before retrieval and generation."""
    # Planning Step
    planner_prompt = PromptTemplate(
        template="""<s>[INST] <<SYS>>
        You are a query planner. Break down the following complex question into 2-3 simpler, more specific sub-questions that would help retrieve relevant information. List them one per line.
        <</SYS>>
        Complex Question: {question}
        Sub-questions: [/INST]""",
        input_variables=["question"]
    )
    planner_chain = planner_prompt | llm | StrOutputParser()
    sub_questions_text = planner_chain.invoke({"question": query})
    sub_questions = [q.strip() for q in sub_questions_text.split('\n') if q.strip()]

    # Act Step: Retrieve for each sub-question
    collected_context = ""
    for sub_q in sub_questions:
        retrieved_docs = retriever.invoke(sub_q)
        context_for_sub_q = "\n".join([doc.page_content for doc in retrieved_docs])
        collected_context += f"\n\n# Information relevant to sub-question: '{sub_q}'\n{context_for_sub_q}"

    # Final Answer Generation
    synthesizer_prompt = PromptTemplate(
        template="""<s>[INST] <<SYS>>
        You are a synthesis agent. You have broken down a complex question into parts and retrieved information for each part. Synthesize all the information below into a comprehensive and accurate final answer for the original question.
        <</SYS>>
        Original Question: {question}
        Retrieved Information for all sub-questions: {context}
        Comprehensive Final Answer: [/INST]""",
        input_variables=["context", "question"]
    )
    synthesizer_chain = synthesizer_prompt | llm | StrOutputParser()
    final_answer = synthesizer_chain.invoke({"context": collected_context, "question": query})
    return final_answer, collected_context, sub_questions

# **3. Run the Comparison**

In [13]:
import time
from tabulate import tabulate
import pandas as pd

# Initialize results list for single question
results = []

# Function to run a single architecture and store results
def run_architecture(name, func, has_extra_info=False):
    print(f"\n--- Running {name} ---")
    start_time = time.time()

    try:
        if has_extra_info:
            answer, context, extra_info = func(test_question)
        else:
            answer, context = func(test_question)
            extra_info = "N/A"
    except Exception as e:
        answer = f"Error: {str(e)}"
        context = "N/A"
        extra_info = "N/A"

    end_time = time.time()
    elapsed_time = round(end_time - start_time, 2)

    results.append({
        'Architecture': name,
        'Answer': answer,
        'Context Used': context[:500] + "..." if len(context) > 500 else context, # Truncate long context
        'Extra Info': extra_info,
        'Time (s)': elapsed_time
    })

    print(f"Completed in {elapsed_time}s")

# Run all architectures
run_architecture("Naive RAG", naive_rag)
run_architecture("HyDE", hyde_rag, has_extra_info=True)
run_architecture("Corrective RAG", crag_rag, has_extra_info=True)
run_architecture("GraphRAG", graphrag_rag)
run_architecture("Multimodal RAG", multimodal_rag)
run_architecture("Hybrid RAG", hybrid_rag)
run_architecture("Adaptive RAG", adaptive_rag, has_extra_info=True)
run_architecture("Agentic RAG", agentic_rag, has_extra_info=True)

# Create DataFrame
results_df = pd.DataFrame(results)


--- Running Naive RAG ---




  Thank you for the question! Based on the context provided, Apple's quarterly revenue for Q1 2024 was $123.9 billion.

llama_perf_context_print:        load time =   48669.46 ms
llama_perf_context_print: prompt eval time =   48668.76 ms /   206 tokens (  236.26 ms per token,     4.23 tokens per second)
llama_perf_context_print:        eval time =   22030.54 ms /    37 runs   (  595.42 ms per token,     1.68 tokens per second)
llama_perf_context_print:       total time =   70782.41 ms /   243 tokens
llama_perf_context_print:    graphs reused =         35
Llama.generate: 13 prefix-match hit, remaining 64 prompt tokens to eval


Completed in 70.83s

--- Running HyDE ---
  As an expert financial analyst, I can tell you that Apple's quarterly revenue for the most recent quarter (Q4 of fiscal year 2023) was $71.5 billion. This represents a 12% increase from the same quarter the previous year, driven by strong demand for the company's iPhones, Macs, and other products. The revenue figure includes both hardware and services revenue, with the iPhone segment generating $34.6 billion in revenue, up 10% from the same period last year. The Mac segment contributed $7.2 billion in revenue, a 25% increase from the same quarter last year. Other products, including Apple Watch, AirPods, and HomePod, generated $2.3 billion in revenue, a 46% increase from the same quarter last year. Services revenue, which includes the App Store, Apple Music, and Apple TV+, reached $10.8 billion, a 15% increase from the same quarter last year. Overall, Apple's strong performance in Q4 reflects the company's continued ability to innovate and de

llama_perf_context_print:        load time =   48669.46 ms
llama_perf_context_print: prompt eval time =   12876.67 ms /    64 tokens (  201.20 ms per token,     4.97 tokens per second)
llama_perf_context_print:        eval time =  154608.80 ms /   263 runs   (  587.87 ms per token,     1.70 tokens per second)
llama_perf_context_print:       total time =  168124.96 ms /   327 tokens
llama_perf_context_print:    graphs reused =        254
Llama.generate: 13 prefix-match hit, remaining 169 prompt tokens to eval


  According to the context provided, Apple's quarterly revenue for Q1 2024 was $123.9 billion.

llama_perf_context_print:        load time =   48669.46 ms
llama_perf_context_print: prompt eval time =   30692.37 ms /   169 tokens (  181.61 ms per token,     5.51 tokens per second)
llama_perf_context_print:        eval time =   18038.74 ms /    31 runs   (  581.89 ms per token,     1.72 tokens per second)
llama_perf_context_print:       total time =   48801.39 ms /   200 tokens
llama_perf_context_print:    graphs reused =         29
Llama.generate: 11 prefix-match hit, remaining 174 prompt tokens to eval


Completed in 217.2s

--- Running Corrective RAG ---
  YES

llama_perf_context_print:        load time =   48669.46 ms
llama_perf_context_print: prompt eval time =   33012.83 ms /   174 tokens (  189.73 ms per token,     5.27 tokens per second)
llama_perf_context_print:        eval time =    1145.98 ms /     2 runs   (  572.99 ms per token,     1.75 tokens per second)
llama_perf_context_print:       total time =   34163.44 ms /   176 tokens
llama_perf_context_print:    graphs reused =          1
Llama.generate: 10 prefix-match hit, remaining 153 prompt tokens to eval


  According to the context, Apple's quarterly revenue for Q1 2024 was $123.9 billion.

llama_perf_context_print:        load time =   48669.46 ms
llama_perf_context_print: prompt eval time =   28548.03 ms /   153 tokens (  186.59 ms per token,     5.36 tokens per second)
llama_perf_context_print:        eval time =   17905.34 ms /    30 runs   (  596.84 ms per token,     1.68 tokens per second)
llama_perf_context_print:       total time =   46531.61 ms /   183 tokens
llama_perf_context_print:    graphs reused =         28


Completed in 80.73s

--- Running GraphRAG ---


Llama.generate: 10 prefix-match hit, remaining 89 prompt tokens to eval


  Based on the context provided, Apple's quarterly revenue for Q1 2024 was $123.9 billion.

llama_perf_context_print:        load time =   48669.46 ms
llama_perf_context_print: prompt eval time =   16709.00 ms /    89 tokens (  187.74 ms per token,     5.33 tokens per second)
llama_perf_context_print:        eval time =   17879.16 ms /    31 runs   (  576.75 ms per token,     1.73 tokens per second)
llama_perf_context_print:       total time =   34657.11 ms /   120 tokens
llama_perf_context_print:    graphs reused =         29
Llama.generate: 11 prefix-match hit, remaining 267 prompt tokens to eval


Completed in 34.89s

--- Running Multimodal RAG ---
  Based on the context provided, Apple's quarterly revenue for Q1 2024 was $123.9 billion.

llama_perf_context_print:        load time =   48669.46 ms
llama_perf_context_print: prompt eval time =   49345.97 ms /   267 tokens (  184.82 ms per token,     5.41 tokens per second)
llama_perf_context_print:        eval time =   18588.62 ms /    31 runs   (  599.63 ms per token,     1.67 tokens per second)
llama_perf_context_print:       total time =   68012.98 ms /   298 tokens
llama_perf_context_print:    graphs reused =         29
Llama.generate: 11 prefix-match hit, remaining 301 prompt tokens to eval


Completed in 68.05s

--- Running Hybrid RAG ---
  Based on the context provided, Apple's quarterly revenue was $123.9 billion for Q1 2024.

llama_perf_context_print:        load time =   48669.46 ms
llama_perf_context_print: prompt eval time =   54536.01 ms /   301 tokens (  181.18 ms per token,     5.52 tokens per second)
llama_perf_context_print:        eval time =   18560.43 ms /    31 runs   (  598.72 ms per token,     1.67 tokens per second)
llama_perf_context_print:       total time =   73168.50 ms /   332 tokens
llama_perf_context_print:    graphs reused =         29
Llama.generate: 11 prefix-match hit, remaining 63 prompt tokens to eval


Completed in 73.2s

--- Running Adaptive RAG ---
  YES

llama_perf_context_print:        load time =   48669.46 ms
llama_perf_context_print: prompt eval time =   12062.90 ms /    63 tokens (  191.47 ms per token,     5.22 tokens per second)
llama_perf_context_print:        eval time =    1468.92 ms /     2 runs   (  734.46 ms per token,     1.36 tokens per second)
llama_perf_context_print:       total time =   13539.30 ms /    65 tokens
llama_perf_context_print:    graphs reused =          1
Llama.generate: 11 prefix-match hit, remaining 172 prompt tokens to eval


  Based on the context provided, Apple's quarterly revenue for Q1 2024 was $123.9 billion.

llama_perf_context_print:        load time =   48669.46 ms
llama_perf_context_print: prompt eval time =   30715.00 ms /   172 tokens (  178.58 ms per token,     5.60 tokens per second)
llama_perf_context_print:        eval time =   18038.60 ms /    31 runs   (  581.89 ms per token,     1.72 tokens per second)
llama_perf_context_print:       total time =   48816.73 ms /   203 tokens
llama_perf_context_print:    graphs reused =         29
Llama.generate: 14 prefix-match hit, remaining 67 prompt tokens to eval


Completed in 62.4s

--- Running Agentic RAG ---
  Sure! Here are 3 simpler sub-questions that can help retrieve relevant information about Apple's quarterly revenue:
1. What was Apple's total revenue for the most recent completed quarter?
2. How did Apple's quarterly revenue compare to the same quarter in the previous year?
3. Which product categories or segments of Apple's business generated the highest revenue during the most recent quarter?

llama_perf_context_print:        load time =   48669.46 ms
llama_perf_context_print: prompt eval time =   12780.28 ms /    67 tokens (  190.75 ms per token,     5.24 tokens per second)
llama_perf_context_print:        eval time =   52848.29 ms /    90 runs   (  587.20 ms per token,     1.70 tokens per second)
llama_perf_context_print:       total time =   65834.35 ms /   157 tokens
llama_perf_context_print:    graphs reused =         86
Llama.generate: 14 prefix-match hit, remaining 679 prompt tokens to eval


  Based on the information retrieved, Apple's quarterly revenue for Q1 2024 was $123.9 billion, driven by strong iPhone sales. This represents a record quarterly revenue for Apple and an increase of 31% compared to the same quarter in the previous year.
In terms of product categories or segments, iPhone sales generated the highest revenue during the most recent quarter, followed by Services and Mac. The company's net profit was $34.6 billion, an increase of 37% compared to the same quarter in the previous year.
Microsoft also had a strong quarter, with a new $50 billion stock buyback program and an increased dividend by 10%. CEO Satya Nadella praised the company's cloud growth, highlighting the continued success of its cloud computing segment.
Finally, Amazon Web Services (AWS) signed a $1 billion contract with a major enterprise client, further solidifying its position as a leader in the cloud computing market.
In summary, Apple and Microsoft had strong quarters, driven by their respe

llama_perf_context_print:        load time =   48669.46 ms
llama_perf_context_print: prompt eval time =  125177.57 ms /   679 tokens (  184.36 ms per token,     5.42 tokens per second)
llama_perf_context_print:        eval time =  162158.74 ms /   256 runs   (  633.43 ms per token,     1.58 tokens per second)
llama_perf_context_print:       total time =  287965.67 ms /   935 tokens
llama_perf_context_print:    graphs reused =        247


Completed in 353.93s


# **4. Display Results**

In [14]:
# Display the Results
print("\n" + "="*100)
print("COMPARATIVE RESULTS")
print("="*100)
print(f"Question: {test_question}")
print("="*100)

# Display in a clean table format
print(tabulate(results_df[['Architecture', 'Answer', 'Time (s)']], headers='keys', tablefmt='grid', showindex=False))

# Display detailed context and extra info
print("\n\n" + "="*100)
print("DETAILED CONTEXT AND EXTRA INFORMATION")
print("="*100)

for _, row in results_df.iterrows():
    print(f"\n--- {row['Architecture']} ---")
    print(f"Context: {row['Context Used']}")
    if row['Extra Info'] != "N/A":
        print(f"Extra Info: {row['Extra Info']}")
    print()


COMPARATIVE RESULTS
Question: What was Apple's quarterly revenue?
+----------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+
| Architecture   | Answer                                                                                                                                                                                                                                                                 |   Time (s) |
| Naive RAG      | Thank you for the question! Based on the context provided, Apple's quarterly revenue for Q1 2024 was $123.9 billion.                                                                                                                                                   |      70.83 |
+----------------+------------------------