# Semantic Search & RAG with LlamaIndex
## ABB #5 - Session 3

Code authored by: Shaw Talebi

### imports

In [None]:
from IPython.display import display, Markdown
from bs4 import BeautifulSoup

from llama_index.core import VectorStoreIndex, get_response_synthesizer, Settings
from llama_index.core.schema import TextNode
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from typing import List, Dict, Any

In [None]:
from dotenv import load_dotenv
import os

# Load the variables defined in the local .env file into the environment,
# then read the OpenAI secret key (None when the variable is not set).
load_dotenv()
my_sk = os.environ.get("OPENAI_API_KEY")

### 1) chunk articles

In [None]:
# Collect every entry of the local "articles" directory. os.listdir() returns
# names in arbitrary order, so sort them: node ids are later derived from each
# chunk's position in chunk_list, and sorting keeps them stable between runs.
filename_list = ["articles/" + f for f in sorted(os.listdir("articles"))]

chunk_list = []
for filename in filename_list:
    # only process .html files
    if not filename.lower().endswith(".html"):
        continue

    # read html file
    with open(filename, "r", encoding="utf-8") as file:
        html_content = file.read()

    # Parse HTML
    soup = BeautifulSoup(html_content, "html.parser")

    # Get article title (single lookup; fall back when there is no <title>)
    title_tag = soup.find("title")
    article_title = title_tag.get_text().strip() if title_tag is not None else "Untitled"

    # Initialize variables
    article_content = []
    current_section = "Main"  # Default section if no headers found

    # Find all headers and text content, in document order
    content_elements = soup.find_all(["h1", "h2", "h3", "p", "ul", "ol"])

    # iterate through elements and extract text with metadata
    for element in content_elements:
        # A heading starts a new section for the chunks that follow it
        if element.name in ("h1", "h2", "h3"):
            current_section = element.get_text().strip()
            continue

        # A <p>/<ul>/<ol> nested inside a list is already contained in the
        # outer list's get_text(); skip it so the same text is not chunked twice.
        if element.find_parent(["ul", "ol"]) is not None:
            continue

        text = element.get_text().strip()
        # Only add content that's at least 30 characters long
        if len(text) >= 30:
            article_content.append({
                "article_title": article_title,
                "section": current_section,
                "text": text,
            })

    # add article content to list
    chunk_list.extend(article_content)

In [None]:
# Wrap each chunk in a LlamaIndex TextNode. The node id is the chunk's
# position in chunk_list; article title and section are kept as metadata.
node_list = [
    TextNode(
        id_=str(position),
        text=entry["text"],
        metadata={
            "article": entry["article_title"],
            "section": entry["section"],
        },
    )
    for position, entry in enumerate(chunk_list)
]

print(len(node_list))

### 2) create index

In [None]:
# Build a vector index over the nodes.
index = VectorStoreIndex(node_list)

# Report the embedding model name, the number of stored embeddings, and the
# length of one embedding. Node ids are "0", "1", ... so "0" is the first node.
# The inner key uses single quotes: reusing the outer quote character inside
# an f-string is a SyntaxError on Python versions before 3.12.
print(f"Embedding Model: {index._embed_model.model_name}")
print(f"Index Size: {len(index.vector_store.data.embedding_dict)}")
print(f"Embedding Size: {len(index.vector_store.data.embedding_dict['0'])}")

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Switch the global embedding model to the Hugging Face BAAI/bge-small-en-v1.5 model
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_ollama import OllamaLLM
from typing import Callable

def create_hyde_prompt(question: str) -> str:
    """
    Build the HyDE prompt for a question.

    The question is substituted into a fixed instruction template (through
    LangChain's PromptTemplate) that asks the model to write a hypothetical
    answer. Returns the formatted prompt string.
    """
    hyde_template = PromptTemplate(
        input_variables=["question"],
        template=(
            "You are given a user question. Carefully analyze its intent and semantic meaning. "
            "Generate a detailed, plausible answer that directly addresses the question, "
            "using relevant terminology and context. This hypothetical answer should be as informative and specific as possible, "
            "to maximize the chance of retrieving documents that truly match the user's information need.\n"
            "Question: {question}\n"
            "Hypothetical Answer:"
        ),
    )
    return hyde_template.format(question=question)

def get_ollama_llm(model: str = "llama3.2:latest") -> Callable[[str], str]:
    """
    Build a prompt -> response callable backed by an OllamaLLM.

    One OllamaLLM is created for the given model name and captured by the
    returned function, which forwards its prompt to the LLM's .invoke().
    """
    ollama_client = OllamaLLM(model=model)

    def run_prompt(prompt: str) -> str:
        # .invoke() is the current call style; the older one is deprecated
        return ollama_client.invoke(prompt)

    return run_prompt

def generate_hypothetical_document(question: str, model: str = "llama3.2:latest") -> str:
    """
    Generate a hypothetical answer document for the question with an Ollama model.

    Builds the HyDE prompt for the question, creates an LLM callable for the
    requested model, and returns the model's response to that prompt.
    """
    hyde_prompt = create_hyde_prompt(question)
    return get_ollama_llm(model)(hyde_prompt)

In [None]:
# Build the index again from the same nodes (this cell follows the change of
# Settings.embed_model, so presumably the nodes are re-embedded with it).
index = VectorStoreIndex(node_list)

# Report the embedding model name, the number of stored embeddings, and the
# length of one embedding. Node ids are "0", "1", ... so "0" is the first node.
# The inner key uses single quotes: reusing the outer quote character inside
# an f-string is a SyntaxError on Python versions before 3.12.
print(f"Embedding Model: {index._embed_model.model_name}")
print(f"Index Size: {len(index.vector_store.data.embedding_dict)}")
print(f"Embedding Size: {len(index.vector_store.data.embedding_dict['0'])}")

### 3) semantic search

In [None]:
# Retriever over the index, configured with similarity_top_k=10
retriever = VectorIndexRetriever(index=index, similarity_top_k=10)

In [None]:
def retrieve_with_hyde(
    query: str,
    retriever,
    hyde_llm: str = "llama3.2:latest"
) -> List[Any]:
    """
    Retrieve documents for a query using HyDE.

    A hypothetical answer document is first generated for the query with the
    given Ollama model; that document, rather than the raw query, is then
    passed to retriever.retrieve(). Returns whatever the retriever returns.
    """
    # The generated answer stands in for the user's question at search time
    hyde_document = generate_hypothetical_document(query, model=hyde_llm)
    return retriever.retrieve(hyde_document)

In [None]:
# Baseline: query the retriever directly with the raw question (no HyDE)
results = retriever.retrieve("When do I perform fine-tuning?")

In [None]:
def display_retrieved_results(results) -> None:
    """
    Render retrieval results as a numbered Markdown list in the notebook.

    Each item in ``results`` must expose ``metadata`` (with "article" and
    "section" keys), ``text`` and ``score``. The first result is also printed
    in raw form. An empty ``results`` shows a short notice instead of raising
    IndexError.
    """
    if not results:
        display(Markdown("_No results retrieved._"))
        return

    print(results[0])

    # One Markdown entry per result, joined once instead of repeated +=.
    # Single quotes are used inside the f-strings because reusing the outer
    # quote character is a SyntaxError on Python versions before 3.12.
    entries = [
        f"{i}. **Article title:** {result.metadata['article']}  \n"
        f"   **Section:** {result.metadata['section']}  \n"
        f"   **Snippet:** {result.text} \n\n"
        f"   **Score:** {result.score} \n\n"
        for i, result in enumerate(results, start=1)
    ]
    display(Markdown("".join(entries)))

In [None]:
# Show the baseline (raw-question) retrieval results
display_retrieved_results(results=results)

#### With HyDE (hypothetical document) augmentation

In [None]:
# Same question, but retrieved through a generated hypothetical answer
results = retrieve_with_hyde(
    query="When do I perform fine-tuning?",
    retriever=retriever,
)
display_retrieved_results(results=results)

### 4) RAG

In [None]:
# configure response synthesizer
# Called with no arguments, so the library's default synthesizer settings apply.
response_synthesizer = get_response_synthesizer()

In [None]:
# Build the RAG query engine by hand from the retriever and the synthesizer,
# with a SimilarityPostprocessor (cutoff 0.7) applied to the retrieved nodes.
query_engine = RetrieverQueryEngine(
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
    response_synthesizer=response_synthesizer,
    retriever=retriever,
)

In [None]:
# assemble query engine with hyde
from typing import Any, List
from llama_index.core.retrievers import BaseRetriever
from llama_index.core.schema import QueryBundle, NodeWithScore

class HydeRetriever(BaseRetriever):
    """
    Retriever adapter that routes every query through a retrieval function.

    The wrapped function is called as ``retriever_func(query_str, base_retriever)``
    and is expected to return the retrieved nodes (List[NodeWithScore]).
    """

    def __init__(self, retriever_func, base_retriever):
        """
        Args:
            retriever_func: callable taking (query string, retriever) and
                returning the retrieved nodes.
            base_retriever: retriever handed to ``retriever_func`` on each call.
        """
        self._retriever_func = retriever_func
        self._base_retriever = base_retriever
        # Run the base-class initialiser so the state BaseRetriever sets up
        # for itself exists on this instance; the original skipped this call.
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle, **kwargs: Any) -> List[NodeWithScore]:
        """Retrieve nodes for the bundle's query string via the wrapped function."""
        # delegate to the provided function, which should return List[NodeWithScore]
        return self._retriever_func(query_bundle.query_str, self._base_retriever)

# Adapt retrieve_with_hyde to the retriever interface, then build a second
# query engine around it with the same synthesizer and similarity cutoff.
hyde_retriever = HydeRetriever(
    retriever_func=retrieve_with_hyde,
    base_retriever=retriever,
)

query_engine_hyde = RetrieverQueryEngine(
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
    response_synthesizer=response_synthesizer,
    retriever=hyde_retriever,
)

# Ask the question through the HyDE-backed engine and show the answer
response = query_engine_hyde.query("When do I perform fine-tuning?")
print(response)

In [None]:
# Same question through the plain (non-HyDE) query engine, for comparison
response = query_engine.query("When do I perform fine-tuning?")
print(response)

In [None]:
# Print the model name of the globally configured LLM (Settings.llm)
print(f"LLM: {Settings.llm.model}")

In [None]:
from llama_index.llms.openai import OpenAI

# changing the global LLM
# NOTE(review): no API key is passed here; presumably OpenAI reads
# OPENAI_API_KEY from the environment populated by load_dotenv() - confirm.
Settings.llm = OpenAI("gpt-4o")

In [None]:
# simpler way to make query engine: let the index build one itself.
# This rebinds query_engine, replacing the manually assembled engine. No
# retriever, synthesizer or postprocessors are passed, so whatever
# as_query_engine() does with no arguments applies.
query_engine = index.as_query_engine()
response = query_engine.query("When do I perform fine-tuning?")
print(response)

In [None]:
# Print the global LLM's model name again (Settings.llm was reassigned to
# OpenAI "gpt-4o" earlier in the notebook)
print(f"LLM: {Settings.llm.model}")