![HyDE (Hypothetical Document Embedding) overview diagram](HyDe.svg)

![Advantages of HyDE diagram](hyde-advantages.svg)

In [1]:
import os
import sys
from dotenv import load_dotenv
from typing import List, Dict, Any, Optional, Tuple
import json

load_dotenv()

True

In [2]:
# Switch the process working directory to the project folder and echo it to
# confirm the change took effect.
# NOTE(review): absolute, machine-specific Windows path — presumably needed so
# the local `helper_function_openai` module can be imported; adjust per machine.
os.chdir(r"C:\Users\TempAccess\Documents\Dhruv\RAG")
print(os.getcwd())

C:\Users\TempAccess\Documents\Dhruv\RAG


In [3]:
from helper_function_openai import (
    Document,
    RetrievalResult,
    OpenAIEmbedder,
    FAISSVectorStore,
    OpenAIChat,
    read_pdf,
    chunk_text,
    show_context,
)

print("Helpers imported")

Helpers imported


# HyDE Retriever Class

```
__init__:
  1. read_pdf() → raw text
  2. chunk_text() → chunks
  3. OpenAIEmbedder.embed_documents() → embeddings
  4. FAISSVectorStore.add_documents() → indexed

retrieve(query):
  1. LLM generates hypothetical document from query
  2. Embed the hypothetical document (not the query!)
  3. Search FAISS with that embedding
  4. Return matched docs + hypothetical doc
```

In [12]:
class HyDERetriever:
    """
    Hypothetical Document Embedding (HyDE) retriever.

    Instead of embedding the query directly, an LLM first writes a hypothetical
    answer document, and the embedding of THAT document is used to search the
    vector store.

    The index is built eagerly in ``__init__``: the PDF is read, chunked,
    embedded and added to a FAISS vector store, so constructing an instance
    performs file I/O and embedding calls.
    """

    def __init__(
        self,
        file_path: str,
        chunk_size: int = 2000,
        chunk_overlap: int = 300,
        embedding_model: str = "text-embedding-3-small",
        llm_model: str = "gpt-4o-mini",
        llm_temperature: float = 0.0,
        llm_max_tokens: int = 5000,
    ):
        """
        Configure the LLM and embedder, then index the PDF at ``file_path``.

        Args:
            file_path: Path of the PDF to index.
            chunk_size: Chunk size passed to ``chunk_text``; also the target
                length (in characters) requested for the hypothetical document.
            chunk_overlap: Overlap passed to ``chunk_text``.
            embedding_model: Model name handed to ``OpenAIEmbedder``.
            llm_model: Model name handed to ``OpenAIChat``.
            llm_temperature: Sampling temperature for the hypothetical-document
                LLM (defaults to the previously hard-coded 0.0).
            llm_max_tokens: Completion token limit for that LLM (defaults to
                the previously hard-coded 5000). Raise it when ``chunk_size``
                is large enough that the requested document may not fit.

        Raises:
            ValueError: If ``chunk_size`` is not positive, if ``chunk_overlap``
                is negative or not smaller than ``chunk_size``, or if no text
                chunks can be produced from the PDF.
        """
        # Validate before any client is created or any API call is made.
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        # NOTE(review): assumes chunk_text slides a window of chunk_size that
        # shares chunk_overlap characters with its neighbour — confirm in helper.
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                "chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size, "
                f"got chunk_overlap={chunk_overlap}, chunk_size={chunk_size}"
            )

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        self.llm = OpenAIChat(
            model_name=llm_model,
            temperature=llm_temperature,
            max_tokens=llm_max_tokens,
        )

        self.embedder = OpenAIEmbedder(model=embedding_model)

        self.vector_store = self._build_index(file_path)

    def _build_index(self, file_path: str) -> "FAISSVectorStore":
        """
        Read the PDF, chunk it, embed the chunks and index them into FAISS.

        Each chunk becomes a ``Document`` whose metadata records the source
        path (``"source"``) and its position in the chunk list (``"chunk_id"``).

        Args:
            file_path: Path of the PDF to index.

        Returns:
            The populated ``FAISSVectorStore``.

        Raises:
            ValueError: If chunking the PDF text yields no chunks.
        """
        text = read_pdf(file_path=file_path)
        chunks = chunk_text(
            text=text,
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
        )

        # Fail with a clear message instead of building an index nothing can
        # ever be retrieved from (e.g. a scanned or empty PDF).
        if not chunks:
            raise ValueError(f"No text chunks could be extracted from {file_path!r}")

        documents = [
            Document(
                content=chunk,
                metadata={
                    "source": file_path,
                    "chunk_id": i,
                },
            )
            for i, chunk in enumerate(chunks)
        ]

        # The embedder returns the documents to index (presumably with their
        # embeddings attached — confirm against the helper).
        documents = self.embedder.embed_documents(documents=documents)

        vector_store = FAISSVectorStore(dimension=self.embedder.dimension)
        vector_store.add_documents(documents=documents)
        return vector_store

    def generate_hypothetical_answer(self, query: str) -> str:
        """
        Ask the LLM for a hypothetical document that answers ``query``.

        The prompt requests a document of ``self.chunk_size`` characters so the
        hypothetical text has roughly the size of the indexed chunks.

        Args:
            query: The user question.

        Returns:
            The text returned by the LLM.

        Raises:
            ValueError: If ``query`` is empty or only whitespace.
        """
        # A blank question cannot produce a meaningful document; reject it
        # before spending an LLM call on it.
        if not query or not query.strip():
            raise ValueError("query must be a non-empty string")

        messages = [
            {
                "role": "system",
                "content": (
                    "You are an expert at generating detailed, in-depth documents "
                    "that directly answer questions. Generate a document that would "
                    "be found in a knowledge base as the perfect answer."
                ),
            },
            {
                "role": "user",
                "content": (
                    f"Given the question '{query}', generate a hypothetical document "
                    f"that directly answers this question. The document should be "
                    f"detailed and in-depth. The document size should be exactly "
                    f"{self.chunk_size} characters."
                ),
            },
        ]

        return self.llm.chat(messages=messages)

    def retrieve(self, query: str, k: int = 3) -> "Tuple[List[RetrievalResult], str]":
        """
        Retrieve documents using the HyDE technique.

        Steps:
            1. Generate a hypothetical document from the query.
            2. Embed the hypothetical document (NOT the original query).
            3. Search FAISS with that embedding.

        Args:
            query: The user question.
            k: Number of results requested from the vector store.

        Returns:
            A ``(results, hypothetical_doc)`` tuple: the search results and the
            generated document whose embedding was used for the search.

        Raises:
            ValueError: If ``k`` is smaller than 1 or ``query`` is blank.
        """
        # Check k first so an invalid request never triggers the LLM call.
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")

        hypothetical_doc = self.generate_hypothetical_answer(query=query)
        hyde_embedding = self.embedder.embed_text(hypothetical_doc)
        results = self.vector_store.search(hyde_embedding, k=k)

        return results, hypothetical_doc

# Create Retriever Instance

In [13]:
# Absolute location of the PDF that is passed to the retriever constructor.
# NOTE(review): machine-specific Windows path — adjust when running elsewhere.
path = r"C:\Users\TempAccess\Documents\Dhruv\RAG\data\Understanding_Climate_Change.pdf"
# Bare expression: the notebook echoes the value as the cell output.
path

'C:\\Users\\TempAccess\\Documents\\Dhruv\\RAG\\data\\Understanding_Climate_Change.pdf'

In [14]:
# Build the retriever with its default settings; construction reads, chunks,
# embeds and indexes the PDF, so this cell performs the embedding calls.
retriever = HyDERetriever(path)

# Demonstrate on a Use Case

In [15]:
# Question used to demonstrate HyDE retrieval.
test_query = "What is the main cause of climate change??"

# retrieve() returns the search results together with the generated
# hypothetical document whose embedding was used for the search.
results, hypothetical_docs = retriever.retrieve(query=test_query, k=3)

# Dump the raw result objects, then the generated hypothetical document.
print(results)
print(hypothetical_docs)

[RetrievalResult(document=Document(content='Understanding Climate Change \nChapter 1: Introduction to Climate Change \nClimate change refers to significant, long-term changes in the global climate. The term \n"global climate" encompasses the planet\'s overall weather patterns, including temperature, \nprecipitation, and wind patterns, over an extended period. Over the past century, human \nactivities, particularly the burning of fossil fuels and deforestation, have significantly \ncontributed to climate change. \nHistorical Context \nThe Earth\'s climate has changed throughout history. Over the past 650,000 years, there have \nbeen seven cycles of glacial advance and retreat, with the abrupt end of the last ice age about \n11,700 years ago marking the beginning of the modern climate era and human civilization. \nMost of these climate changes are attributed to very small variations in Earth\'s orbit that \nchange the amount of solar energy our planet receives. During the Holocene epoch,

In [20]:
# Text of the first returned chunk (echoed as the cell output).
results[0].document.content

'Understanding Climate Change \nChapter 1: Introduction to Climate Change \nClimate change refers to significant, long-term changes in the global climate. The term \n"global climate" encompasses the planet\'s overall weather patterns, including temperature, \nprecipitation, and wind patterns, over an extended period. Over the past century, human \nactivities, particularly the burning of fossil fuels and deforestation, have significantly \ncontributed to climate change. \nHistorical Context \nThe Earth\'s climate has changed throughout history. Over the past 650,000 years, there have \nbeen seven cycles of glacial advance and retreat, with the abrupt end of the last ice age about \n11,700 years ago marking the beginning of the modern climate era and human civilization. \nMost of these climate changes are attributed to very small variations in Earth\'s orbit that \nchange the amount of solar energy our planet receives. During the Holocene epoch, which \nbegan at the end of the last ice a

In [21]:
# Text of the second returned chunk (echoed as the cell output).
results[1].document.content

'mate Change \nGreenhouse Gases \nThe primary cause of recent climate change is the increase in greenhouse gases in the \natmosphere. Greenhouse gases, such as carbon dioxide (CO2), methane (CH4), and nitrous \noxide (N2O), trap heat from the sun, creating a "greenhouse effect." This effect is essential for life on Earth, as it keeps the planet warm enough to support life. However, human \nactivities have intensified this natural process, leading to a warmer climate. \nFossil Fuels \nBurning fossil fuels for energy releases large amounts of CO2. This includes coal, oil, and \nnatural gas used for electricity, heating, and transportation. The industrial revolution marked \nthe beginning of a significant increase in fossil fuel consumption, which continues to rise \ntoday. \nCoal \nCoal is the most carbon-intensive fossil fuel, and its use for electricity generation is a major \nsource of CO2 emissions. Despite a decline in some regions, coal remains a significant \nenergy source globall

In [22]:
# Text of the third returned chunk (echoed as the cell output).
results[2].document.content

"arbon sinks, absorbing CO2 from the atmosphere. When trees are cut down \nfor timber or to clear land for agriculture, this stored carbon is released back into the \natmosphere. Deforestation reduces the number of trees that can absorb CO2, exacerbating the \ngreenhouse effect. \nTropical Deforestation Tropical rainforests are particularly important for carbon storage. Deforestation in the \nAmazon, Congo Basin, and Southeast Asia has significant impacts on global carbon cycles \nand biodiversity. These regions are often cleared for agriculture, logging, and mining, leading \nto habitat loss and species extinction. \nBoreal Forests \nBoreal forests, found in the northern regions of North America, Europe, and Asia, also play a \ncrucial role in sequestering carbon. Logging and land-use changes in these regions contribute \nto climate change. These forests are vital for regulating the Earth's climate and supporting \nindigenous communities and wildlife. \nAgriculture \nAgriculture contr

# Compare: HyDE vs Standard Retrieval

In [29]:
def compare_hyde_vs_standard_retrieval(retriever:HyDERetriever, query:str, k:int=3):
    """
    Print a side-by-side report of HyDE retrieval and plain query retrieval.

    The query is first run through ``retriever.retrieve`` (hypothetical
    document embedding), then embedded directly and searched against the same
    vector store. For each path the score and the first 120 characters of
    every hit are printed, followed by the ``chunk_id`` sets of both paths and
    their intersection.

    Args:
        retriever: An already-indexed ``HyDERetriever``.
        query: The question to run through both retrieval paths.
        k: Number of results requested from each path.

    Returns:
        None. The report is written to stdout.
    """

    def _print_hits(hits):
        # One line per hit: similarity score plus a 120-character preview.
        for hit in hits:
            print(f"  [Score: {hit.score:.4f}] {hit.document.content[:120]}...")

    def _chunk_ids(hits):
        # Identify hits by the chunk_id stored in their document metadata.
        return {hit.document.metadata.get('chunk_id') for hit in hits}

    # HyDE path: search with the embedding of an LLM-written document.
    hyde_hits, hyde_doc = retriever.retrieve(query, k=k)

    # Standard path: search with the embedding of the raw query.
    direct_embedding = retriever.embedder.embed_text(query)
    direct_hits = retriever.vector_store.search(
        query_embeddings=direct_embedding,
        k=k,
    )

    print(f"Query: {query}")
    print("\n" + "=" * 80)

    print("\n--- HyDE Retrieval (via hypothetical doc) ---")
    print(f"Hypothetical doc: {hyde_doc[:150]}...\n")
    _print_hits(hyde_hits)

    print("\n--- Standard Retrieval (direct query embedding) ---")
    _print_hits(direct_hits)

    # How many chunks did both paths agree on?
    hyde_ids = _chunk_ids(hyde_hits)
    direct_ids = _chunk_ids(direct_hits)
    shared_ids = hyde_ids.intersection(direct_ids)

    print("\n--- Overlap ---")
    print(f"  HyDE chunks:     {sorted(hyde_ids)}")
    print(f"  Standard chunks: {sorted(direct_ids)}")
    print(f"  Shared:          {sorted(shared_ids)} ({len(shared_ids)}/{k})")

In [30]:
# Compare both retrieval paths on a broad "main cause" question (default k=3).
compare_hyde_vs_standard_retrieval(retriever, "What is the main cause of climate change?")

Query: What is the main cause of climate change?


--- HyDE Retrieval (via hypothetical doc) ---
Hypothetical doc: **Title: Understanding the Main Cause of Climate Change**

**Introduction**

Climate change refers to significant alterations in global temperatures a...

  [Score: 0.7375] Understanding Climate Change 
Chapter 1: Introduction to Climate Change 
Climate change refers to significant, long-term...
  [Score: 0.6714] mate Change 
Greenhouse Gases 
The primary cause of recent climate change is the increase in greenhouse gases in the 
at...
  [Score: 0.6573] arbon sinks, absorbing CO2 from the atmosphere. When trees are cut down 
for timber or to clear land for agriculture, th...

--- Standard Retrieval (direct query embedding) ---
  [Score: 0.6494] Understanding Climate Change 
Chapter 1: Introduction to Climate Change 
Climate change refers to significant, long-term...
  [Score: 0.5405] mate Change 
Greenhouse Gases 
The primary cause of recent climate change is the increase i

In [32]:
# Compare both retrieval paths on a deforestation question (default k=3).
compare_hyde_vs_standard_retrieval(retriever, "How does deforestation affect global warming?")

Query: How does deforestation affect global warming?


--- HyDE Retrieval (via hypothetical doc) ---
Hypothetical doc: **Title: The Impact of Deforestation on Global Warming**

**Introduction**

Deforestation, the large-scale removal of forests, significantly contribut...

  [Score: 0.6409] arbon sinks, absorbing CO2 from the atmosphere. When trees are cut down 
for timber or to clear land for agriculture, th...
  [Score: 0.5962] s using energy-efficient appliances, improving insulation, and 
developing more fuel-efficient vehicles. 
Building Effic...
  [Score: 0.5855] mate Change 
Greenhouse Gases 
The primary cause of recent climate change is the increase in greenhouse gases in the 
at...

--- Standard Retrieval (direct query embedding) ---
  [Score: 0.5818] arbon sinks, absorbing CO2 from the atmosphere. When trees are cut down 
for timber or to clear land for agriculture, th...
  [Score: 0.5362] mate Change 
Greenhouse Gases 
The primary cause of recent climate change is the increa

In [33]:
# Compare both retrieval paths on a sea-level economics question (default k=3).
compare_hyde_vs_standard_retrieval(retriever, "What are the economic impacts of rising sea levels?")

Query: What are the economic impacts of rising sea levels?


--- HyDE Retrieval (via hypothetical doc) ---
Hypothetical doc: **Title: Economic Impacts of Rising Sea Levels**

**Introduction**
Rising sea levels, primarily driven by climate change and the melting of polar ice ...

  [Score: 0.6461] Erosion 
Rising sea levels and increased storm surges are accelerating coastal erosion, threatening 
homes, infrastructu...
  [Score: 0.6111] can mitigate these emissions. The 
development of eco-friendly fertilizers and farming techniques is essential for reduc...
  [Score: 0.5625] ls, with efforts to limit the increase to 1.5 degrees Celsius. 
Countries submit nationally determined contributions (ND...

--- Standard Retrieval (direct query embedding) ---
  [Score: 0.5213] Erosion 
Rising sea levels and increased storm surges are accelerating coastal erosion, threatening 
homes, infrastructu...
  [Score: 0.4723] can mitigate these emissions. The 
development of eco-friendly fertilizers and fa