Hypothetical Prompt Embeddings (HyPE)

Key components:

*   PDF processing and text extraction

*   Text chunking to maintain coherent information units

*   Hypothetical Prompt Embedding generation, using an LLM to create multiple proxy questions per chunk

*   Vector store creation using FAISS and Gemini embeddings (this notebook uses Google Gemini embeddings in place of OpenAI embeddings)

*   Retriever setup for querying the processed documents


*   Evaluation of the RAG system


In [None]:
# concurrent.futures is part of the Python 3 standard library, so the Python 2
# `futures` backport is not installed.
!pip install faiss-cpu langchain-community tqdm langchain_google_genai

Installing collected packages: futures, faiss-cpu
Successfully installed faiss-cpu-1.11.0 futures-3.0.5


In [None]:
import os
import sys
import faiss
from tqdm import tqdm
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed
from langchain_community.docstore.in_memory import InMemoryDocstore
from google.colab import userdata
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from typing import List
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.vectorstores import FAISS
# Gemini API key, read through google.colab's userdata helper.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
# Location of the PDF that gets indexed.
PATH = '/content/GYM.pdf'
# Splitter settings handed to encode_pdf; sizes are measured in characters
# because the splitter is configured with length_function=len.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 50
def generate_hypothetical_embeddings(chunk_text : str):
    """Generate proxy questions for one chunk and embed every question.

    Args:
        chunk_text: The chunk to process. Either a plain string or an object
            exposing ``page_content`` (``prepare_vectorstore`` submits the
            latter) is accepted.

    Returns:
        A ``(chunk_text, vectors)`` tuple: the argument exactly as it was
        received, and a list holding one embedding vector per generated
        question (empty when the model produced no questions).
    """
    model = ChatGoogleGenerativeAI(model="gemini-1.5-flash",google_api_key=GOOGLE_API_KEY,
                                   temperature=0,convert_system_message_to_human=True)
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001",google_api_key=GOOGLE_API_KEY)
    template = """Analyze the input text and generate essential questions that, when answered, \
        capture the main points of the text. Each question should be one line, \
        without numbering or prefixes.\n\n \
        Text:\n{chunk_text}\n\nQuestions:\n """
    question_gen_prompt = ChatPromptTemplate.from_template(template)
    question_chain = question_gen_prompt | model | StrOutputParser()

    # Fill the prompt with the chunk's text rather than the repr of a Document.
    prompt_text = getattr(chunk_text, "page_content", chunk_text)
    raw_questions = question_chain.invoke({"chunk_text": prompt_text})

    # The model answers with one question per line. embed_documents expects a
    # list of texts, so split the answer instead of passing the whole string.
    questions = [line.strip() for line in raw_questions.splitlines() if line.strip()]
    if not questions:
        return chunk_text, []
    return chunk_text, embeddings.embed_documents(questions)

def prepare_vectorstore(chunks: "List[Document]"):
    """Build a FAISS store that indexes each chunk under its question embeddings.

    Every chunk is sent to ``generate_hypothetical_embeddings`` on a thread
    pool. The chunk's text is then inserted once per returned vector, so a
    query can match any of the questions generated for that chunk.

    Args:
        chunks: Chunk objects exposing ``page_content``.

    Returns:
        The populated FAISS vector store, or ``None`` when no chunk produced
        any embedding.
    """
    vector_store = None

    with ThreadPoolExecutor() as pool:
        futures = [pool.submit(generate_hypothetical_embeddings, c) for c in chunks]

        # Consume results in completion order so the progress bar advances
        # as soon as any worker finishes.
        for f in tqdm(as_completed(futures), total=len(futures)):
            chunk, vectors = f.result()

            # Nothing to index for this chunk; also keeps vectors[0] below safe.
            if not vectors:
                continue

            # The index dimension is only known once the first vector arrives.
            if vector_store is None:
                vector_store = FAISS(
                    embedding_function=GoogleGenerativeAIEmbeddings(model="models/embedding-001",google_api_key=GOOGLE_API_KEY),
                    index=faiss.IndexFlatL2(len(vectors[0])),
                    docstore=InMemoryDocstore(),
                    index_to_docstore_id={}
                )

            # One (text, vector) pair per generated question.
            vector_store.add_embeddings([(chunk.page_content, vec) for vec in vectors])

    return vector_store

def encode_pdf(path, chunk_size=1000, chunk_overlap=200):
    """
    Encode a PDF into a FAISS vector store of hypothetical-question embeddings.

    Args:
        path: The path to the PDF file.
        chunk_size: The desired size of each text chunk, in characters.
        chunk_overlap: The number of characters shared by consecutive chunks.

    Returns:
        The vector store produced by ``prepare_vectorstore`` for the PDF's chunks.
    """
    # Load the PDF.
    documents = PyPDFLoader(path).load()

    # Split the loaded documents into overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
    )
    texts = text_splitter.split_documents(documents)

    # Replace tab characters (the escape '\t', not the letter 't') with spaces
    # and index the cleaned chunks.
    cleaned_texts = [
        Document(page_content=chunk.page_content.replace('\t', ' '), metadata=chunk.metadata)
        for chunk in texts
    ]

    return prepare_vectorstore(cleaned_texts)
# Build the HyPE index for the PDF and expose it as a top-3 retriever.
chunks_vector_store = encode_pdf(PATH, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks_query_retriever = chunks_vector_store.as_retriever(search_kwargs={"k": 3})

test_query = "What are the exercises on push day"
# Use the Runnable entry point (`invoke`) rather than the deprecated
# get_relevant_documents; evaluate_rag queries the retriever the same way.
context = chunks_query_retriever.invoke(test_query)

# A chunk is stored once per generated question, so the hits can repeat the
# same text; keep only the first occurrence of each chunk.
unique_context = []
seen = set()
for doc in context:
    if doc.page_content not in seen:
        seen.add(doc.page_content)
        unique_context.append(doc)


100%|██████████| 3/3 [00:03<00:00,  1.23s/it]


[Document(id='c2b7f076-bad3-4485-ab8b-5508e45dd48b', metadata={}, page_content='• Russian Twists – 3x30\n• Mountain Climbers – 3x30 sec\n6 Day 5 - Pull (Back + Arms Burnout)\n• Lat Pulldown – 4x12\n• Dumbbell Shrugs – 3x15\n• Dumbbell Preacher Curl (if bench available) – 3x12\n• Seated Rows or T-Bar Rows – 3x12\n• Rope Hammer Curl (if cable available) – 3x15\n7 Day 6 - Push (Chest & Arms Emphasis)\n• Flat Dumbbell Press – 4x10\n• Dumbbell Lateral Raise – 3x15\n• Dumbbell Skull Crushers – 3x12\n• Pushups – 3xAMRAP\n• Chest Dips (if possible) or Incline Pushups – 3x10\n8 Day 7 - Core & Abs\n• Hanging Leg Raises – 3x15\n• Bicycle Crunches – 3x30\n• Decline Situps or Weighted Crunches – 3x15\n• Ab Rollouts (if available) – 3x12\n• Plank Variations – 3 sets\n3')]


In [None]:
# Show at most the first 500 characters of every de-duplicated hit.
for i, hit in enumerate(unique_context):
    preview = hit.page_content[:500]
    print(f"\n--- Document {i+1} ---")
    print(preview)


--- Document 1 ---
• Russian Twists – 3x30
• Mountain Climbers – 3x30 sec
6 Day 5 - Pull (Back + Arms Burnout)
• Lat Pulldown – 4x12
• Dumbbell Shrugs – 3x15
• Dumbbell Preacher Curl (if bench available) – 3x12
• Seated Rows or T-Bar Rows – 3x12
• Rope Hammer Curl (if cable available) – 3x15
7 Day 6 - Push (Chest & Arms Emphasis)
• Flat Dumbbell Press – 4x10
• Dumbbell Lateral Raise – 3x15
• Dumbbell Skull Crushers – 3x12
• Pushups – 3xAMRAP
• Chest Dips (if possible) or Incline Pushups – 3x10
8 Day 7 - Core & Abs


In [None]:
def evaluate_rag(retriever, questions_and_answers):
    """Print the retrieved context next to the expected answer for each question.

    Nothing is scored: the output is meant for manual inspection.

    Args:
        retriever: Object whose ``invoke(question)`` returns items exposing
            ``page_content``.
        questions_and_answers: Iterable of ``(question, expected_answer)`` pairs.

    Returns:
        None.
    """
    for question, expected_answer in questions_and_answers:
        retrieved = retriever.invoke(question)
        combined_context = " ".join(doc.page_content for doc in retrieved)
        print(f"\nQ: {question}\nContext:\n{combined_context}\nExpected Answer: {expected_answer}")
# Smoke-test the retriever with a single question; the expected answer is a placeholder.
evaluate_rag(chunks_query_retriever, [("What are the exercises on push day?", "Expected answer here")])



Q: What are the exercises on push day?
Context:
• Russian Twists – 3x30
• Mountain Climbers – 3x30 sec
6 Day 5 - Pull (Back + Arms Burnout)
• Lat Pulldown – 4x12
• Dumbbell Shrugs – 3x15
• Dumbbell Preacher Curl (if bench available) – 3x12
• Seated Rows or T-Bar Rows – 3x12
• Rope Hammer Curl (if cable available) – 3x15
7 Day 6 - Push (Chest & Arms Emphasis)
• Flat Dumbbell Press – 4x10
• Dumbbell Lateral Raise – 3x15
• Dumbbell Skull Crushers – 3x12
• Pushups – 3xAMRAP
• Chest Dips (if possible) or Incline Pushups – 3x10
8 Day 7 - Core & Abs
• Hanging Leg Raises – 3x15
• Bicycle Crunches – 3x30
• Decline Situps or Weighted Crunches – 3x15
• Ab Rollouts (if available) – 3x12
• Plank Variations – 3 sets
3 • Russian Twists – 3x30
• Mountain Climbers – 3x30 sec
6 Day 5 - Pull (Back + Arms Burnout)
• Lat Pulldown – 4x12
• Dumbbell Shrugs – 3x15
• Dumbbell Preacher Curl (if bench available) – 3x12
• Seated Rows or T-Bar Rows – 3x12
• Rope Hammer Curl (if cable available) – 3x15
7 Day 6 - P

Adaptive RAG:

This RAG variant changes how documents are retrieved from the corpus. Instead of passing the query verbatim to the retriever, we first classify the query into one of four categories and then run the retrieval strategy that matches that category:

*   Factual: The query is first enhanced by an LLM, and the more precise, enhanced query is then used to retrieve documents.

*   Analytical: The LLM breaks the query into sub-topics and documents are retrieved for each of them, which helps the LLM generate a more detailed, in-depth response.
*   Opinion: The LLM identifies different viewpoints on the query and documents are retrieved for each viewpoint, allowing the LLM to provide a complete and diverse response.


*   Contextual: The query is reformulated to incorporate context provided by the user, and this contextualized query is used for retrieval.



In [None]:
import os
import sys
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import PromptTemplate
from google.colab import userdata
from langchain_core.retrievers import BaseRetriever
from typing import Dict, Any,List
from langchain.docstore.document import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_core.pydantic_v1 import BaseModel, Field
# Gemini API key, read through google.colab's userdata helper.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
class categories_options(BaseModel):
    # Structured-output schema used by QueryClassifier: a single category label.
    category: str = Field(
        description="The category of the query, the options are: Factual, Analytical, Opinion, or Contextual",
        example="Factual",
    )


class QueryClassifier:
    """Labels a query as Factual, Analytical, Opinion or Contextual using Gemini."""

    def __init__(self):
        """Create the LLM client, the classification prompt and the structured-output chain."""
        template_text = "Classify the following query into one of these categories: Factual, Analytical, Opinion, or Contextual.\nQuery: {query}\nCategory:"
        self.llm = ChatGoogleGenerativeAI(
            temperature=0,
            model="gemini-1.5-flash",
            max_tokens=4000,
            google_api_key=GOOGLE_API_KEY,
        )
        self.prompt = PromptTemplate(input_variables=["query"], template=template_text)
        # The model's answer is parsed into a categories_options instance.
        structured_llm = self.llm.with_structured_output(categories_options)
        self.chain = self.prompt | structured_llm

    def classify(self, query):
        """Return the ``category`` attribute of the chain's result for ``query``."""
        print("clasiffying query")
        result = self.chain.invoke(query)
        return result.category
class BaseRetrievalStrategy:
    """Shared setup for the retrieval strategies: embeds the texts into FAISS and holds an LLM client."""

    def __init__(self, texts):
        """Split ``texts`` into documents, index them in FAISS and create the LLM client."""
        self.embeddings = GoogleGenerativeAIEmbeddings(
            model="models/embedding-001",
            google_api_key=GOOGLE_API_KEY,
        )
        splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=0)
        self.documents = splitter.create_documents(texts)
        self.db = FAISS.from_documents(self.documents, self.embeddings)
        self.llm = ChatGoogleGenerativeAI(
            temperature=0,
            model="gemini-1.5-flash",
            max_tokens=4000,
            google_api_key=GOOGLE_API_KEY,
        )

    def retrieve(self, query, k=4):
        """Return the ``k`` results of a plain similarity search for ``query``."""
        hits = self.db.similarity_search(query, k=k)
        return hits
class relevant_score(BaseModel):
    # Structured-output schema for the LLM's relevance rating of one document.
    score: float = Field(
        description="The relevance score of the document to the query",
        example=8.0,
    )

class FactualRetrievalStrategy(BaseRetrievalStrategy):
    """Retrieval for factual queries: rewrite the query, over-fetch, then re-rank with the LLM."""

    def retrieve(self, query, k=4):
        """Return up to ``k`` documents ranked by LLM-assigned relevance.

        Args:
            query: The user's question.
            k: Number of documents to return; ``2 * k`` candidates are fetched
                before ranking.

        Returns:
            The highest-scoring documents, best first.
        """
        print("retrieving factual")
        # Ask for the rewritten query only. Without that instruction the model
        # tends to answer with several commented alternatives, and that whole
        # essay would become the search and ranking query.
        enhanced_query_prompt = PromptTemplate(
            input_variables=["query"],
            template=(
                "Enhance this factual query for better information retrieval: {query}\n"
                "Return only the single enhanced query, with no explanation, options or formatting."
            ),
        )
        query_chain = enhanced_query_prompt | self.llm
        # Fall back to the user's wording if the model returns nothing usable.
        enhanced_query = query_chain.invoke(query).content.strip() or query
        print(f'enhanced query: {enhanced_query}')

        # Over-fetch so the ranking step has candidates to choose from.
        docs = self.db.similarity_search(enhanced_query, k=k*2)

        # Use the LLM to score each candidate against the enhanced query.
        ranking_prompt = PromptTemplate(
            input_variables=["query", "doc"],
            template="On a scale of 1-10, how relevant is this document to the query: '{query}'?\nDocument: {doc}\nRelevance score:"
        )
        ranking_chain = ranking_prompt | self.llm.with_structured_output(relevant_score)

        ranked_docs = []
        print("ranking docs")
        for doc in docs:
            input_data = {"query": enhanced_query, "doc": doc.page_content}
            score = float(ranking_chain.invoke(input_data).score)
            ranked_docs.append((doc, score))

        # Stable sort: equal scores keep their similarity-search order.
        ranked_docs.sort(key=lambda x: x[1], reverse=True)
        return [doc for doc, _ in ranked_docs[:k]]
class SelectedIndices(BaseModel):
    # Structured-output schema: positions of the documents the LLM selected.
    indices: List[int] = Field(
        description="Indices of selected documents",
        example=[0, 1, 2, 3],
    )

class SubQueries(BaseModel):
    # Structured-output schema: the sub-questions generated for an analytical query.
    sub_queries: List[str] = Field(
        description="List of sub-queries for comprehensive analysis",
        example=[
            "What is the population of New York?",
            "What is the GDP of New York?",
        ],
    )

class AnalyticalRetrievalStrategy(BaseRetrievalStrategy):
    """Retrieval for analytical queries: search per sub-question, then let the LLM pick a diverse set."""

    def retrieve(self, query, k=4):
        """Return the documents the LLM selects from the sub-question search results.

        Args:
            query: The user's question.
            k: Number of sub-questions to generate and number of documents
                the LLM is asked to select.

        Returns:
            The selected documents, in the order the LLM listed them. Indices
            that are out of range or repeated are ignored.
        """
        print("retrieving analytical")
        # Use the LLM to break the query into sub-questions.
        sub_queries_prompt = PromptTemplate(
            input_variables=["query", "k"],
            template="Generate {k} sub-questions for: {query}"
        )
        # self.llm is configured identically to a fresh client, so reuse it.
        sub_queries_chain = sub_queries_prompt | self.llm.with_structured_output(SubQueries)

        input_data = {"query": query, "k": k}
        sub_queries = sub_queries_chain.invoke(input_data).sub_queries
        print(f'sub queries for comprehensive analysis: {sub_queries}')

        # Collect candidates, dropping texts already found by an earlier
        # sub-question so the selection prompt does not list the same chunk twice.
        all_docs = []
        seen_texts = set()
        for sub_query in sub_queries:
            for doc in self.db.similarity_search(sub_query, k=2):
                if doc.page_content not in seen_texts:
                    seen_texts.add(doc.page_content)
                    all_docs.append(doc)

        if not all_docs:
            return []

        # Use the LLM to ensure diversity and relevance.
        diversity_prompt = PromptTemplate(
            input_variables=["query", "docs", "k"],
            template="""Select the most diverse and relevant set of {k} documents for the query: '{query}'\nDocuments: {docs}\n
            Return only the indices of selected documents as a list of integers."""
        )
        diversity_chain = diversity_prompt | self.llm.with_structured_output(SelectedIndices)
        docs_text = "\n".join([f"{i}: {doc.page_content[:50]}..." for i, doc in enumerate(all_docs)])
        input_data = {"query": query, "docs": docs_text, "k": k}
        selected_indices_result = diversity_chain.invoke(input_data).indices
        print('selected diverse and relevant documents')

        # Reject negative indices too: Python would resolve them from the end
        # of the list instead of failing.
        selected_docs = []
        used_indices = set()
        for i in selected_indices_result:
            if 0 <= i < len(all_docs) and i not in used_indices:
                used_indices.add(i)
                selected_docs.append(all_docs[i])
        return selected_docs
class OpinionRetrievalStrategy(BaseRetrievalStrategy):
    """Retrieval for opinion queries: search per viewpoint, then let the LLM pick representative documents."""

    def retrieve(self, query, k=3):
        """Return the documents the LLM selects as representative viewpoints.

        Args:
            query: The user's question.
            k: Number of viewpoints to ask for and number of documents the
                LLM is asked to select.

        Returns:
            The selected documents, in the order the LLM listed them. Indices
            that are out of range or repeated are ignored.
        """
        print("retrieving opinion")
        # Use the LLM to identify potential viewpoints.
        viewpoints_prompt = PromptTemplate(
            input_variables=["query", "k"],
            template="Identify {k} distinct viewpoints or perspectives on the topic: {query}"
        )
        viewpoints_chain = viewpoints_prompt | self.llm
        input_data = {"query": query, "k": k}
        raw_viewpoints = viewpoints_chain.invoke(input_data).content
        # One viewpoint per non-empty line; blank lines would only repeat the
        # bare query in the searches below.
        viewpoints = [line.strip() for line in raw_viewpoints.splitlines() if line.strip()]
        print(f'viewpoints: {viewpoints}')

        all_docs = []
        for viewpoint in viewpoints:
            all_docs.extend(self.db.similarity_search(f"{query} {viewpoint}", k=2))

        if not all_docs:
            return []

        # Use the LLM to classify and select diverse opinions.
        opinion_prompt = PromptTemplate(
            input_variables=["query", "docs", "k"],
            template="Classify these documents into distinct opinions on '{query}' and select the {k} most representative and diverse viewpoints:\nDocuments: {docs}\nSelected indices:"
        )
        opinion_chain = opinion_prompt | self.llm.with_structured_output(SelectedIndices)

        docs_text = "\n".join([f"{i}: {doc.page_content[:100]}..." for i, doc in enumerate(all_docs)])
        input_data = {"query": query, "docs": docs_text, "k": k}
        # SelectedIndices.indices is already a list of integers, so it is
        # iterated directly; it has no str.split to call.
        selected_indices = opinion_chain.invoke(input_data).indices
        print('selected diverse and relevant documents')

        selected_docs = []
        used_indices = set()
        for i in selected_indices:
            if 0 <= i < len(all_docs) and i not in used_indices:
                used_indices.add(i)
                selected_docs.append(all_docs[i])
        return selected_docs
class ContextualRetrievalStrategy(BaseRetrievalStrategy):
    """Retrieval that folds user-supplied context into the query before searching and ranking."""

    def retrieve(self, query, k=4, user_context=None):
        """Return up to ``k`` documents ranked by LLM relevance for the contextualized query.

        Args:
            query: The user's question.
            k: Number of documents to return; ``2 * k`` candidates are ranked.
            user_context: Optional context text. A fixed placeholder sentence
                is used when it is missing or empty.
        """
        print("retrieving contextual")
        context_text = user_context or "No specific context provided"

        # Let the LLM rewrite the query in light of the context.
        rewrite_prompt = PromptTemplate(
            template="Given the user context: {context}\nReformulate the query to best address the user's needs: {query}",
            input_variables=["query", "context"],
        )
        rewrite_chain = rewrite_prompt | self.llm
        rewritten = rewrite_chain.invoke({"query": query, "context": context_text})
        contextualized_query = rewritten.content
        print(f'contextualized query: {contextualized_query}')

        # Fetch twice as many candidates as will be returned.
        candidates = self.db.similarity_search(contextualized_query, k=k*2)

        # Score each candidate against the rewritten query and the context.
        scoring_prompt = PromptTemplate(
            template="Given the query: '{query}' and user context: '{context}', rate the relevance of this document on a scale of 1-10:\nDocument: {doc}\nRelevance score:",
            input_variables=["query", "context", "doc"],
        )
        scorer = scoring_prompt | self.llm.with_structured_output(relevant_score)
        print("ranking docs")

        scored = [
            (
                candidate,
                float(
                    scorer.invoke(
                        {
                            "query": contextualized_query,
                            "context": context_text,
                            "doc": candidate.page_content,
                        }
                    ).score
                ),
            )
            for candidate in candidates
        ]

        # Highest score first; ties keep their search order.
        best_first = sorted(scored, key=lambda pair: pair[1], reverse=True)
        return [candidate for candidate, _ in best_first[:k]]
class AdaptiveRetriever:
    """Routes each query to the retrieval strategy that matches its classified category."""

    def __init__(self, texts: List[str]):
        """Create the classifier and one strategy instance per category, each built from ``texts``."""
        self.classifier = QueryClassifier()
        self.strategies = {
            "Factual": FactualRetrievalStrategy(texts),
            "Analytical": AnalyticalRetrievalStrategy(texts),
            "Opinion": OpinionRetrievalStrategy(texts),
            "Contextual": ContextualRetrievalStrategy(texts)
        }

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Classify ``query`` and return the documents from the matching strategy.

        The label comes from an LLM, so it is normalised (surrounding
        whitespace removed, first letter upper-cased, rest lower-cased) before
        the lookup. A label that still matches no strategy falls back to the
        factual strategy instead of raising ``KeyError``.
        """
        category = self.classifier.classify(query)
        strategy = self.strategies.get(str(category).strip().capitalize())
        if strategy is None:
            print(f"unrecognised category {category!r}; falling back to Factual")
            strategy = self.strategies["Factual"]
        return strategy.retrieve(query)
class PydanticAdaptiveRetriever(BaseRetriever):
    """LangChain retriever that delegates to an ``AdaptiveRetriever``.

    Retrieval is implemented in the ``_get_relevant_documents`` hook, which is
    the method ``BaseRetriever`` subclasses are expected to provide. The public
    entry points (``invoke`` and the inherited ``get_relevant_documents`` /
    ``aget_relevant_documents``) are left to the base class. No async hook is
    defined here, so async calls use the base class's default instead of
    running the blocking lookup directly inside a coroutine.
    """

    # The wrapped router; excluded from serialisation.
    adaptive_retriever: AdaptiveRetriever = Field(exclude=True)

    class Config:
        # AdaptiveRetriever is a plain class, not a pydantic model.
        arbitrary_types_allowed = True

    def _get_relevant_documents(self, query: str, *, run_manager=None) -> List[Document]:
        """Return the documents chosen by the adaptive retriever for ``query``.

        ``run_manager`` is accepted because the base class passes it as a
        keyword argument; it is not used.
        """
        return self.adaptive_retriever.get_relevant_documents(query)
class AdaptiveRAG:
    """Question answering over ``texts``: adaptive retrieval followed by a Gemini answer prompt."""

    def __init__(self, texts: List[str]):
        """Build the adaptive retriever, the LLM client and the answer chain."""
        adaptive_retriever = AdaptiveRetriever(texts)
        self.retriever = PydanticAdaptiveRetriever(adaptive_retriever=adaptive_retriever)
        self.llm = ChatGoogleGenerativeAI(temperature=0, model="gemini-1.5-flash", max_tokens=4000, google_api_key=GOOGLE_API_KEY)

        # Create a custom prompt
        prompt_template = """Use the following pieces of context to answer the question at the end.
        If you don't know the answer, just say that you don't know, don't try to make up an answer.

        {context}

        Question: {question}
        Answer:"""
        prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

        # Create the LLM chain
        self.llm_chain = prompt | self.llm

    def answer(self, query: str) -> Any:
        """Retrieve context for ``query`` and return the answer chain's result.

        Returns:
            Whatever ``self.llm_chain.invoke`` returns. The value is not
            converted to ``str``; the answer text is read from its
            ``.content`` attribute.
        """
        # `invoke` is the retriever's Runnable entry point and replaces the
        # deprecated get_relevant_documents call.
        docs = self.retriever.invoke(query)
        input_data = {"context": "\n".join([doc.page_content for doc in docs]), "question": query}
        return self.llm_chain.invoke(input_data)
# Sample corpus: one passage about the Earth-Sun relationship. It is written as
# adjacent string literals (joined by the parser into a single string) so the
# literal stays valid Python; the one line break in the passage is spelled "\n".
texts = [
    "The Earth orbits the Sun at an average distance of about 149.6 million kilometers (1 AU). "
    "Its orbit is nearly circular, taking about 365.25 days to complete, defining a year. "
    "The Earth's axis is tilted at 23.5°, causing seasons as sunlight distribution changes. "
    "When the Northern Hemisphere tilts toward the Sun, it’s summer there and winter in the Southern Hemisphere. "
    "Equinoxes (March 21, Sept 23) have equal day and night; solstices (June 21, Dec 21) mark extremes. "
    "Earth rotates in about 24 hours, creating day and night as the Sun appears to rise in the east. "
    "Solar energy from the Sun drives Earth’s weather, photosynthesis, and ecosystems. "
    "The Sun’s light is filtered by Earth’s atmosphere, which also protects against harmful radiation. "
    "Solar activity like flares and CMEs can affect satellites and power grids. "
    "Earth’s magnetic field shields us from solar winds, contributing to auroras. "
    "The Sun’s gravity holds Earth in orbit, while Earth exerts a small pull in return. "
    "Earth is the third planet in the solar system and lies in the habitable zone. "
    "The Sun is a G-type main-sequence star, about 4.6 billion years old, made mostly of hydrogen and helium. "
    "Aphelion (farthest point) is ~152.1 million km in July; perihelion (closest) is ~147.1 million km in January. "
    "Seasons depend more on axial tilt than distance. "
    "The Sun's energy varies by latitude due to Earth's curvature. "
    "Sunspots are cooler, dark solar regions that follow an 11-year cycle. "
    "Solar eclipses occur when the Moon blocks the Sun, made possible by their apparent similar sizes. "
    "The Sun’s radiation sustains life and drives the climate. "
    "Earth’s orbital dynamics and tilt shape day length and weather. "
    "The Earth-Sun interaction influences everything from agriculture to calendars. "
    "Ancient cultures used solar movement for timekeeping and alignment of structures. "
    "Without the Sun’s gravity and energy, Earth would be lifeless and drifting. "
    "Understanding this relationship is vital for climate studies and space missions. \n"
    "It is this unique balance that makes Earth habitable."
    ]
# Build the full pipeline over the sample passage.
rag_system = AdaptiveRAG(texts)

# Each variable is named after the category its question is meant to exercise;
# the category actually used is whatever the classifier returns at run time.
# answer() returns the chain's result, whose text is read from `.content`.
factual_result = rag_system.answer("What is the distance between the Earth and the Sun?").content
print(f"Answer: {factual_result}")

analytical_result = rag_system.answer("How does the Earth's distance from the Sun affect its climate?").content
print(f"Answer: {analytical_result}")

opinion_result = rag_system.answer("What are the different theories about the origin of life on Earth?").content
print(f"Answer: {opinion_result}")

# No user context is passed here, so a contextual retrieval would run with its placeholder context.
contextual_result = rag_system.answer("How does the Earth's position in the Solar System influence its habitability?").content
print(f"Answer: {contextual_result}")

  class PydanticAdaptiveRetriever(BaseRetriever):
  class PydanticAdaptiveRetriever(BaseRetriever):


clasiffying query
retrieving factual
enhanced query: Several enhancements are possible, depending on the desired level of precision and context:

**Option 1 (More precise):**

> What is the average distance between the Earth and the Sun (in astronomical units and kilometers)?  Specify whether this refers to the semi-major axis of Earth's orbit or another measure (e.g., perihelion, aphelion).

This clarifies the desired units and accounts for the elliptical nature of Earth's orbit, requesting specific terms for different distances.

**Option 2 (Contextual):**

> What is the distance between the Earth and the Sun at [specific date or time]?  Specify the units used (e.g., kilometers, astronomical units).

This adds a temporal element, making the query relevant to a specific point in time.

**Option 3 (Most precise and comprehensive):**

> What is the current distance between the Earth and the Sun, expressed in both astronomical units and kilometers?  Provide the values for perihelion, aph