In [None]:
!pip install python-dotenv 
!pip install langchain -U langchain-community 
!pip install PyMuPDF 
!pip install rank-bm25 
!pip install deepeval 
!pip install langchain_ollama 
!pip install pypdf 
!pip install sentence-transformers 
!pip install faiss-gpu

curl -fsSL https://ollama.com/install.sh | sh

ollama serve

ollama pull llama3.1

In [1]:
from concurrent.futures import ThreadPoolExecutor
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain_community.llms import Ollama
from langchain.schema import Document
import textwrap
import fitz
import asyncio
import random
import numpy as np
import json
from typing import List
from rank_bm25 import BM25Okapi

In [2]:
# 1. Vectorize Tab Replacement for Multiple Docs
def replace_tabs_with_spaces(docs: List[Document]) -> List[Document]:
    for doc in docs:
        doc.page_content = doc.page_content.replace('\t', ' ')
    return docs


# 2. Faster Text Wrapping Using List Comprehension
def wrap_text_with_width(text: str, width=120) -> str:
    return "\n".join([text[i:i+width] for i in range(0, len(text), width)])


# 3. Use Threading for Loading PDF and Vectorizing
def pdf_to_vectorstore(path: str, chunk_size=1000, chunk_overlap=200) -> FAISS:
    loader = PyPDFLoader(path)
    docs = loader.load()

    # Perform text splitting and cleaning in parallel
    with ThreadPoolExecutor() as executor:
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len)
        chunks = list(executor.map(lambda d: text_splitter.split_documents([d]), docs))

    chunks = [item for sublist in chunks for item in sublist]  # Flatten the list
    cleaned_chunks = replace_tabs_with_spaces(chunks)

    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(cleaned_chunks, embeddings)
    
    return vectorstore


# 4. Cache Document Contexts to Avoid Repeated Retrievals
context_cache = {}

def get_context_for_question(question: str, retriever) -> List[str]:
    if question in context_cache:
        return context_cache[question]

    docs = retriever.get_relevant_documents(question)
    context = [doc.page_content for doc in docs]
    context_cache[question] = context
    return context


# 5. Async BM25 Retrieval for Faster Querying
async def retrieve_with_bm25(bm25: BM25Okapi, texts: List[str], query: str, k: int = 5) -> List[str]:
    query_tokens = query.split()
    bm25_scores = bm25.get_scores(query_tokens)
    top_k_indices = np.argsort(bm25_scores)[::-1][:k]
    return [texts[i] for i in top_k_indices]


# 6. Faster Retry Mechanism for Backoff
async def retry_with_backoff(coroutine, max_retries=3):
    for attempt in range(max_retries):
        try:
            return await coroutine
        except RateLimitError:
            if attempt == max_retries - 1:
                raise
            await asyncio.sleep(2 ** attempt + random.uniform(0, 1))


# 7. Batch-Process Retrieval
def batch_retrieve_context(questions: List[str], retriever) -> List[List[str]]:
    return [get_context_for_question(q, retriever) for q in questions]


# 8. Async Batch Answer Generation
async def batch_generate_answers(questions: List[str], contexts: List[str], qa_chain) -> List[dict]:
    tasks = []
    for question, context in zip(questions, contexts):
        input_data = {"question": question, "context": context}
        tasks.append(qa_chain.invoke(input_data))

    results = await asyncio.gather(*tasks)
    return [{"answer": res.answer, "context": ctx, "question": q} for res, q, ctx in zip(results, questions, contexts)]


# 9. Optimized Evaluation Using Numpy for Vector Operations
def run_evaluation(retriever, num_questions: int = 5) -> None:
    llm = initialize_llm_model()
    qa_chain = create_qa_chain(llm)

    with open("../data/q_a.json", "r", encoding="utf-8") as file:
        q_a_data = json.load(file)

    questions = [qa["question"] for qa in q_a_data][:num_questions]
    ground_truth_answers = [qa["answer"] for qa in q_a_data][:num_questions]

    # Batch process questions
    retrieved_docs_batch = batch_retrieve_context(questions, retriever)
    retrieved_contexts = [" ".join(doc) for doc in retrieved_docs_batch]

    # Async answer generation for all questions
    loop = asyncio.get_event_loop()
    generated_answers = loop.run_until_complete(batch_generate_answers(questions, retrieved_contexts, qa_chain))

    # Score computations
    correctness_scores = np.array([correctness_eval(g["answer"], gt) for g, gt in zip(generated_answers, ground_truth_answers)])
    faithfulness_scores = np.array([faithfulness_eval(g["answer"], r) for g, r in zip(generated_answers, retrieved_docs_batch)])
    relevance_scores = np.array([relevance_eval(q, g["answer"]) for q, g in zip(questions, generated_answers)])

    print(f"Avg Correctness: {correctness_scores.mean()}\nAvg Faithfulness: {faithfulness_scores.mean()}\nAvg Relevance: {relevance_scores.mean()}")


# Define Correctness, Faithfulness, Relevance Evaluation
def correctness_eval(generated_answer: str, ground_truth: str) -> float:
    return 1.0 if generated_answer.strip().lower() == ground_truth.strip().lower() else 0.0


def faithfulness_eval(generated_answer: str, retrieved_documents: List[str]) -> float:
    context_string = " ".join(retrieved_documents).lower()
    return 1.0 if generated_answer.strip().lower() in context_string else 0.0


def relevance_eval(question: str, generated_answer: str) -> float:
    return 1.0 if question.lower() in generated_answer.lower() else 0.0


# Initialize Ollama Model
def initialize_llm_model():
    return Ollama(model="llama3.1", temperature=0)


# Create the Question-Answer Chain
def create_qa_chain(llm):
    question_answer_prompt_template = """ 
    For the question below, provide a concise but suffice answer based ONLY on the provided context:
    {context}
    Question:
    {question}
    """

    question_answer_prompt = PromptTemplate(template=question_answer_prompt_template, input_variables=["context", "question"])
    question_answer_chain = question_answer_prompt | llm
    return question_answer_chain


# Read PDF Content to String
def read_pdf_content(path: str) -> str:
    doc = fitz.open(path)
    content = ""
    for page_num in range(len(doc)):
        page = doc[page_num]
        content += page.get_text()
    return content


# HyDERetriever Class with Hypothetical Document Generation
class HyDERetriever:
    def __init__(self, files_path, chunk_size=500, chunk_overlap=100):
        self.llm = initialize_llm_model()
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.vectorstore = pdf_to_vectorstore(files_path, chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap)

        self.hyde_prompt = PromptTemplate(
            input_variables=["query", "chunk_size"],
            template="""Given the question '{query}', generate a hypothetical document that directly answers this question. The document should be detailed and in-depth.
                        The document size should be exactly {chunk_size} characters.""",
        )
        self.hyde_chain = self.hyde_prompt | self.llm

    def generate_hypothetical_document(self, query):
        input_variables = {"query": query, "chunk_size": self.chunk_size}
        result = self.hyde_chain.invoke(input_variables)
        return result if isinstance(result, str) else result.content

    def retrieve(self, query, k=3):
        hypothetical_doc = self.generate_hypothetical_document(query)
        similar_docs = self.vectorstore.similarity_search(hypothetical_doc, k=k)
        return similar_docs, hypothetical_doc


In [4]:
# Example Usage
if __name__ == "__main__":
    path = path = "/mnt/e/Virtual Agent/Doc/Facebook.pdf"
    retriever = HyDERetriever(path)

    test_query = "What were the primary reasons for Facebook's rebranding to Meta Platforms, Inc. in 2021?"
    results, hypothetical_doc = retriever.retrieve(test_query)

    docs_content = [doc.page_content for doc in results]
    print("Hypothetical Document:\n")
    print(wrap_text_with_width(hypothetical_doc) + "\n")

    for i, doc in enumerate(docs_content):
        print(f"Context {i+1}:\n{wrap_text_with_width(doc)}\n")

Hypothetical Document:

**Meta Platforms, Inc. Rebranding Report**

**Executive Summary:**
In October 2021, Facebook, Inc. rebranded to Meta Pla
tforms, Inc., marking a significant shift in the company's identity. This report outlines the primary reasons behind thi
s transformation.

**Reasons for Rebranding:**

1. **Expansion of Services:** The rebranding reflects Facebook's evoluti
on into a comprehensive metaverse platform, encompassing virtual reality (VR), augmented reality (AR), and online social
 interactions.
2. **Diversification of Business:** By separating the company name from its primary product, Meta Platfor
ms, Inc. aims to distance itself from controversies surrounding Facebook, while emphasizing its broader technological am
bitions.
3. **Preparation for Emerging Technologies:** The rebranding positions the company for future growth in emergin
g technologies like VR and AR, which are expected to play a significant role in shaping the digital landscape.

**Conclu
sion:**
