# RAG System with Gemini and FAISS
# ================================

In [4]:
import os
from typing import List, Union
from dotenv import load_dotenv
import requests
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAI
from langchain_huggingface import HuggingFaceEmbeddings


## 1. Configuration
First, let's set up the configuration for our RAG system.

In [None]:
load_dotenv()

# Configuration settings
EMBEDDING = "sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_MODEL = HuggingFaceEmbeddings(model_name=EMBEDDING)
CHUNK_SIZE = 800
CHUNK_OVERLAP = 80
RETRIEVER_K = 4
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

if not GEMINI_API_KEY:
    raise ValueError("GEMINI_API_KEY not found in environment variables")

llm = GoogleGenerativeAI(
    api_key=GEMINI_API_KEY,
    model="gemini-2.0-flash",
    verbose=False
)


## 2. Document Processing Functions
Let's define functions to download and process PDF documents.

In [6]:
def download_pdf(url: str, folder: str = 'documents') -> str:
    """
    Downloads PDF from given URL
    """
    os.makedirs(folder, exist_ok=True)
    filename = os.path.basename(url.split('?')[0])
    filepath = os.path.join(folder, filename)

    response = requests.get(url, timeout=10)
    response.raise_for_status()

    with open(filepath, 'wb') as f:
        f.write(response.content)
    return filepath

def process_document(
    documents: Union[List[str], List[Document]], 
    chunk_size: int, 
    chunk_overlap: int
) -> List[Document]:
    """
    Process a list of Document objects or URLs into chunks while preserving parent document relationships.
    """
    if isinstance(chunk_size, str):
        chunk_size = int(chunk_size)
    if isinstance(chunk_overlap, str):
        chunk_overlap = int(chunk_overlap)

    if documents and isinstance(documents[0], str):
        loaded_docs = []
        for url in documents:
            pdf_path = download_pdf(url)
            pdf_docs = PyPDFLoader(pdf_path).load()
            loaded_docs.extend(pdf_docs)
        documents = loaded_docs

    def get_filename(path):
        if not path or path == "unknown":
            return "unknown_document"
        return os.path.basename(path).split('.')[0]

    source_groups = {}
    for doc in documents:
        source = doc.metadata.get("source", "unknown")
        source_groups.setdefault(source, []).append(doc)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    all_chunks = []
    for source, docs in source_groups.items():
        docs.sort(key=lambda x: x.metadata.get("page", 0))
        parent_id = get_filename(source)
        chunks = splitter.split_documents(docs)

        for i, chunk in enumerate(chunks):
            page_num = chunk.metadata.get("page", 0)
            chunk_id = f"{parent_id}_p{page_num}_c{i}"
            chunk.metadata.update({
                "parent_id": parent_id,
                "parent_source": source,
                "chunk_id": chunk_id,
                "chunk_index": i,
                "total_chunks": len(chunks)
            })
            all_chunks.append(chunk)

    return all_chunks

## 3. Vector Store Creation
Now let's create a function to build our vector store from documents.

In [7]:
from langchain.vectorstores import FAISS
def process_and_store_documents(documents: List[str], 
                              chunk_size: int, 
                              chunk_overlap: int,
                              embedding_model = EMBEDDING_MODEL,
                              persist_directory: str = None) -> FAISS:
    """
    Process documents into chunks and store them in a FAISS vector database
    while preserving parent document relationships.
    """
    # First process the documents into chunks with parent metadata
    chunks = process_document(documents, chunk_size, chunk_overlap)
    
    vector_store = FAISS.from_documents(chunks, embedding_model)
    
    # Optionally persist to disk
    if persist_directory:
        vector_store.save_local(persist_directory)
    
    print(f"Added {len(chunks)} chunks from {len(set([c.metadata['parent_id'] for c in chunks]))} parent documents to FAISS vector store")
    
    return vector_store



## Long-Term memory storage in Vectorstore

In [8]:
from pathlib import Path
import faiss
from langchain.docstore.in_memory import InMemoryDocstore

def create_memory_vectorstore(embedding_model, path="memory_vectorstore"):
    if Path(path).exists():
        return FAISS.load_local(path, embedding_model, allow_dangerous_deserialization=True)
    
    dim = len(embedding_model.embed_query("test"))
    index = faiss.IndexFlatL2(dim)
    docstore = InMemoryDocstore()
    index_to_docstore_id = {}
    
    return FAISS(
        embedding_function=embedding_model,
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id
    )

def store_to_memory_vectorstore(question, answer, vectorstore, embedding_model):
    content = f"Q: {question}\nA: {answer}"
    doc = Document(page_content=content, metadata={"type": "chat_memory"})
    vectorstore.add_documents([doc])
    vectorstore.save_local("memory_vectorstore")
    
def get_relevant_memory(query, vectorstore, k=3):
    try:
        memory_docs = vectorstore.similarity_search(query, k=k)
        return "\n---\n".join([doc.page_content for doc in memory_docs])
    except (IndexError, ValueError) as e:
        # Common errors for empty vectorstore or dimension mismatch
        print(f"Skipping memory retrieval (reason: {str(e)})")
        return ""



In [9]:
from langchain_community.vectorstores import FAISS
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TRANSFORMERS_NO_FLAX"] = "1"
from sentence_transformers import CrossEncoder
from langchain.schema import BaseRetriever, Document

def create_parent_document_reranker(vectorstore, reranker_model, top_k_chunks=20, top_k_parents=4):
    class FunctionalRerankerRetriever(BaseRetriever):
        def get_relevant_documents(self, query: str) -> list[Document]:
            relevant_chunks_with_scores = vectorstore.similarity_search_with_score(query, k=top_k_chunks)
            chunks = [doc for doc, _ in relevant_chunks_with_scores]
            scores = [score for _, score in relevant_chunks_with_scores]

            parent_docs = {}
            for chunk, score in zip(chunks, scores):
                parent_id = chunk.metadata.get("parent_id") or chunk.metadata.get("doc_id", f"doc_{len(parent_docs)}")
                if parent_id not in parent_docs:
                    parent_docs[parent_id] = {
                        "chunks": [],
                        "scores": [],
                        "source": chunk.metadata.get("parent_source", "unknown")
                    }
                parent_docs[parent_id]["chunks"].append(chunk)
                parent_docs[parent_id]["scores"].append(score)

            parent_list = []
            for parent_id, parent in parent_docs.items():
                parent["chunks"].sort(key=lambda x: (x.metadata.get("page", 0), x.metadata.get("chunk_index", 0)))
                full_text = "\n".join([chunk.page_content for chunk in parent["chunks"]])
                rerank_score = reranker_model.predict([(query, full_text)])[0]

                for c in parent["chunks"]:
                    c.metadata["rerank_score"] = rerank_score

                parent_list.append({
                    "id": parent_id,
                    "chunks": parent["chunks"],
                    "rerank_score": rerank_score,
                    "source": parent["source"]
                })

            parent_list.sort(key=lambda x: x["rerank_score"], reverse=True)

            top_docs = []
            for parent in parent_list[:top_k_parents]:
                top_docs.extend(parent["chunks"])

            return top_docs

        async def aget_relevant_documents(self, query: str):
            raise NotImplementedError("Async version not implemented.")

    return FunctionalRerankerRetriever()


## Different types of prompts
Now let's create a function to build our vector store from documents.

In [10]:
from langchain.prompts import PromptTemplate

def get_prompt(prompt_type: str, role: str = "Ai Assistant") -> PromptTemplate:
    if prompt_type == "zero_shot":
        template = """Use the following context to answer the question. Be clear and concise.

        Context: {context}

        Question: {question}

        Answer:"""
    elif prompt_type == "explain_like_5":
        template = """Use the following pieces of context to answer the question. Explain like you are talking to a 5-year-old. 
        If the question is not related to the context, say "I don't know". If you don't know the answer, just say that you don't know.

        Context: {context}

        Question: {question}

        Provide a clear and concise answer.

        Answer:"""
    elif prompt_type == "cot":
        template = """Use the following context to answer the question. Think step-by-step and explain your reasoning.

        Context: {context}

        Question: {question}

        Let's think step by step:

        Answer:"""
    elif prompt_type == "elaborate":
        template = """Use the following context to answer the question in a detailed, formal tone. If you can't answer, say "I don't know".

        Context: {context}

        Question: {question}

        Detailed answer:"""
    elif prompt_type == "meta":
        template = """You are an AI assistant tasked with answering the question using the provided context. 
        First, generate an optimal prompt that would help an LLM perform this task effectively.
        Then, respond to that prompt yourself to complete the task.
        Reflect on your reasoning process as you answer. Clearly state what you know, what you are assuming, and how confident you are.

        Context:
        {context}

        Question:
        {question}

        Answer (include reasoning, assumptions, and confidence level):"""
    elif prompt_type == "role":
        template = """You are acting as {role}. Use the following context to answer the question appropriately for your role.
        If the question is not related to the context, say "I don't know". If you're unsure of the answer, acknowledge that.

        Context:
        {context}

        Question:
        {question}

        As a {role}, your answer:"""
    elif prompt_type == "react":
        template = """You are an intelligent assistant that reasons step-by-step and can use external context to answer questions.

        Use the following format:

        Question: {question}
        Thought: Think about what you need to find.
        Action: Look up relevant information in the context.
        Observation: Summarize what the context says.
        Thought: Reflect on how the observation answers the question.
        Final Answer: Give a complete and clear answer.

        Context:
        {context}

        Now follow the steps to answer:
        """
    elif prompt_type == "verify":
        template = """
        Given the context and the answer, verify if the answer is fully supported. 
        Respond with YES or NO, then explain briefly.
        
        Context:
        {context}
        
        Answer:
        {answer}
        
        Is this answer verifiable?
        """
    else:
        raise ValueError(f"Unknown prompt type: {prompt_type}")

    return PromptTemplate.from_template(template)


## 4. RAG Chain Creation
Let's create our RAG chain with the Gemini model.

In [11]:
from langchain.schema import BaseRetriever
from langchain.chains import LLMChain, ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from sentence_transformers import CrossEncoder


def create_rag_chain(
    vectorstore,
    llm,
    prompt_type: str = "zero_shot",
    use_memory: bool = False,
    use_reranking: bool = False,
    reranker_model=None,
    retriever_k: int = 4,
    top_k_chunks: int = 20,
    rewrite_prompt: bool = False
):
    """
    Create a RAG chain with optional memory, reranking, and query rewriting.
    """
    prompt = get_prompt(prompt_type)

    memory = (
        ConversationBufferMemory(
            memory_key="chat_history",
            input_key="question",
            output_key="answer",
            return_messages=True
        )
        if use_memory else None
    )

    if use_reranking:
        reranker_model = reranker_model or CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
        retriever = create_parent_document_reranker(
            vectorstore=vectorstore,
            reranker_model=reranker_model,
            top_k_chunks=top_k_chunks,
            top_k_parents=retriever_k
        )
    else:
        retriever = vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": retriever_k}
        )

    base_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        combine_docs_chain_kwargs={"prompt": prompt},
        return_source_documents=True,
        memory=memory,
        verbose=False
    )

    if rewrite_prompt:
        rewrite_template = (
            "Given the chat history:\n\n{chat_history}\n\n"
            "And the new question:\n{question}\n\n"
            "Rewrite the question to be standalone, give only one best example:"
            if use_memory else
            "Given the new question: {question}\n\nRewrite the question to be standalone, give only one best example:"
        )

        rewrite_prompt_obj = PromptTemplate.from_template(rewrite_template)
        rewriter_chain = LLMChain(llm=llm, prompt=rewrite_prompt_obj)

        def wrapped_chain(inputs):
            rewrite_input = {"question": inputs["question"]}

            chat_history_text = ""
            if use_memory and memory is not None:
                chat_history = memory.load_memory_variables({}).get("chat_history", [])
                chat_history_text = "\n".join(
                    [f"{msg.type.title()}: {msg.content}" for msg in chat_history]
                )
                rewrite_input["chat_history"] = chat_history_text

            standalone_question = rewriter_chain.run(rewrite_input)

            # Explicitly passing 'chat_history' to base_chain fixes the error
            result = base_chain({
                "question": standalone_question,
                "chat_history": chat_history_text if use_memory else []
            })

            result.update({
                "standalone_question": standalone_question,
                "original_question": inputs["question"]
            })
            return result

        return wrapped_chain, memory

    return base_chain, memory

In [12]:
def print_result_summary(result):
    print(f"\n🧠 Answer:\n{result['answer']}\n")

    print("🔎 Retrieved Documents:")
    for i, doc in enumerate(result["source_documents"], 1):
        print(f"\n📄 Chunk #{i}")
        print(f"📚 Source: {doc.metadata.get('parent_source', 'unknown')}")
        print(f"📄 Page: {doc.metadata.get('page', 'unknown')}")
        print(f"🏷️ Rerank Score: {doc.metadata.get('rerank_score', 'N/A')}")
        print("📝 Excerpt:")
        print(doc.page_content.strip()[:500] + ("..." if len(doc.page_content) > 500 else ""))


## 5. Building the Complete RAG System
Now let's put everything together to build our RAG system.

In [13]:
# Prepare documents
docs = ['https://assets.pokemon.com/assets/cms2/pdf/trading-card-game/rulebook/sm7_rulebook_en.pdf','https://media.wizards.com/images/magic/tcg/resources/rules/MagicCompRules_21031101.pdf','https://cdn.1j1ju.com/medias/d3/22/83-monopoly-rulebook.pdf','https://fgbradleys.com/wp-content/uploads/rules/Monopoly_Rules.pdf?srsltid=AfmBOorDaiGKyaEWIQFd-au0rl8-tKoqedlzy_6r4EETpj_ZMIUYsNMQ']
process_and_store_documents(docs,'800','80', persist_directory="./faiss_index")
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
faiss_db = FAISS.load_local("./faiss_index", EMBEDDING_MODEL, allow_dangerous_deserialization=True)

Added 1074 chunks from 4 parent documents to FAISS vector store


You are trying to use a model that was created with Sentence Transformers version 4.1.0.dev0, but you're currently using version 4.0.2. This might cause unexpected behavior or errors. In that case, try to update to the latest version.


In [14]:
rag_chain, rag_memory = create_rag_chain(
    vectorstore=faiss_db,
    llm=llm,
    prompt_type="zero_shot",
    use_memory=False,
    reranker_model=reranker,
    retriever_k=3,
    use_reranking=False,
    top_k_chunks=10,
    rewrite_prompt=True
)

  rewriter_chain = LLMChain(llm=llm, prompt=rewrite_prompt_obj)


## 7. Testing with Sample Queries
Let's test our RAG system with some sample queries.

In [15]:
faiss_memory = create_memory_vectorstore(
    embedding_model=EMBEDDING_MODEL, path="memory_vectorstore"
)

# Test some queries
test_queries = [
    "what is EX card?",
    "what is haste?",
    "how to build a hotel?"
]

for query in test_queries:
    print(f"\n🔍 Query: {query}")
    memory_context = get_relevant_memory(query, faiss_memory)
    result = rag_chain({"question": query, "chat_history": []})
    print(f"\n🔍 Updated Query: "+result["standalone_question"])
    print_result_summary(result)
    store_to_memory_vectorstore(
        question=query,
        answer=result["answer"],
        vectorstore=faiss_memory,         
        embedding_model=EMBEDDING_MODEL
    )

  standalone_question = rewriter_chain.run(rewrite_input)



🔍 Query: what is EX card?


  result = base_chain({



🔍 Updated Query: 

What is an EX card in the context of Pokemon Trading Card Game?

Answer:
An EX card is a special type of Pokemon card in the Pokemon Trading Card Game. It was introduced in the EX Series, which began in 2001. EX cards feature powerful Pokemon with higher stats and abilities compared to regular cards. For example, Charizard EX from the Charizard EX Holofoil card (EX Holo: Charizard #112/112) is an EX card. It has a higher HP and attack power than the regular Charizard card. Additionally, it has special abilities like "Double Dragon Dance," which allows it to attack twice in one turn.

🧠 Answer:
 (Alternative)

An EX card is a type of Pokemon card in the Pokemon Trading Card Game that represents a more powerful version of a Pokemon. These cards were introduced in the EX Series and have distinct characteristics, such as a larger size, a special symbol (EX), and unique abilities. When an EX card is Knocked Out, the opponent takes two Prize cards instead of one. Examples




🔍 Updated Query: 

What is haste, and how does it affect a character in a role-playing game?

Example answer:

Haste is a status effect in many role-playing games that increases a character's speed and often grants additional benefits, such as increased attack speed or the ability to perform multiple actions in a single turn. For instance, in the game Dungeons & Dragons, a character under the effect of a haste spell can perform twice as many actions in a turn as they normally would. However, the use of haste often comes with drawbacks, such as increased risk of error or reduced effectiveness of certain abilities.

🧠 Answer:


Haste is a static ability in the Magic: The Gathering card game that allows a creature to attack even if it hasn't been controlled by its controller continuously since their most recent turn began. It doesn't have any direct impact on a character in a role-playing game, but it can be simulated by granting a character an extra action or allowing them to take actio




🔍 Updated Query: 

What are the steps to build a luxury hotel from the ground up?

1. Location Selection: Choose a prime location with easy access to transportation, tourist attractions, and business centers.
2. Market Research: Conduct thorough market research to understand the demand for hotels in the area, competition, and target audience.
3. Business Plan: Develop a comprehensive business plan, including financial projections, marketing strategy, and operational structure.
4. Design and Architecture: Hire a reputable architectural firm to design the hotel, ensuring it aligns with local building codes and regulations.
5. Permits and Approvals: Obtain all necessary permits and approvals from local authorities, including zoning permits, building permits, and environmental permits.
6. Construction: Hire a reputable construction company to build the hotel, ensuring they have experience in constructing hotels and can deliver the project on time and within budget.
7. Furnishing and Decor

## Rewritten prompt

In [16]:
test_queries = [
    "what is EX card?",
    "what is haste?",
    "how to build a hotel?"
]
for query in test_queries:
    result = rag_chain({"question": query, "chat_history": []})
    print_result_summary(result)
    store_to_memory_vectorstore(
        question=query,
        answer=result["answer"],
        vectorstore=faiss_memory,         
        embedding_model=EMBEDDING_MODEL
    )




🧠 Answer:
 (Alternative)

An EX card is a type of Pokemon card in the Pokemon Trading Card Game that represents a more powerful version of a Pokemon. These cards were introduced in the EX Series and have distinct characteristics, such as a larger size, a special symbol (EX), and unique abilities. When an EX card is Knocked Out, the opponent takes two Prize cards instead of one. Examples of EX cards include Yveltal-EX and Charizard-EX.

🔎 Retrieved Documents:

📄 Chunk #1
📚 Source: documents\sm7_rulebook_en.pdf
📄 Page: 27
🏷️ Rerank Score: N/A
📝 Excerpt:
28
Pokémon T rading Card Game Rules
APPENDIX J: POKÉMON-EX
Pokémon-EX are powerful Pokémon that show off a Pokémon with more HP and stronger attacks than regular 
Pokémon, but there are risks to playing these powered-up Pokémon!
SPECIAL RULES FOR POKÉMON -EX
The EX is part of a Pokémon-EX’s name. Thus Yveltal and Yveltal-EX 
have different names, so you can have up to 4 of each in your deck 
if you wish.
When one of your Pokémon-EX is Kn




🧠 Answer:


Haste is a static ability in the Magic: The Gathering card game that allows a creature to attack even if it hasn't been controlled by its controller continuously since their most recent turn began. It doesn't have any direct impact on a character in a role-playing game, but it can be simulated by granting a character an extra action or allowing them to take actions out of turn. However, it's important to note that the rules for haste in Magic: The Gathering and in role-playing games may differ significantly.

🔎 Retrieved Documents:

📄 Chunk #1
📚 Source: documents\MagicCompRules_21031101.pdf
📄 Page: 102
🏷️ Rerank Score: N/A
📝 Excerpt:
702.8b Multiple instances of flash on the same object are redundant. 
 
702.9. Flying 
 
702.9a Flying is an evasion ability. 
 
702.9b A creature with flying can’t be blocked except by creatures with flying and/or reach. A 
creature with flying can block a creature with or without flying. (See rule 509, “Declare 
Blockers Step,” and rule 702.




🧠 Answer:


To build a luxury hotel from the ground up in Monopoly, follow these steps:

1. Acquire a complete color-group of properties.
2. Build four houses on each property of the color-group.
3. Once you have four houses on each property, you may buy a hotel from the Bank and erect it on any property of the color-group.
4. Return the four houses from that property to the Bank and pay the price for the hotel as shown on the Title Deed card.
5. Ensure there are houses and hotels available for sale in the Bank before proceeding.
6. If necessary, participate in an auction to purchase the hotel if there are multiple players vying for the same property.
7. Sell back any unnecessary houses or hotels to the Bank at any time for one-half the price paid for them.
8. Remember that only one hotel may be erected on any one property.

🔎 Retrieved Documents:

📄 Chunk #1
📚 Source: documents\Monopoly_Rules.pdf
📄 Page: 2
🏷️ Rerank Score: N/A
📝 Excerpt:
HOTELS...When a player has four houses on each

## RAGAS EVALUATION

In [17]:
from datasets import Dataset

# Example evaluation data
eval_data = [
    {
        "question": "What is a Pokémon EX card?",
        "answer": "A Pokémon-EX card is a powerful type of card with higher HP and stronger attacks. When it’s knocked out, the opponent takes 2 Prize cards.",
        "contexts": [
            "Pokémon-EX cards have more HP and stronger attacks than regular Pokémon cards. If one is knocked out, the opponent takes 2 Prize cards."
        ],
        "reference": "Pokémon-EX cards are a special kind of card that have more HP and stronger attacks than normal cards. When knocked out, they give the opponent 2 Prize cards."
    },
    {
        "question": "What does haste do in Magic: The Gathering?",
        "answer": "Haste allows creatures to attack and use abilities immediately after being played, ignoring summoning sickness.",
        "contexts": [
            "Creatures with haste can attack or use tap/untap abilities even if they haven't been under the player's control since the beginning of the turn."
        ],
        "reference": "Haste is a keyword that allows a creature to attack or use abilities as soon as it enters the battlefield, bypassing summoning sickness."
    },
    {
        "question": "How can a player build a hotel in Monopoly?",
        "answer": "To build a hotel, a player must first own all properties in a color group and build four houses on each property. Then, they can exchange the houses for a hotel.",
        "contexts": [
            "Once a player has four houses on each property in a color group, they may buy a hotel and place it on any of those properties. The four houses are returned to the bank."
        ],
        "reference": "A player must have four houses on every property in a color set to buy a hotel. The player exchanges the houses for one hotel and pays the bank the cost listed on the title deed."
    }
]

eval_dataset = Dataset.from_list(eval_data)


In [18]:
from ragas.metrics import faithfulness, answer_relevancy, context_precision

metrics = [faithfulness, answer_relevancy, context_precision]


In [19]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

# Wrap the LLM
evaluator_llm = LangchainLLMWrapper(llm)

# Evaluate the dataset
results = evaluate(dataset=eval_dataset, metrics=metrics, llm=evaluator_llm)

# Display the results
print(results)

Evaluating:   0%|          | 0/9 [00:00<?, ?it/s]

Exception raised in Job[6]: ClientResponseError(402, message='Payment Required', url='https://router.huggingface.co/hf-inference/models/mistralai/Mistral-7B-Instruct-v0.2')
Exception raised in Job[4]: ClientResponseError(402, message='Payment Required', url='https://router.huggingface.co/hf-inference/models/mistralai/Mistral-7B-Instruct-v0.2')
Exception raised in Job[1]: ClientResponseError(402, message='Payment Required', url='https://router.huggingface.co/hf-inference/models/mistralai/Mistral-7B-Instruct-v0.2')
Exception raised in Job[7]: ClientResponseError(402, message='Payment Required', url='https://router.huggingface.co/hf-inference/models/mistralai/Mistral-7B-Instruct-v0.2')
Exception raised in Job[8]: ClientResponseError(402, message='Payment Required', url='https://router.huggingface.co/hf-inference/models/mistralai/Mistral-7B-Instruct-v0.2')
Exception raised in Job[2]: ClientResponseError(402, message='Payment Required', url='https://router.huggingface.co/hf-inference/models

{'faithfulness': nan, 'answer_relevancy': nan, 'context_precision': nan}


In [22]:
import pandas as pd
from itertools import product
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain, ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from sentence_transformers import CrossEncoder
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import faithfulness, answer_relevancy, context_precision
from datasets import Dataset

# Assuming existing helper functions are imported:
# get_prompt(), create_parent_document_reranker(), create_memory_vectorstore(), get_relevant_memory(), store_to_memory_vectorstore()
# and previously defined create_rag_chain().

queries = [
    "what is EX card?",
    "what is haste?",
    "how to build a hotel?"
]

prompt_types = ["zero_shot", "react", "cot"]

configurations = [
    {"use_memory": False, "rewrite_prompt": False, "use_reranking": False, "prompt_type": p} for p in prompt_types]
#  + [
#     {"use_memory": False, "rewrite_prompt": True, "use_reranking": False, "prompt_type": p} for p in prompt_types
# ] + [
#     {"use_memory": True, "rewrite_prompt": False, "use_reranking": False, "prompt_type": p} for p in prompt_types
# ] + [
#     {"use_memory": True, "rewrite_prompt": True, "use_reranking": False, "prompt_type": p} for p in prompt_types
# ] + [
#     {"use_memory": False, "rewrite_prompt": False, "use_reranking": True, "prompt_type": p} for p in prompt_types
# ] + [
#     {"use_memory": True, "rewrite_prompt": True, "use_reranking": True, "prompt_type": p} for p in prompt_types
# ]

EMBEDDING_MODEL = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
faiss_memory = create_memory_vectorstore(embedding_model=EMBEDDING_MODEL, path="memory_vectorstore")

wrapped_llm = LangchainLLMWrapper(llm)
metrics = [faithfulness, answer_relevancy]

eval_data = [
    {
        "question": "What is a Pokémon EX card?",
        "answer": "A Pokémon-EX card is a powerful type of card with higher HP and stronger attacks. When it’s knocked out, the opponent takes 2 Prize cards.",
        "contexts": [
            "Pokémon-EX cards have more HP and stronger attacks than regular Pokémon cards. If one is knocked out, the opponent takes 2 Prize cards."
        ],
        "reference": "Pokémon-EX cards are a special kind of card that have more HP and stronger attacks than normal cards. When knocked out, they give the opponent 2 Prize cards."
    },
    {
        "question": "What does haste do in Magic: The Gathering?",
        "answer": "Haste allows creatures to attack and use abilities immediately after being played, ignoring summoning sickness.",
        "contexts": [
            "Creatures with haste can attack or use tap/untap abilities even if they haven't been under the player's control since the beginning of the turn."
        ],
        "reference": "Haste is a keyword that allows a creature to attack or use abilities as soon as it enters the battlefield, bypassing summoning sickness."
    },
    {
        "question": "How can a player build a hotel in Monopoly?",
        "answer": "To build a hotel, a player must first own all properties in a color group and build four houses on each property. Then, they can exchange the houses for a hotel.",
        "contexts": [
            "Once a player has four houses on each property in a color group, they may buy a hotel and place it on any of those properties. The four houses are returned to the bank."
        ],
        "reference": "A player must have four houses on every property in a color set to buy a hotel. The player exchanges the houses for one hotel and pays the bank the cost listed on the title deed."
    }
]


In [23]:
import os
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
from langchain_huggingface import HuggingFaceEndpoint
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
# Load environment variables
load_dotenv()

# Initialize Hugging Face client for discriminator
hf_api_key = os.getenv("HUGGINGFACE_API_KEY")
hf_model_name = "mistralai/Mistral-7B-Instruct-v0.2" # Replace with your preferred model
discriminator_client = InferenceClient(
    model=hf_model_name,
    token=hf_api_key
)
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    task="text-generation",
    huggingfacehub_api_token=hf_api_key,
    temperature=0.1,
    max_new_tokens=512
)
# Function to use HuggingFace as discriminator
def hf_discriminator_eval(query, answer):
    try:
        prompt = f"""
        Question: {query} 
        Answer: {answer} 
        
        Is this answer accurate and helpful? Respond with 1 to 5, where 1 is not helpful at all and 5 is very helpful.
        """
        
        response = discriminator_client.text_generation(
            prompt,
            max_new_tokens=100,
            temperature=0.1,
            repetition_penalty=1.1
        )
        
        return response
    except Exception as e:
        print(f"Error with discriminator evaluation: {e}")
        return "Error in evaluation"

# Main evaluation loop
for query in queries:
    for config in configurations:
        rag_chain, memory = create_rag_chain(
            vectorstore=faiss_memory,
            llm=llm,
            **config
        )
        result = rag_chain({"question": query, "chat_history": []})

        eval_data.append({
            "question": query,
            "answer": result["answer"],
            "contexts": [doc.page_content for doc in result["source_documents"]],
            "configuration": str(config),
            "discriminator_eval": hf_discriminator_eval(query, result["answer"])
        })

# Convert eval_data to the format expected by RAGAS
eval_dataset = Dataset.from_list([
    {
        "question": row["question"],
        "answer": row["answer"],
        "contexts": row["contexts"],
        "configuration": row.get("configuration", "manual_eval"),
        "discriminator_eval": row.get("discriminator_eval", "")
    } for row in eval_data
])


# Run RAGAS evaluation
ragas_scores = evaluate(dataset=eval_dataset, metrics=metrics, llm=wrapped_llm)

# Combine RAGAS results with metadata
results = []
for i, row in enumerate(eval_data):
    results.append({
        "query": row["question"],
        "configuration": row.get("configuration", "N/A"),
        "answer": row["answer"],
        "faithfulness": ragas_scores["faithfulness"][i],
        "relevance": ragas_scores["answer_relevancy"][i],
        "context_precision": ragas_scores["context_precision"][i],
        "discriminator_eval": row["discriminator_eval"]
    })

# Generate DataFrame
results_df = pd.DataFrame(results)

# Display DataFrame
display(results_df)

HfHubHTTPError: 402 Client Error: Payment Required for url: https://router.huggingface.co/hf-inference/models/mistralai/Mistral-7B-Instruct-v0.2 (Request ID: Root=1-67fc22f2-70de8ff36cf19ed05e496a48;8dd2ed09-f643-4d90-8d64-f0b3c0de3172)

You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits.