In [1]:
# mount the google drive to use the saved database instead of recreating it on every run.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install haystack-ai chroma-haystack langchain-huggingface ragas



# RAG pipeline with Evaluation.
when evaluating RAG pipeline there are three main types of evaluation:
1. evaluating the retriever
2. evaluating the generator
3. evaluating the system as a whole


### installing dataset for evaluation

going to use Arabic Reading Comprehension Dataset (ARCD):


In [2]:
import gdown
gdown.download(id="1zncZ2bHLt5GEsLs0jpmF9q5NI9DjP41i", output='/content/arcd.json')

Downloading...
From: https://drive.google.com/uc?id=1zncZ2bHLt5GEsLs0jpmF9q5NI9DjP41i
To: /content/arcd.json
100%|██████████| 2.43M/2.43M [00:00<00:00, 19.8MB/s]


'/content/arcd.json'

In [3]:
import json

try:
    with open("/content/arcd.json", "r", encoding="utf-8") as f:
        data = json.load(f)
except json.JSONDecodeError as e:
    print(f"JSON Decode Error: {e}")

In [4]:
print(data['data'][0:2])

[{'title': 'جمال خاشقجي', 'paragraphs': [{'context': 'جمال أحمد حمزة خاشقجي (13 أكتوبر 1958، المدينة المنورة - 2 أكتوبر 2018)، صحفي وإعلامي سعودي، رأس عدّة مناصب لعدد من الصحف في السعودية، وتقلّد منصب مستشار، كما أنّه مدير عام قناة العرب الإخبارية سابقًا.', 'qas': [{'question': ' - من هو جمال أحمد حمزة خاشقجي؟ ', 'id': '969331847966', 'answers': [{'text': 'صحفي وإعلامي', 'answer_start': 73}]}, {'question': ' - متى ولد جمال أحمد حمزة خاشقجي وتوفي؟ ال', 'id': '115150665555', 'answers': [{'text': 'حمزة خاشقجي (13 أكتوبر 1958، المدينة المنورة - 2 أكتوبر 2018)،', 'answer_start': 10}]}, {'question': ' - في أي مدينة ولد جمال أحمد حمزة خاشقجي؟ ال', 'id': '74212080718', 'answers': [{'text': 'المدينة المنورة', 'answer_start': 39}]}]}, {'context': 'جمال أحمد حمزة خاشقجي (13 أكتوبر 1958، المدينة المنورة - 2 أكتوبر 2018)، صحفي وإعلامي سعودي، رأس عدّة مناصب لعدد من الصحف في السعودية، وتقلّد منصب مستشار، كما أنّه مدير عام قناة العرب الإخبارية سابقًا. ويكتب عموداً في صحيفة واشنطن بوست منذ 2017، وُصف ف

#### the structure if the ARCD dataset
<
file.json<br>
├── "data"<br>
│   └── [i]<br>
│       ├── "paragraphs"<br>
│       │   └── [j]<br>
│       │       ├── "context": "paragraph text"<br>
│       │       └── "qas"<br>
│       │           └── [k]<br>
│       │               ├── "answers"<br>
│       │               │   └── [l]<br>
│       │               │       ├── "answer_start": N<br>
│       │               │       └── "text": "answer"<br>
│       │               ├── "id": "<uuid>"<br>
│       │               └── "question": "paragraph question?"<br>
│       └── "title": "document id"<br>
└── "version": 1.1<br>

let's extract the contexts to files

### building the Pipeline

In [3]:
!pip install --upgrade huggingface_hub



In [11]:
# necessary to run models.
from google.colab import userdata
HF_API_TOKEN = userdata.get('HF_API_TOKEN')
print("okay")

okay


In [14]:
from haystack import component
from haystack import Pipeline, Document
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.utils import Secret

from tqdm.notebook import tqdm

In [8]:
from chromadb.utils import embedding_functions

document_store = ChromaDocumentStore(collection_name='arcd_evaluation', persist_path='/content/vectordb')

indexing pipeline

In [9]:
import os
import json
import re

from typing import List, Dict, Any


@component
class ArcdJsonInjestor:
    @classmethod
    def validate(cls, input_file: str):
        if not os.path.exists(input_file):
            raise ValueError(f"File '{input_file}' does not exist.")

    @staticmethod
    def sanitize_name(name: str) -> str:
      return re.sub(r'[<>:"/\\|?*\x00-\x1F]', '_', name)

    @component.output_types(documents=List[Document], qacs=List[Dict[str, Any]])
    def run(self, input_file: str):
        documents = []
        questions = []

        with open(input_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        for doc_data in data.get("data", []):
            title = doc_data.get("title", "untitled")
            sanitized_title = self.sanitize_name(title)

            for para in doc_data.get("paragraphs", []):
                context = para.get("context", "").strip()
                # Create Document object
                doc = Document(
                    content=context,
                    meta={"title": sanitized_title}
                )
                documents.append(doc)

                # Process associated questions
                para_questions = para.get("qas", [])
                for qas in para_questions:
                    questions.append({
                        "question": qas.get("question", ""),
                        "ground_truth_context": doc,
                        "ground_truth_answer": qas.get('answers')[0].get('text')
                    })

        output = {
            "documents": documents,
            "qacs": questions
        }
        return output

In [10]:
pipeline = Pipeline()
pipeline.add_component('injestor', ArcdJsonInjestor())
# pipeline.add_component('cleaner', DocumentCleaner(remove_empty_lines=True, remove_extra_whitespaces=True, remove_repeated_substrings=False))
pipeline.add_component('embedder',SentenceTransformersDocumentEmbedder(model="mhaseeb1604/bge-m3-law",token=Secret.from_token(HF_API_TOKEN)))
# pipeline.add_component('splitter', DocumentSplitter(split_by='sentence', split_length=3, split_overlap=1))
pipeline.add_component('writer', DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP))

# pipeline.connect('injestor.documents', 'cleaner.documents')
# pipeline.connect('cleaner.documents', 'splitter.documents')
pipeline.connect('injestor.documents', 'embedder.documents') ## do cleaning for both outputs of injestor.
pipeline.connect('embedder.documents', 'writer.documents')



<haystack.core.pipeline.pipeline.Pipeline object at 0x7c9c3102f790>
🚅 Components
  - injestor: ArcdJsonInjestor
  - embedder: SentenceTransformersDocumentEmbedder
  - writer: DocumentWriter
🛤️ Connections
  - injestor.documents -> embedder.documents (List[Document])
  - embedder.documents -> writer.documents (List[Document])

`**did you save the vectordb before?** if no then skip`




In [11]:
import zipfile
import os
# Define paths
# saved vdb names: [vectordb-mhaseeb.zip, ]
download_path = '/content/vectordb-mhaseeb.zip'
extract_path = '/content/vectordb/'  # Where to extract
gdown.download(id="1VZZpKi68PhtZjbWwdoI12eoXTFtadLGT", output=download_path) #vectordb-mhaseeb.zip
# Create directory if it doesn't exist
os.makedirs(extract_path, exist_ok=True)

# Unzip
with zipfile.ZipFile(download_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"Documents extracted to {extract_path}")

Downloading...
From (original): https://drive.google.com/uc?id=1VZZpKi68PhtZjbWwdoI12eoXTFtadLGT
From (redirected): https://drive.google.com/uc?id=1VZZpKi68PhtZjbWwdoI12eoXTFtadLGT&confirm=t&uuid=37660861-a69e-4201-b538-143de9d75fe5
To: /content/vectordb-mhaseeb.zip
100%|██████████| 5.23M/5.23M [00:00<00:00, 29.6MB/s]


Documents extracted to /content/vectordb/


`don't run the next cell if you ran the previous one`

In [None]:
# Run pipeline
indexing_results = pipeline.run({
    'injestor': {'input_file': '/content/arcd.json'},
}, include_outputs_from = {"injestor"})

`**did you save this db before?** if yes then skip`

In [None]:
# storing vectordb in drive! check vectordb name before saving

import shutil
import os

# Define paths
doc_store_path = '/content/vectordb'  # Change this to your actual path
zip_file_name = 'vectordb-mhaseeb.zip'
drive_path = '/content/drive/MyDrive/Colab Notebooks/'  # Or a specific folder in Drive

# Zip the folder
shutil.make_archive(os.path.join(drive_path, zip_file_name.replace('.zip','')), 'zip', doc_store_path)

print(f"Zipped document store saved to Google Drive at {drive_path}{zip_file_name}")

Zipped document store saved to Google Drive at /content/drive/MyDrive/Colab Notebooks/vectordb-mhaseeb.zip


In [6]:
# re-reading document store
from haystack_integrations.document_stores.chroma import ChromaDocumentStore

document_store = ChromaDocumentStore(collection_name='arcd_evaluation', persist_path="/content/vectordb")

In [7]:
from haystack.components.builders import ChatPromptBuilder, PromptBuilder
from haystack.dataclasses import ChatMessage
from haystack.utils import Secret
from haystack.components.generators.chat import HuggingFaceAPIChatGenerator, HuggingFaceLocalChatGenerator
from haystack.components.generators import HuggingFaceLocalGenerator
from haystack.components.routers import ConditionalRouter
from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever
from haystack.components.retrievers import InMemoryEmbeddingRetriever

from haystack import component
from haystack.components.embedders import SentenceTransformersTextEmbedder


In [8]:
system_prompt = """you are arabic virtual assistant, you answer in arabic."""

main_prompt_template =system_prompt +  """
ROLE AND CONTEXT:
You are a knowledgeable assistant. Your task is to provide accurate and detailed answers to queries using the provided excerpts and references from useful resources to support your answers.

INSTRUCTIONS:
1. Identify the relevant sections of the excerpts provided.
2. If the query cannot be answered given the provided documents, return 'no_answer'
2. Otherwise provide a concise and informative response based relevant sections of the excerpts provided.
3. Ensure your responses are clear and easy to understand.

EXCERPTS:
{% for doc in documents %}
    excerpt: {{ doc.content }}
{% endfor %}

CONSIDERATIONS:
- If you can't give an answer, it's okay to output one single word 'no_answer'

Query: {{query}}
Answer:
"""

fallback_prompt_template = system_prompt + """
User entered a query that cannot be answered with the excerpts provided.
The query was: {{query}}.
Let the user know why the question cannot be answered. Be brief.
"""

In [9]:
model_name='CohereLabs/c4ai-command-r7b-arabic-02-2025'


In [12]:
query_embedder = SentenceTransformersTextEmbedder(model="mhaseeb1604/bge-m3-law",token=Secret.from_token(HF_API_TOKEN))
retriever = ChromaEmbeddingRetriever(document_store=document_store, top_k=3)
template1 = [ChatMessage.from_user(main_prompt_template)]
main_promptbuilder = ChatPromptBuilder(template=template1)
template2 = [ChatMessage.from_user(fallback_prompt_template)]
fallback_promptbuilder = ChatPromptBuilder(template=template2)
# main_llm = HuggingFaceLocalChatGenerator(model="silma-ai/SILMA-Kashif-2B-Instruct-v1.0")
main_llm = HuggingFaceLocalChatGenerator(model="Qwen/Qwen2.5-1.5B-Instruct")
# main_llm = HuggingFaceLocalChatGenerator(model="Qwen/Qwen2-1.5B-Instruct")
# main_llm = HuggingFaceLocalChatGenerator(model="google/gemma-2-2b-it")

fallback_llm = HuggingFaceAPIChatGenerator(
    api_type="serverless_inference_api",
    api_params={"model": model_name},
    token=Secret.from_token(HF_API_TOKEN))

@component
class ChatMessageToTextConverter:
    @component.output_types(replies_text=list[str])
    def run(self, replies: list[ChatMessage], **kwargs):
        # replies: list of ChatMessage objects
        texts = [msg.text for msg in replies]
        return {"replies_text": texts}

@component
class NoOpComponent:
  @component.output_types(query=str)
  def run(self, query: str, **kwargs):
    return {'query':query}

conditional_router = ConditionalRouter([
    {
        "condition": "{{'no_answer' not in replies[0] }}",
        "output": "{{replies}}",
        "output_name": "replies",
        "output_type": list[str],
    },
    {
        "condition": "{{'no_answer' in replies[0] }}",
        "output": "{{query}}",
        "output_name": "go_to_fallback",
        "output_type": str,
    },
])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [15]:
# Setup pipeline
pipeline = Pipeline()
pipeline.add_component('distributer', NoOpComponent())
pipeline.add_component('embedder', query_embedder)
pipeline.add_component('retriever', retriever)
pipeline.add_component('main_promptbuilder', main_promptbuilder)
pipeline.add_component('fallback_promptbuilder', fallback_promptbuilder)
pipeline.add_component('main_llm', main_llm)
pipeline.add_component('fallback_llm', fallback_llm)
pipeline.add_component('conditional_router', conditional_router)
pipeline.add_component('converter',ChatMessageToTextConverter())

pipeline.connect('distributer.query', 'embedder.text')
pipeline.connect('distributer.query', 'main_promptbuilder.query')
pipeline.connect('distributer.query', 'conditional_router.query')

pipeline.connect('embedder.embedding', 'retriever.query_embedding')
pipeline.connect('retriever.documents', 'main_promptbuilder.documents')
pipeline.connect('main_promptbuilder.prompt', 'main_llm.messages')

pipeline.connect('main_llm.replies', 'converter.replies')
pipeline.connect('converter.replies_text', 'conditional_router.replies')

pipeline.connect('conditional_router.go_to_fallback', 'fallback_promptbuilder.query')
pipeline.connect('fallback_promptbuilder.prompt', 'fallback_llm.messages')

<haystack.core.pipeline.pipeline.Pipeline object at 0x7fce15092e50>
🚅 Components
  - distributer: NoOpComponent
  - embedder: SentenceTransformersTextEmbedder
  - retriever: ChromaEmbeddingRetriever
  - main_promptbuilder: ChatPromptBuilder
  - fallback_promptbuilder: ChatPromptBuilder
  - main_llm: HuggingFaceLocalChatGenerator
  - fallback_llm: HuggingFaceAPIChatGenerator
  - conditional_router: ConditionalRouter
  - converter: ChatMessageToTextConverter
🛤️ Connections
  - distributer.query -> embedder.text (str)
  - distributer.query -> main_promptbuilder.query (str)
  - distributer.query -> conditional_router.query (str)
  - embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> main_promptbuilder.documents (List[Document])
  - main_promptbuilder.prompt -> main_llm.messages (List[ChatMessage])
  - fallback_promptbuilder.prompt -> fallback_llm.messages (List[ChatMessage])
  - main_llm.replies -> converter.replies (List[ChatMessage])
  - conditional_

In [33]:
import random  # Added for shuffling

def run(question: str):
  results = pipeline.run({
      'distributer': {'query': question},
      }, include_outputs_from={'retriever'})
  return results

def get_context(results):
  retriever_output = results["retriever"]['documents']
  print('number of retrieved docs: ', len(retriever_output))
  return retriever_output


def get_reply(results):
  response = results.get('conditional_router') or results.get('fallback_llm')
  print('from conditional router:', results.get('conditional_router') is not None)
  print('from fallback llm:', results.get('fallback_llm') is not None)
  reply = response['replies'][0]
  if isinstance(reply, ChatMessage):
    reply = response['replies'][0].text.replace('\n', '')
  else:
    reply = response['replies'][0].replace('\n', '')

  return reply

def get_qacs_lists(qacs, limit=-1) -> dict:
    random.shuffle(qacs)
    questions=[]
    contexts=[]
    answers=[]
    for idx, qac in enumerate(qacs):
      if idx >= limit and limit != -1:
        break
      questions.append(qac.get('question'))
      contexts.append(qac.get('ground_truth_context'))
      answers.append(qac.get('ground_truth_answer'))

    return {"questions":questions, "ground_truth_answers":answers, "ground_truth_contexts":contexts}

In [40]:
res = run('من هو ساسوكي؟')
print(get_reply(res))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

from conditional router: False
from fallback llm: True
عذرًا، لا يمكنني الإجابة على سؤالك "من هو ساسوكي؟" لأن المعلومات المقدمة لا تحتوي على معلومات كافية حول هذا الاسم. إذا كان لديك أي استفسار آخر أو تحتاج إلى معلومات حول موضوع مختلف، فلا تتردد في السؤال!


`Next step build evaluation pipeline`

In [None]:
raw_qacs = {}
try:
  raw_qacs = indexing_results['injestor']['qacs']
except:
  print('exception handled')
  # Initialize the ingestor
  ingestor = ArcdJsonInjestor()

  # Warm up the component (important before running)
  # Run the ingestor on your input (e.g., a file or folder path)
  raw_qacs = ingestor.run('/content/arcd.json')['qacs']

  # result now contains the output of the ingestor
print(raw_qacs)

exception handled
[{'question': ' - من هو جمال أحمد حمزة خاشقجي؟ ', 'ground_truth_context': Document(id=56bc54d8db5f96eb2877bac5b96498da6e900d5da551b9e2300d819e9abeb1b4, content: 'جمال أحمد حمزة خاشقجي (13 أكتوبر 1958، المدينة المنورة - 2 أكتوبر 2018)، صحفي وإعلامي سعودي، رأس عدّ...', meta: {'title': 'جمال خاشقجي'}), 'ground_truth_answer': 'صحفي وإعلامي'}, {'question': ' - متى ولد جمال أحمد حمزة خاشقجي وتوفي؟ ال', 'ground_truth_context': Document(id=56bc54d8db5f96eb2877bac5b96498da6e900d5da551b9e2300d819e9abeb1b4, content: 'جمال أحمد حمزة خاشقجي (13 أكتوبر 1958، المدينة المنورة - 2 أكتوبر 2018)، صحفي وإعلامي سعودي، رأس عدّ...', meta: {'title': 'جمال خاشقجي'}), 'ground_truth_answer': 'حمزة خاشقجي (13 أكتوبر 1958، المدينة المنورة - 2 أكتوبر 2018)،'}, {'question': ' - في أي مدينة ولد جمال أحمد حمزة خاشقجي؟ ال', 'ground_truth_context': Document(id=56bc54d8db5f96eb2877bac5b96498da6e900d5da551b9e2300d819e9abeb1b4, content: 'جمال أحمد حمزة خاشقجي (13 أكتوبر 1958، المدينة المنورة - 2 أكتوبر 20

In [None]:
qacs = get_qacs_lists(raw_qacs,50)
print(len(qacs['questions']))

50


In [None]:
predicted_data = []
for q in tqdm(qacs['questions'], desc="running: "):
  results = run(q)
  predicted_context = get_context(results)
  predicted_answer = get_reply(results)
  predicted_data.append({ "predicted_answer":predicted_answer, "predicted_context":predicted_context})


running:   0%|          | 0/50 [00:00<?, ?it/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.10k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Device set to use cuda:0


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

number of retrieved docs:  3


In [None]:
predicted_data[0]
predicted_data[0].keys()

dict_keys(['predicted_answer', 'predicted_context'])

`save results to json just in case!`

### Evaluation Pipeline

in this section we will evaluate the system using various metrics and results will be stored in `system_eval_results` dict.<hr>
to decide on the metrics that we are going to use. we need first to understand what we want to measure, this is why I will classify metrics to three categories:

1. `DS <-> query`: performance of the retrieval component
2. `DS <-> LLM`: performance of LLM using retrieval docs (factuality)
2. `query <-> LLM`: performance of LLM in answering user's query

**Retrieval Component Assessment:** use:
    - `Recall`: to chech of many correctly retrieved docs were retrieved.
    - `MAP`: statistical evaluation of the retrieval component.
    - `MRR`: statistical evaluation of the ranking of the results.
all need ground truth retrieved contexts, also use:
- `LLM Based Context precision with reference`: evaluates the ranking of the retriever.
- `LLM based Context Recall with reference`: evaluates the embedding used in the retriever.

**Generation Model Component Assessment:** use:
- `Faithfulness`: evaluates how much the model depended on the retrieved docs in its answer.
- `Noise Sensitivity`: evaluates how much the model is affected by the retrieved docs **whole** data in generating the answer.

**System as a Whole Assessment** use:
- `Semantic Answer Similarity`: evaluates the system response to a reference answer.

where reference refer to ground truth answer of the system.

#### eval using haystack

In [None]:
from haystack.components.evaluators import DocumentMRREvaluator, DocumentRecallEvaluator, DocumentMAPEvaluator
from haystack.components.evaluators.faithfulness import FaithfulnessEvaluator
from haystack.components.evaluators.sas_evaluator import SASEvaluator

eval_pipeline = Pipeline()
eval_pipeline.add_component("mrr_evaluator", DocumentMRREvaluator())
eval_pipeline.add_component("map_evaluator", DocumentMAPEvaluator())
eval_pipeline.add_component("recall_evaluator", DocumentRecallEvaluator())
eval_pipeline.add_component("sas_evaluator", SASEvaluator(model="sentence-transformers/all-MiniLM-L6-v2"))

In [None]:
system_eval_results = {}
ground_truth_docs = [[cd] for cd in qacs['ground_truth_contexts']]
retrieved_docs = [item['predicted_context'] for item in predicted_data]
ground_truth_answers = [answer for answer in qacs['ground_truth_answers']]
rag_answers = [item['predicted_answer'] for item in predicted_data]
questions = [question for question in qacs['questions']]

In [None]:
print(len(ground_truth_docs))
print(len(ground_truth_docs[0]))
print(len(retrieved_docs))
print(len(retrieved_docs[0]))

50
1
50
3


In [None]:
eval_results = eval_pipeline.run(
    {
        "mrr_evaluator": {"ground_truth_documents": ground_truth_docs,
	                          "retrieved_documents": retrieved_docs},
        "map_evaluator": {"ground_truth_documents": ground_truth_docs,
	                          "retrieved_documents": retrieved_docs},
        "recall_evaluator": {
            "ground_truth_documents": list(ground_truth_docs),
            "retrieved_documents": list(retrieved_docs),
        },
        "sas_evaluator": {"predicted_answers": rag_answers, "ground_truth_answers": list(ground_truth_answers)},
    }
)

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from haystack.evaluation.eval_run_result import EvaluationRunResult

inputs = {
    "question": list(questions),
    "contexts": list(ground_truth_docs),
    "answer": list(ground_truth_answers),
    "predicted_answer": rag_answers,
}

evaluation_result = EvaluationRunResult(run_name="pubmed_rag_pipeline", inputs=inputs, results=eval_results)
metrics, scores = evaluation_result.aggregated_report()['metrics'],evaluation_result.aggregated_report()['score']
haystack_eval_results = dict(zip(metrics, scores))
print(haystack_eval_results)

{'map_evaluator': 0.8066666666666668, 'mrr_evaluator': 0.8066666666666668, 'recall_evaluator': 0.9, 'sas_evaluator': np.float64(0.528661709278822)}


### save system outputs
it is not possible to run evaluation and the system pipeline in the same notebook. therefore going to seperate system pipeline and evaluation pipeline.

In [None]:
# creating an evaluation dataset
ds = []
for i in range(len(questions)):
    ds.append(
        {
            "user_input": questions[i],
            "retrieved_contexts":[d.content for d in retrieved_docs[i]],
            "response":rag_answers[i],
            "reference":ground_truth_answers[i]
        }
    )
with open('predicted_data.json','w') as jsonfile:
  json.dump(ds,jsonfile)


In [None]:
# creating an evaluation dataset
haystack_evals = {'haystack_evals':haystack_eval_results}
with open('haystack_evaluations.json','w') as jsonfile:
  json.dump(haystack_evals,jsonfile)
