In [1]:
!pip install haystack-ai chroma-haystack
!pip install --upgrade huggingface_hub




# Setting up the data preprocessing pipeline


In [2]:
from haystack import component
from haystack import Pipeline, Document
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.document_stores.chroma import ChromaDocumentStore

## Processing Arabic text from PDFs


### Using Tesseract OCR


In [10]:
!apt-get install -y tesseract-ocr
!pip install pytesseract opencv-python pillow
!apt-get install -y tesseract-ocr-ara

!pip install pdf2image
!apt-get install -y poppler-utils  # required for pdf2image

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr-ara is already the newest version (1:4.00~git30-7274cfa-1.1).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 34 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be 

In [11]:
import pytesseract
# Tell pytesseract which Tesseract executable to invoke.
# NOTE(review): this absolute path presumably matches where apt installs the
# binary on this machine — confirm before running elsewhere.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

In [13]:
from pdf2image import convert_from_path

# Location of the scanned PDF that will be OCR'd.
pdf_path = "/content/132164a.pdf"  # replace with your PDF file path
# Render the PDF with pdf2image at dpi=300; the result is iterated page by
# page and converted to NumPy arrays by the preprocessing step.
pages = convert_from_path(pdf_path, dpi=300)

In [18]:
import cv2
import numpy as np

def preprocess_image(pil_image):
    """Binarize a rendered page so the OCR engine sees clean two-tone text.

    Args:
        pil_image: A PIL image (or array-like) of one page. Colour input is
            expected in RGB channel order, which is what PIL produces.

    Returns:
        The thresholded image as a 2-D array (pixels are 0 or 255), with the
        threshold chosen by Otsu's method.
    """
    image = np.array(pil_image)
    if image.ndim == 2:
        # Already single-channel (e.g. a grayscale page): nothing to convert,
        # and cvtColor would reject it.
        gray = image
    else:
        # Arrays built from PIL images are RGB, not OpenCV's native BGR; the
        # BGR code would swap the red and blue luminance weights.
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    # With THRESH_OTSU the threshold argument is ignored (Otsu computes it),
    # so pass 0 rather than a misleading hand-picked value.
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return thresh

# Binarize every rendered page, preserving page order.
processed_pages = list(map(preprocess_image, pages))


In [19]:
import pytesseract
from PIL import Image

# OCR each binarized page with Tesseract's 'ara' language data, keeping the
# results in page order.
extracted_texts = [
    pytesseract.image_to_string(Image.fromarray(page_array), lang='ara')
    for page_array in processed_pages
]

In [21]:
# Concatenate the per-page OCR output (blank line between pages) and keep a
# UTF-8 copy on disk.
full_text = "\n\n".join(extracted_texts)

with open("extracted_text.txt", mode="w", encoding="utf-8") as text_file:
    text_file.write(full_text)

In [27]:
# From here on `pages` holds the OCR text of each page. This rebinds the name
# that previously held the rendered page images, so the image-preprocessing
# cell cannot be re-run after this one without re-rendering the PDF.
pages = extracted_texts

def pdf_preprocess(text):
    """Normalize OCR text into single-line paragraphs.

    Carriage returns are dropped, the text is cut into paragraphs at blank
    lines (``'\\n\\n'``), line breaks inside a paragraph become spaces, each
    paragraph is stripped, and empty paragraphs are discarded.

    Args:
        text: Raw text of one page.

    Returns:
        The surviving paragraphs joined by a blank line.
    """
    paragraphs = []
    for block in text.replace('\r', '').split('\n\n'):
        flattened = block.replace('\n', ' ').strip()
        if flattened != '':
            paragraphs.append(flattened)
    return '\n\n'.join(paragraphs)
# Normalize the text of every page.
pages = list(map(pdf_preprocess, pages))


In [30]:
# Wrap the text of each page in a Document tagged with its language.
pages_documents = [
    Document(content=page_text, meta={'language': 'arabic'})
    for page_text in pages
]


### Creating the Chroma document store with a suitable embedding model


In [51]:
# NOTE: everything commented out below is a disabled alternative that would
# build a custom embedding function from a multilingual sentence-transformers
# model (mean-pooled token embeddings). It is kept for reference only; the
# only active statement in this cell is the ChromaDocumentStore(...) line.

# from transformers import AutoTokenizer, AutoModel
# import torch

# def mean_pooling(model_output, attention_mask):
#     token_embeddings = model_output[0] #First element of model_output contains all token embeddings
#     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
#     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


# # Load model from HuggingFace Hub
# tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
# model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# def embedding_function(texts):
#     encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

#     # Compute token embeddings
#     with torch.no_grad():
#         model_output = model(**encoded_input)

#     # Perform pooling. In this case, mean pooling.
#     sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

#     return sentence_embeddings.tolist()

# Document store backing both indexing and retrieval; persist_path points the
# collection at /content/vectordb.
# NOTE(review): embedding_function='default' leaves the choice of embedding
# model to Chroma — confirm that model handles Arabic adequately, since the
# disabled code above targeted a multilingual model instead.
document_store = ChromaDocumentStore(collection_name='altalim_aljamee', embedding_function='default', persist_path='/content/vectordb')


In [52]:
# Indexing pipeline: clean each page, split it into 3-sentence chunks that
# overlap by one sentence, and write the chunks to the document store
# (documents that already exist are skipped).
pipeline = Pipeline()

pipeline.add_component(
    'cleaner',
    DocumentCleaner(
        remove_empty_lines=True,
        remove_extra_whitespaces=True,
        remove_repeated_substrings=False,
    ),
)
pipeline.add_component(
    'splitter',
    DocumentSplitter(split_by='sentence', split_length=3, split_overlap=1),
)
pipeline.add_component(
    'writer',
    DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP),
)

# cleaner -> splitter -> writer
pipeline.connect('cleaner.documents', 'splitter.documents')
pipeline.connect('splitter.documents', 'writer.documents')



<haystack.core.pipeline.pipeline.Pipeline object at 0x7d1eb30c2ed0>
🚅 Components
  - cleaner: DocumentCleaner
  - splitter: DocumentSplitter
  - writer: DocumentWriter
🛤️ Connections
  - cleaner.documents -> splitter.documents (List[Document])
  - splitter.documents -> writer.documents (List[Document])

In [53]:
# Feed the page documents into the first stage; the cell output reports how
# many chunks the writer stored.
pipeline.run({'cleaner': {'documents': pages_documents}})



{'writer': {'documents_written': 76}}

## RAG pipeline


In [82]:
from haystack.components.builders import ChatPromptBuilder, PromptBuilder
from haystack.dataclasses import ChatMessage
from haystack.utils import Secret
from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
from haystack.components.generators import HuggingFaceLocalGenerator
from haystack.components.routers import ConditionalRouter
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
from haystack import component

In [42]:
# Shared preamble prepended to both prompt templates; it tells the model to
# answer in Arabic.
system_prompt = """you are arabic virtual assistant, you answer user queries in arabic."""

# Template for the main RAG prompt. Template variables: `documents` (each
# document's `content` is rendered as one excerpt) and `query`. The model is
# instructed to return the literal string 'no_answer' when the excerpts do not
# answer the query; the conditional router keys on that sentinel.
main_prompt_template =system_prompt +  """
ROLE AND CONTEXT:
You are a knowledgeable assistant. Your task is to provide accurate and detailed answers to queries. Use the provided excerpts and references from useful resources to support your answers.

INSTRUCTIONS:
1. Identify the relevant sections of the excerpts provided.
2. Provide a concise and informative response.
3. Ensure your responses are clear and easy to understand.

EXCERPTS:
{% for doc in documents %}
    excerpt: {{ doc.content }}
{% endfor %}

CONSIDERATIONS:
- If the query cannot be answered given the provided documents, return 'no_answer'

Query: {{query}}
Answer:
"""

# Template for the fallback prompt, used when the main answer was 'no_answer'.
# Template variable: `query`.
fallback_prompt_template = system_prompt + """
User entered a query that cannot be answered with the excerpts provided.
The query was: {{query}}.
Let the user know why the question cannot be answered. Be brief.
"""

In [None]:
import os
from getpass import getpass

# Hugging Face model id used by both chat generators.
model_name='CohereLabs/aya-expanse-8b'

# The generator cell reads HF_API_TOKEN, so it must be defined on a fresh
# kernel. Never hardcode the secret in the notebook: take it from the
# HF_API_TOKEN environment variable, or prompt for it without echoing.
HF_API_TOKEN = os.environ.get("HF_API_TOKEN") or getpass("Hugging Face API token: ")

In [94]:
def _make_chat_generator():
    """Build a chat generator for `model_name` on the serverless inference API."""
    return HuggingFaceAPIChatGenerator(
        api_type="serverless_inference_api",
        api_params={"model": model_name},
        token=Secret.from_token(HF_API_TOKEN),
    )


# Text retriever over the document store, limited to the top 5 matches.
retriever = ChromaQueryTextRetriever(document_store=document_store, top_k=5)

# Each prompt template is sent to the model as a single user message.
template1 = [ChatMessage.from_user(main_prompt_template)]
template2 = [ChatMessage.from_user(fallback_prompt_template)]
main_promptbuilder = ChatPromptBuilder(template=template1)
fallback_promptbuilder = ChatPromptBuilder(template=template2)

# Two independent generator instances with identical settings: one per branch.
main_llm = _make_chat_generator()
fallback_llm = _make_chat_generator()

@component
class ChatMessageToTextConverter:
    """Pipeline component that unwraps chat replies into their plain text."""

    @component.output_types(replies_text=list[str])
    def run(self, replies: list[ChatMessage], **kwargs):
        """Collect the ``text`` of every message in ``replies``.

        Args:
            replies: Chat messages to unwrap.
            **kwargs: Accepted but not used.

        Returns:
            A dict with key ``"replies_text"`` holding one entry per input
            message, in the same order.
        """
        replies_text = []
        for message in replies:
            replies_text.append(message.text)
        return {"replies_text": replies_text}

# Route on the main LLM's answer: pass it through when it is a real answer,
# otherwise hand the query to the fallback branch.
# `replies` reaches the router as plain strings (the converter component
# already unwrapped the ChatMessage objects), so the 'no_answer' sentinel must
# be searched in replies[0] itself. Looking it up in replies[0].text yields an
# undefined value in Jinja, for which the `in` test is always false — that
# made the first route always match and the fallback route unreachable.
conditional_router = ConditionalRouter([
    {
        "condition": "{{'no_answer' not in replies[0] }}",
        "output": "{{replies}}",
        "output_name": "replies",
        "output_type": list[str],
    },
    {
        "condition": "{{'no_answer' in replies[0] }}",
        "output": "{{query}}",
        "output_name": "go_to_fallback",
        "output_type": str,
    },
])



In [95]:
# Assemble the RAG graph. Note: this rebinds `pipeline`, replacing the
# indexing pipeline object built earlier.
pipeline = Pipeline()

# Register every node under the name the connections below refer to.
for node_name, node in [
    ('retriever', retriever),
    ('main_promptbuilder', main_promptbuilder),
    ('fallback_promptbuilder', fallback_promptbuilder),
    ('main_llm', main_llm),
    ('fallback_llm', fallback_llm),
    ('conditional_router', conditional_router),
    ('converter', ChatMessageToTextConverter()),
]:
    pipeline.add_component(node_name, node)

# Main path: retrieved documents -> prompt -> LLM.
pipeline.connect('retriever.documents', 'main_promptbuilder.documents')
pipeline.connect('main_promptbuilder.prompt', 'main_llm.messages')

# The LLM replies are converted to plain text before the router inspects them.
pipeline.connect('main_llm.replies', 'converter.replies')
pipeline.connect('converter.replies_text', 'conditional_router.replies')

# Fallback path: the router forwards the query to the fallback prompt and LLM.
pipeline.connect('conditional_router.go_to_fallback', 'fallback_promptbuilder.query')
pipeline.connect('fallback_promptbuilder.prompt', 'fallback_llm.messages')

<haystack.core.pipeline.pipeline.Pipeline object at 0x7d1ea83060d0>
🚅 Components
  - retriever: ChromaQueryTextRetriever
  - main_promptbuilder: ChatPromptBuilder
  - fallback_promptbuilder: ChatPromptBuilder
  - main_llm: HuggingFaceAPIChatGenerator
  - fallback_llm: HuggingFaceAPIChatGenerator
  - conditional_router: ConditionalRouter
  - converter: ChatMessageToTextConverter
🛤️ Connections
  - retriever.documents -> main_promptbuilder.documents (List[Document])
  - main_promptbuilder.prompt -> main_llm.messages (List[ChatMessage])
  - fallback_promptbuilder.prompt -> fallback_llm.messages (List[ChatMessage])
  - main_llm.replies -> converter.replies (List[ChatMessage])
  - conditional_router.go_to_fallback -> fallback_promptbuilder.query (str)
  - converter.replies_text -> conditional_router.replies (list[str])

In [97]:
questions = [
    'ما هو التعليم الجامع؟.',
    ]


for q in questions:
    # The same query feeds the retriever, the main prompt and the router
    # (which forwards it to the fallback prompt when needed).
    results = pipeline.run({
        'retriever': {'query': q},
        'main_promptbuilder': {'query': q},
        'conditional_router': {'query': q},
    })
    print(results)
    # The answer is under 'conditional_router' when the main branch produced
    # it, otherwise under 'fallback_llm'.
    response = results.get('conditional_router') or results.get('fallback_llm')
    first_reply = response['replies'][0]
    # The router emits plain strings, whereas the fallback generator emits
    # ChatMessage objects, which expose `.text` but have no `.replace`.
    reply_text = first_reply if isinstance(first_reply, str) else (first_reply.text or '')
    # Flatten to one line by collapsing whitespace; simply deleting '\n' would
    # glue the last word of a line to the first word of the next.
    reply = ' '.join(reply_text.split())
    print(f'Question: {q}')
    print(f'Response: {reply}')
    print('-------------------------------------------------\n')

{'conditional_router': {'replies': ['التعليم الجامع هو مفهوم يشير إلى توفير تعليم فعال وشامل لجميع الطلاب، بما في ذلك أولئك الذين لديهم احتياجات خاصة أو خلفيات اجتماعية واقتصادية مختلفة. ويهدف إلى ضمان تكافؤ الفرص التعليمية للجميع.\n\nوبحسب المقتطفات المقدمة، فإن تحقيق التعليم الجامع يتطلب عدة عوامل، مثل:\n- إعداد منهج دراسي شامل يلبي احتياجات جميع الطلاب.\n- توحيد إدارة التعليم العادي وتعليم ذوي الاحتياجات الخاصة.\n- تخصيص موارد مالية وبشرية وفكرية كافية، حيث أن بلدانًا عديدة قد وضعت برامج لتخصيص أموال محددة لمجالات تسد الاحتياجات الاجتماعية والاقتصادية.\n- المدارس الجامعة الفعالة التي توفر تعليمًا جيدًا لجميع تلاميذها تعتبر وسائل أنجع اقتصادياً لتوفير التعليم للجميع.']}}
Question: ما هو التعليم الجامع؟.
Response: التعليم الجامع هو مفهوم يشير إلى توفير تعليم فعال وشامل لجميع الطلاب، بما في ذلك أولئك الذين لديهم احتياجات خاصة أو خلفيات اجتماعية واقتصادية مختلفة. ويهدف إلى ضمان تكافؤ الفرص التعليمية للجميع.وبحسب المقتطفات المقدمة، فإن تحقيق التعليم الجامع يتطلب عدة عوامل، مثل:- إعداد من