In [1]:
!pip install --quiet haystack-ai chroma-haystack
!pip install --quiet --upgrade huggingface_hub
!pip install --quiet python-docx

In [2]:
!apt-get install -y tesseract-ocr
!pip install pytesseract opencv-python pillow
!apt-get install -y tesseract-ocr-ara

!pip install pdf2image
!apt-get install -y poppler-utils  # required for pdf2image

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr-ara is already the newest version (1:4.00~git30-7274cfa-1.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.9).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [3]:
from haystack import component
from haystack import Pipeline, Document
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.utils import Secret

from tqdm.notebook import tqdm

In [4]:
import os
import cv2
import numpy as np
from typing import List

from haystack import Document, component
from pdf2image import convert_from_path
import pytesseract
from PIL import Image


@component
class PdfToSingleDocumentConverter:
    """
    A component that converts multiple PDF files to Document objects using OCR.

    Every PDF is rendered to page images, each page is binarised and passed
    through Tesseract, and the cleaned page texts are joined into a single
    Document per PDF.
    """

    @staticmethod
    def validate(pdf_path: str):
        """
        Validate that the PDF file exists and that Tesseract can be invoked.

        Args:
            pdf_path: Path of the PDF file to check.

        Raises:
            ValueError: If `pdf_path` does not exist.
            RuntimeError: If querying the Tesseract version fails.
        """
        if not os.path.exists(pdf_path):
            raise ValueError(f"PDF file '{pdf_path}' does not exist.")

        # Check if Tesseract is available
        try:
            pytesseract.get_tesseract_version()
        except Exception as e:
            raise RuntimeError(
                "Tesseract OCR is not properly installed or configured. "
                "Please install it and ensure it's in your PATH. "
                "For Arabic support, also install the Arabic language pack: tesseract-ocr-ara"
            ) from e

    @staticmethod
    def preprocess_image(pil_image):
        """
        Binarise a page image to improve OCR quality.

        Args:
            pil_image: Page image; anything `np.array` can convert (the pages
                produced in `run` are PIL images).

        Returns:
            A single-channel array thresholded with Otsu's method.
        """
        image = np.array(pil_image)

        if image.ndim == 2:
            # Already single-channel: cvtColor would reject it, so use it as is.
            gray = image
        else:
            # Arrays built from PIL images are channel-ordered RGB, not OpenCV's
            # native BGR; the BGR code would swap the red and blue luma weights.
            gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

        # With THRESH_OTSU the threshold is derived from the image itself and the
        # explicit threshold argument is ignored, so the conventional 0 is passed.
        _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        return thresh

    @staticmethod
    def pdf_preprocess(text: str) -> str:
        """
        Clean and preprocess extracted text.

        Carriage returns are removed, paragraphs (separated by blank lines) are
        collapsed to single lines, empty paragraphs are dropped, and the
        remaining paragraphs are re-joined with one blank line between them.
        """
        paragraphs = text.replace('\r', '').split('\n\n')
        paragraphs = [paragraph.replace('\n', ' ').strip() for paragraph in paragraphs]
        return '\n\n'.join(paragraph for paragraph in paragraphs if paragraph != '')

    @component.output_types(documents=List[Document])
    def run(self, pdf_paths: List[str], dpi: int = 300, language: str = 'ara'):
        """
        Convert multiple PDF files to Document objects.

        Args:
            pdf_paths: List of paths to PDF files to process.
            dpi: DPI for image conversion (higher = better quality but slower).
            language: Tesseract language code (default 'ara' for Arabic).

        Returns:
            A dictionary with a 'documents' key containing one Document per PDF.

        Raises:
            ValueError: If one of the PDF paths does not exist.
            RuntimeError: If Tesseract cannot be invoked.
        """
        all_documents = []

        for pdf_path in pdf_paths:
            self.validate(pdf_path)

            # Convert PDF to images (one image per page)
            pages = convert_from_path(pdf_path, dpi=dpi)

            # OCR each page on its binarised image, then clean the raw text
            page_texts = []
            for page in pages:
                binarised = Image.fromarray(self.preprocess_image(page))
                raw_text = pytesseract.image_to_string(binarised, lang=language)
                page_texts.append(self.pdf_preprocess(raw_text))

            # Create a single Document object for the entire PDF
            doc = Document(
                content="\n\n".join(page_texts),
                meta={
                    "file_path": pdf_path,
                    "total_pages": len(page_texts),
                    "language": language,
                    "is_full_document": True
                }
            )

            all_documents.append(doc)

        return {"documents": all_documents}

In [5]:
import os
from typing import List

from haystack import Document, component
import docx


@component
class WordToDocumentConverter:
    """
    A component that converts multiple Word documents to Document objects.

    Paragraph text is collected first, followed by the text of every table
    cell; the pieces are joined with blank lines into one Document per file.
    """

    @staticmethod
    def validate(word_path: str):
        """
        Validate that the Word file exists and can be opened as a .docx package.

        Args:
            word_path: Path of the Word file to check.

        Raises:
            ValueError: If the file does not exist, has an unsupported extension,
                or is a regular file that is not a zip-based Open XML package
                (for example a legacy binary .doc).
        """
        import zipfile  # local import: only this check needs it

        if not os.path.exists(word_path):
            raise ValueError(f"Word file '{word_path}' does not exist.")

        if not word_path.lower().endswith(('.docx', '.doc')):
            raise ValueError("Only DOCX and DOC files are supported.")

        # python-docx opens Open XML packages only. A legacy binary .doc would
        # otherwise fail later with an opaque "Package not found" error, so
        # report the real cause here.
        if os.path.isfile(word_path) and not zipfile.is_zipfile(word_path):
            raise ValueError(
                f"Word file '{word_path}' is not an Open XML (.docx) package. "
                "Legacy binary .doc files must be converted to .docx before indexing."
            )

    @component.output_types(documents=List[Document])
    def run(self, word_paths: List[str]):
        """
        Convert multiple Word documents to Document objects.

        Args:
            word_paths: List of paths to Word documents to process.

        Returns:
            A dictionary with a 'documents' key containing Document objects for all Word files.

        Raises:
            ValueError: If a path fails `validate`.
        """
        all_documents = []

        for word_path in word_paths:
            self.validate(word_path)

            # Extract text from Word document
            document = docx.Document(word_path)

            # Body paragraphs first ...
            text_parts = [para.text for para in document.paragraphs]

            # ... then every table cell, row by row
            for table in document.tables:
                for row in table.rows:
                    text_parts.extend(cell.text for cell in row.cells)

            # Create a Document object
            doc = Document(
                content="\n\n".join(text_parts),
                meta={
                    "file_path": word_path,
                    "file_type": "word",
                    "total_paragraphs": len(document.paragraphs),
                    "total_tables": len(document.tables)
                }
            )

            all_documents.append(doc)

        return {"documents": all_documents}

In [6]:
import os
from typing import List

from haystack import Document, component


@component
class TextFileToDocumentConverter:
    """
    Component that reads plain-text files and wraps each one in a Document.
    """

    @staticmethod
    def validate(text_path: str):
        """
        Check that the path exists and carries a .txt extension.

        Raises:
            ValueError: If the file is missing or is not a TXT file.
        """
        path_exists = os.path.exists(text_path)
        if not path_exists:
            raise ValueError(f"Text file '{text_path}' does not exist.")

        has_txt_suffix = text_path.lower().endswith('.txt')
        if not has_txt_suffix:
            raise ValueError("Only TXT files are supported.")

    @component.output_types(documents=List[Document])
    def run(self, text_paths: List[str], encoding: str = 'utf-8'):
        """
        Read every text file and return one Document per file.

        Args:
            text_paths: Paths of the text files to load.
            encoding: Encoding used to decode each file (default 'utf-8').

        Returns:
            A dictionary whose 'documents' key holds the created Document objects,
            in the same order as `text_paths`.

        Raises:
            ValueError: If a path fails `validate`.
            RuntimeError: If a file cannot be read.
        """

        def load_one(text_path: str) -> Document:
            # Validation happens outside the try block so its ValueError is not
            # re-wrapped as a RuntimeError.
            self.validate(text_path)

            try:
                with open(text_path, 'r', encoding=encoding) as handle:
                    contents = handle.read()
            except Exception as e:
                raise RuntimeError(f"Failed to read text file: {str(e)}") from e

            metadata = {
                "file_path": text_path,
                "file_type": "text",
                "encoding": encoding,
                "character_count": len(contents)
            }
            return Document(content=contents, meta=metadata)

        return {"documents": [load_one(text_path) for text_path in text_paths]}

In [7]:
import os
from typing import List

from haystack import Document, component


@component
class FileTypeDetector:
    """
    Component that sorts file paths into PDF, Word, and text groups.

    The decision is based purely on the (case-insensitive) file extension.

    Example usage:

    ```python
    detector = FileTypeDetector()
    result = detector.run(file_paths=["/path/to/doc1.pdf", "/path/to/doc2.docx"])
    # Returns with paths grouped by type
    ```
    """

    @component.output_types(pdf_paths=List[str], word_paths=List[str], text_paths=List[str])
    def run(self, file_paths: List[str]):
        """
        Route each path to the output list matching its extension.

        Args:
            file_paths: Paths of the files to classify.

        Returns:
            Dictionary with the keys 'pdf_paths', 'word_paths' and 'text_paths',
            each holding the matching paths in input order.

        Raises:
            ValueError: If a file does not exist or has an unsupported extension.
        """
        grouped = {"pdf_paths": [], "word_paths": [], "text_paths": []}

        # Extension -> name of the output list that receives the path
        output_for_extension = {
            '.pdf': "pdf_paths",
            '.docx': "word_paths",
            '.doc': "word_paths",
            '.txt': "text_paths",
        }

        for file_path in file_paths:
            # Existence is checked before the extension, so a missing file is
            # reported as missing even when its format is also unsupported.
            if not os.path.exists(file_path):
                raise ValueError(f"File not found: {file_path}")

            extension = os.path.splitext(file_path)[1].lower()
            output_name = output_for_extension.get(extension)
            if output_name is None:
                raise ValueError(f"Unsupported file format: {extension}. Supported formats are PDF, DOCX, DOC, and TXT.")

            grouped[output_name].append(file_path)

        return grouped

In [8]:
from typing import List, Dict, Any, Optional

from haystack import Document, component


@component
class DocumentMerger:
    """
    A component that merges documents from multiple sources into a single list.

    This component collects documents from PDF, Word, and text converters
    and combines them into a single list for downstream processing.
    """

    @component.output_types(documents=List[Document])
    def run(self, pdf_documents: Optional[List[Document]] = None,
            word_documents: Optional[List[Document]] = None,
            text_documents: Optional[List[Document]] = None):
        """
        Merge documents from different sources.

        The defaults are None rather than shared mutable lists; a missing or
        empty input simply contributes nothing to the result.

        Args:
            pdf_documents: Documents from PDF converter
            word_documents: Documents from Word converter
            text_documents: Documents from text converter

        Returns:
            A dictionary with a 'documents' key holding a new list: PDF documents
            first, then Word documents, then text documents.
        """
        all_documents = []

        for source_documents in (pdf_documents, word_documents, text_documents):
            if source_documents:
                all_documents.extend(source_documents)

        return {"documents": all_documents}

In [9]:
from haystack import Pipeline
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.document_stores.types import DuplicatePolicy
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

# Initialize the Chroma document store, persisted on disk under /content/vectordb.
# NOTE(review): document embeddings are produced by the pipeline's embedder below;
# confirm how the store's 'default' embedding function is expected to be used at query time.
document_store = ChromaDocumentStore(
    embedding_function='default',
    persist_path='/content/vectordb'
)

# Model passed to the SentenceTransformers document embedder
embedder_name = "sayed0am/arabic-english-bge-m3"

# Create the unified indexing pipeline
pipeline = Pipeline()

# Add components for file type detection, per-format conversion, merging and embedding.
# Components must be registered before they can be connected further down.
pipeline.add_component('file_type_detector', FileTypeDetector())
pipeline.add_component('pdf_converter', PdfToSingleDocumentConverter())
pipeline.add_component('word_converter', WordToDocumentConverter())
pipeline.add_component('text_converter', TextFileToDocumentConverter())
pipeline.add_component('document_merger', DocumentMerger())
pipeline.add_component('embedder', SentenceTransformersDocumentEmbedder(model=embedder_name))

# Add standard processing components: cleaning, splitting into chunks of
# 3 sentences with no overlap, and writing to the document store
pipeline.add_component('cleaner', DocumentCleaner(
    remove_empty_lines=True,
    remove_extra_whitespaces=True,
    remove_repeated_substrings=False
))
pipeline.add_component('splitter', DocumentSplitter(
    split_by='sentence',
    split_length=3,
    split_overlap=0
))
pipeline.add_component('writer', DocumentWriter(
    document_store=document_store,
    policy=DuplicatePolicy.SKIP
))

# Route each group of paths from the detector to its matching converter
pipeline.connect('file_type_detector.pdf_paths', 'pdf_converter.pdf_paths')
pipeline.connect('file_type_detector.word_paths', 'word_converter.word_paths')
pipeline.connect('file_type_detector.text_paths', 'text_converter.text_paths')

# Collect the output of all three converters in the document merger
pipeline.connect('pdf_converter.documents', 'document_merger.pdf_documents')
pipeline.connect('word_converter.documents', 'document_merger.word_documents')
pipeline.connect('text_converter.documents', 'document_merger.text_documents')

# Linear tail of the pipeline: merge -> clean -> split -> embed -> write
pipeline.connect('document_merger.documents', 'cleaner.documents')
pipeline.connect('cleaner.documents', 'splitter.documents')
pipeline.connect('splitter.documents', 'embedder.documents')
pipeline.connect('embedder.documents', 'writer.documents')

<haystack.core.pipeline.pipeline.Pipeline object at 0x7d3aacfb7e90>
🚅 Components
  - file_type_detector: FileTypeDetector
  - pdf_converter: PdfToSingleDocumentConverter
  - word_converter: WordToDocumentConverter
  - text_converter: TextFileToDocumentConverter
  - document_merger: DocumentMerger
  - embedder: SentenceTransformersDocumentEmbedder
  - cleaner: DocumentCleaner
  - splitter: DocumentSplitter
  - writer: DocumentWriter
🛤️ Connections
  - file_type_detector.pdf_paths -> pdf_converter.pdf_paths (List[str])
  - file_type_detector.word_paths -> word_converter.word_paths (List[str])
  - file_type_detector.text_paths -> text_converter.text_paths (List[str])
  - pdf_converter.documents -> document_merger.pdf_documents (List[Document])
  - word_converter.documents -> document_merger.word_documents (List[Document])
  - text_converter.documents -> document_merger.text_documents (List[Document])
  - document_merger.documents -> cleaner.documents (List[Document])
  - embedder.document

In [16]:
# Index a mixed batch of files: one PDF and one plain-text file.
# include_outputs_from additionally returns the converters' outputs in the result.
paths_list = ['/content/pdddf.pdf','/content/tttext.txt']
indexing_results = pipeline.run({
    'file_type_detector': {'file_paths': paths_list},
}, include_outputs_from={'pdf_converter', 'word_converter', 'text_converter'})

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


In [11]:
# List which components have an entry in the indexing results
indexing_results.keys()

dict_keys(['pdf_converter', 'text_converter', 'word_converter', 'writer'])

In [17]:
# Archive the persisted vector store as a zip file.
# Double-check zip_file_name before saving.

import shutil
import os

# Define paths
doc_store_path = '/content/vectordb'  # Change this to your actual path
zip_file_name = 'vectordb-aren-sayed0am-testing.zip'
drive_path = '/content/'  # Destination folder; point it at a folder in Drive to store the archive there

# make_archive appends ".zip" itself, so drop only a trailing ".zip"
# (str.replace would also strip occurrences in the middle of the name).
if zip_file_name.endswith('.zip'):
    archive_stem = zip_file_name[:-len('.zip')]
else:
    archive_stem = zip_file_name

# Zip the folder; make_archive returns the path of the archive it created
archive_path = shutil.make_archive(os.path.join(drive_path, archive_stem), 'zip', doc_store_path)

# Report the real archive location instead of a hand-concatenated path
print(f"Zipped document store saved to {archive_path}")

Zipped document store saved to Google Drive at /content/vectordb-aren-sayed0am-testing.zip
