## Imports

In [1]:
import os
import re
import json
import pickle
import numpy as np
import pandas as pd
import torch
import faiss
import time
import yaml
import re
from pathlib import Path
from abc import ABC, abstractmethod
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore import InMemoryDocstore
from langchain.llms import Ollama
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.llm import LLMChain
from langchain_core.prompts import ChatPromptTemplate


## Abstract classes

### Preprocessing class

In [2]:
from abc import ABC, abstractmethod

class BasePreprocessor(ABC):
    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=200,
            chunk_overlap=50, 
            length_function=lambda x: len(x.split()),
            separators=["\n\n\n", "\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""],
            keep_separator=False,
            add_start_index=True,
            strip_whitespace=True
        )

    @abstractmethod
    def load_and_preprocess_data(self, file_path):
        pass

    @abstractmethod
    def process_documents_from_files(self, file_paths):
        pass


    def clean_text(self, text):
        return re.sub(r'\s+', ' ', re.sub(r'\n{3,}', '\n\n', str(text))).strip()



    def chunk_documents(self, individual_documents):
        chunked_docs = []
        for doc in individual_documents:
            chunks = self.text_splitter.split_text(doc.page_content)
            for i, chunk in enumerate(chunks):
                chunked_docs.append(
                    Document(
                        page_content=chunk,
                        metadata={
                            "pdf_id": doc.metadata["pdf_id"],
                            "chunk_id": i
                        }
                    )
                )
        print(f"✅ Total Chunks: {len(chunked_docs)}")
        return chunked_docs


In [3]:
class JSONPreprocessor(BasePreprocessor):
    def load_and_preprocess_data(self, file_path):
        with open(file_path, 'r') as f:
            raw_data = json.load(f)
        clean_texts = [self.clean_text(entry) for entry in raw_data if isinstance(entry, str)]
        return "\n".join(clean_texts)
    def process_documents_from_files(self, file_paths):
        documents = []

        for i, file_path in enumerate(file_paths):
            text = self.load_and_preprocess_data(file_path).strip()
            documents.append(
                Document(page_content=text, metadata={"pdf_id": i})
            )

        return documents


### Embeddings Abstract class

In [4]:
class Embedder(ABC): 
    def __init__(self, model_name, batch_size):
        self.model_name = model_name
        self.batch_size = batch_size
        
        self.device = (
            'cuda' if torch.cuda.is_available()
            else 'mps' if torch.backends.mps.is_available()
            else 'cpu'
        )
        self.embedding_model = HuggingFaceEmbeddings(model_name=model_name,model_kwargs={'device': self.device},encode_kwargs={'normalize_embeddings': True},multi_process=True,
                                                     show_progress=True,cache_folder='./embedder_model_cache')

    @abstractmethod
    def embed_documents(self, documents):
        pass

    @abstractmethod
    def batch_embed(self, texts, batch_size=None): 
        pass

class MultilingualEmbedder(Embedder): 
    def __init__(self, model_name, batch_size):
        super().__init__(model_name, batch_size)

    def embed_documents(self, documents):
        return self.batch_embed(documents, batch_size=self.batch_size)

    def batch_embed(self, texts, batch_size=None):
        if batch_size is None:
            batch_size = self.batch_size
        
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            batch_embeddings = self.embedding_model.embed_documents(batch)
            embeddings.extend(batch_embeddings)
        
        return np.array(embeddings, dtype=np.float32)

### Faiss Abstract class

In [5]:
class VectorStoreBase(ABC):
    @abstractmethod
    def create_vector_store(self, documents, embedder_model):
        pass
    
    @abstractmethod
    def get_relevant_documents(self, query, top_k=5):
        pass
    
    @abstractmethod
    def save_index(self, file_path):
        pass
    
    @abstractmethod
    def load_index(self, file_path):
        pass



In [6]:
class FAISSBasic(VectorStoreBase):
    def __init__(self, embedder_model=None):
        self.index = None
        self.chunks_dict = None
        self.dimension = None
        self.total_vectors = 0
        self.index_type = "IndexFlatIP"
        self.embedder_model = embedder_model
    
    def create_vector_store(self, documents, embedder_model=None):
        """Create vector store from documents"""
        if embedder_model:
            self.embedder_model = embedder_model
        
        if not self.embedder_model:
            raise ValueError("Embedder model is required")
        
        texts = [doc.page_content for doc in documents]
        embeddings = self.embedder_model.batch_embed(texts)
        embeddings = np.array(embeddings).astype("float32")
        
        # Ensure embeddings are 2D
        if embeddings.ndim == 1:
            embeddings = embeddings.reshape(1, -1)
        
        self.dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatIP(self.dimension)
        self.index.add(embeddings)
        
        # Store text chunks with their indices
        self.chunks_dict = {i: text for i, text in enumerate(texts)}
        self.total_vectors = self.index.ntotal
        
        print(f"[FAISS] Created index with {self.total_vectors} vectors of dim {self.dimension}")
        return self
    
    def get_relevant_documents(self, query, top_k=5):
        """Main retriever function - returns LangChain Document objects"""
        if self.index is None:
            raise ValueError("Index not created. Call create_vector_store() first.")
        
        if not self.embedder_model:
            raise ValueError("Embedder model not set")
        
        # Get query embedding
        if isinstance(query, str):
            query_embedding = self.embedder_model.batch_embed([query])
            if isinstance(query_embedding, list) and len(query_embedding) > 0:
                query_embedding = query_embedding[0]
            elif isinstance(query_embedding, np.ndarray) and query_embedding.ndim > 1:
                query_embedding = query_embedding[0]
        else:
            query_embedding = self.embedder_model.batch_embed(query)
        
        # Search and format results
        results = self._search_chunks(query_embedding, top_k)
        
        return [
            Document(page_content=res['text'], metadata={"similarity": res['similarity']})
            for res in results
        ]
    
    def _search_chunks(self, query_embedding, top_k=5):
        """Internal search function - returns raw results"""
        # Ensure query_embedding is properly shaped
        query_embedding = np.array(query_embedding).astype("float32")
        
        # Handle different input shapes
        if query_embedding.ndim == 1:
            query_embedding = query_embedding.reshape(1, -1)
        elif query_embedding.ndim > 2:
            query_embedding = query_embedding.reshape(1, -1)
        
        print(f"[DEBUG] Query embedding final shape: {query_embedding.shape}")
        print(f"[DEBUG] Index dimension: {self.dimension}")
        
        # Verify dimensions match
        if query_embedding.shape[1] != self.dimension:
            raise ValueError(f"Query embedding dimension {query_embedding.shape[1]} doesn't match index dimension {self.dimension}")
        
        # Search FAISS index
        distances, indices = self.index.search(query_embedding, top_k)
        
        # Format results
        formatted = []
        for i in range(top_k):
            faiss_idx = indices[0][i]
            if faiss_idx != -1 and faiss_idx < len(self.chunks_dict):
                distance = distances[0][i]
                formatted.append({
                    'chunk_id': faiss_idx,
                    'text': self.chunks_dict[faiss_idx],
                    'distance': distance,
                    'similarity': float(distance) 
                })
        
        return formatted
    
    def search_raw(self, query_embedding, top_k=5):
        """Search with raw embedding input - useful for advanced use cases"""
        return self._search_chunks(query_embedding, top_k)
    
    def save_index(self, file_path):
        """Save both FAISS index and metadata"""
        if self.index is None:
            raise ValueError("No index to save")
        
        # Save FAISS index
        faiss.write_index(self.index, f"{file_path}.faiss")
        
        # Save metadata
        metadata = {
            'chunks_dict': self.chunks_dict,
            'dimension': self.dimension,
            'total_vectors': self.total_vectors,
            'index_type': self.index_type
        }
        
        with open(f"{file_path}_metadata.pkl", 'wb') as f:
            pickle.dump(metadata, f)
        
        print(f"[FAISS] Index and metadata saved to {file_path}")
    
    def load_index(self, file_path, embedder_model=None):
        """Load both FAISS index and metadata"""
        if not os.path.exists(f"{file_path}.faiss"):
            raise FileNotFoundError(f"Index file {file_path}.faiss not found")
        
        if not os.path.exists(f"{file_path}_metadata.pkl"):
            raise FileNotFoundError(f"Metadata file {file_path}_metadata.pkl not found")
        
        # Load FAISS index
        self.index = faiss.read_index(f"{file_path}.faiss")
        
        # Load metadata
        with open(f"{file_path}_metadata.pkl", 'rb') as f:
            metadata = pickle.load(f)
        
        self.chunks_dict = metadata['chunks_dict']
        self.dimension = metadata['dimension']
        self.total_vectors = metadata['total_vectors']
        self.index_type = metadata['index_type']
        
        # Set embedder model if provided
        if embedder_model:
            self.embedder_model = embedder_model
        
        print(f"[FAISS] Index loaded: {self.total_vectors} vectors, dim {self.dimension}")
        return self
    
    def set_embedder_model(self, embedder_model):
        """Set or update the embedder model"""
        self.embedder_model = embedder_model
        return self
    
    def get_stats(self):
        """Get index statistics"""
        return {
            'total_vectors': self.total_vectors,
            'dimension': self.dimension,
            'index_type': self.index_type,
            'has_embedder': self.embedder_model is not None
        }



In [7]:
class FAISSImproved(VectorStoreBase):
    def __init__(self, embedder_model=None):
        self.index = None
        self.chunks_dict = None
        self.dimension = None
        self.total_vectors = 0
        self.index_type = "IndexFlatIP"
        self.embedder_model = embedder_model
        # New attributes for enhanced functionality
        self.docstore = None
        self.index_to_docstore_id = None
        self.documents = None  # Store original Document objects
    
    def create_vector_store(self, documents, embedder_model=None):
        """Create vector store from documents"""
        if embedder_model:
            self.embedder_model = embedder_model
        
        if not self.embedder_model:
            raise ValueError("Embedder model is required")
        
        texts = [doc.page_content for doc in documents]
        embeddings = self.embedder_model.batch_embed(texts)
        embeddings = np.array(embeddings).astype("float32")
        
        # Ensure embeddings are 2D
        if embeddings.ndim == 1:
            embeddings = embeddings.reshape(1, -1)
        
        self.dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatIP(self.dimension)
        self.index.add(embeddings)
        
        # Store text chunks with their indices
        self.chunks_dict = {i: text for i, text in enumerate(texts)}
        self.total_vectors = self.index.ntotal
        
        print(f"[FAISS] Created index with {self.total_vectors} vectors of dim {self.dimension}")
        return self
    
    def create_vectorstore(self, docs, normalize_embeddings=True):
        """
        Create a FAISS vector store from a list of Document objects.
        Each document should have metadata like pdf_id, chunk_id, etc.
        
        Args:
            docs: List of Document objects
            normalize_embeddings: Whether to normalize embeddings for cosine similarity
        
        Returns:
            self: Returns the FAISS instance for method chaining
        """
        if not self.embedder_model:
            raise ValueError("Embedder model is required. Set it during initialization or call set_embedder_model()")
        
        # Extract texts from Document objects
        texts = [doc.page_content for doc in docs]
        
        # Generate embeddings
        embeddings = self.embedder_model.batch_embed(texts)
        embeddings = np.array(embeddings).astype("float32")
        
        # Ensure embeddings are 2D
        if embeddings.ndim == 1:
            embeddings = embeddings.reshape(1, -1)
        
        # Initialize FAISS Index
        self.dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatIP(self.dimension)
        
        # Normalize embeddings for cosine similarity if requested
        if normalize_embeddings:
            faiss.normalize_L2(embeddings)
        
        self.index.add(embeddings)
        
        # Store original Document objects and create mappings
        self.documents = docs
        self.docstore = {str(i): doc for i, doc in enumerate(docs)}
        self.index_to_docstore_id = {i: str(i) for i in range(len(docs))}
        
        # Also maintain backward compatibility with chunks_dict
        self.chunks_dict = {i: doc.page_content for i, doc in enumerate(docs)}
        self.total_vectors = self.index.ntotal
        
        print(f"[FAISS] Created vectorstore with {self.total_vectors} documents of dim {self.dimension}")
        print(f"[FAISS] Normalization: {'enabled' if normalize_embeddings else 'disabled'}")
        
        return self
    
    def get_relevant_documents(self, query, top_k=5):
        """Main retriever function - returns LangChain Document objects"""
        if self.index is None:
            raise ValueError("Index not created. Call create_vector_store() or create_vectorstore() first.")
        
        if not self.embedder_model:
            raise ValueError("Embedder model not set")
        
        # Get query embedding
        if isinstance(query, str):
            # Use embed_query if available, otherwise fall back to batch_embed
            if hasattr(self.embedder_model, 'embed_query'):
                query_embedding = self.embedder_model.embed_query(query)
            else:
                query_embedding = self.embedder_model.batch_embed([query])
                if isinstance(query_embedding, list) and len(query_embedding) > 0:
                    query_embedding = query_embedding[0]
                elif isinstance(query_embedding, np.ndarray) and query_embedding.ndim > 1:
                    query_embedding = query_embedding[0]
        else:
            query_embedding = self.embedder_model.batch_embed(query)
        
        # Search and format results
        if self.docstore is not None:
            # Use enhanced docstore-based retrieval
            results = self._search_with_docstore(query_embedding, top_k)
        else:
            # Fall back to original chunk-based retrieval
            results = self._search_chunks(query_embedding, top_k)
            # Convert to Document objects for consistency
            results = [
                Document(page_content=res['text'], metadata={"similarity": res['similarity']})
                for res in results
            ]
        
        return results
    
    def _search_with_docstore(self, query_embedding, top_k=5):
        """Enhanced search function using docstore - returns Document objects"""
        # Ensure query_embedding is properly shaped
        query_embedding = np.array(query_embedding).astype("float32")
        
        # Handle different input shapes
        if query_embedding.ndim == 1:
            query_embedding = query_embedding.reshape(1, -1)
        elif query_embedding.ndim > 2:
            query_embedding = query_embedding.reshape(1, -1)
        
        # Verify dimensions match
        if query_embedding.shape[1] != self.dimension:
            raise ValueError(f"Query embedding dimension {query_embedding.shape[1]} doesn't match index dimension {self.dimension}")
        
        # Search FAISS index
        distances, indices = self.index.search(query_embedding, top_k)
        
        # Format results using docstore
        documents = []
        for i in range(top_k):
            faiss_idx = indices[0][i]
            if faiss_idx != -1 and faiss_idx in self.index_to_docstore_id:
                docstore_id = self.index_to_docstore_id[faiss_idx]
                if docstore_id in self.docstore:
                    doc = self.docstore[docstore_id]
                    similarity = float(distances[0][i])
                    
                    # Create a copy of the document with updated metadata
                    enhanced_metadata = doc.metadata.copy() if doc.metadata else {}
                    enhanced_metadata["similarity"] = similarity
                    enhanced_metadata["retrieval_index"] = faiss_idx
                    
                    enhanced_doc = Document(
                        page_content=doc.page_content,
                        metadata=enhanced_metadata
                    )
                    documents.append(enhanced_doc)
        
        return documents
    
    def _search_chunks(self, query_embedding, top_k=5):
        """Internal search function - returns raw results"""
        # Ensure query_embedding is properly shaped
        query_embedding = np.array(query_embedding).astype("float32")
        
        # Handle different input shapes
        if query_embedding.ndim == 1:
            query_embedding = query_embedding.reshape(1, -1)
        elif query_embedding.ndim > 2:
            query_embedding = query_embedding.reshape(1, -1)
        
        print(f"[DEBUG] Query embedding final shape: {query_embedding.shape}")
        print(f"[DEBUG] Index dimension: {self.dimension}")
        
        # Verify dimensions match
        if query_embedding.shape[1] != self.dimension:
            raise ValueError(f"Query embedding dimension {query_embedding.shape[1]} doesn't match index dimension {self.dimension}")
        
        # Search FAISS index
        distances, indices = self.index.search(query_embedding, top_k)
        
        # Format results
        formatted = []
        for i in range(top_k):
            faiss_idx = indices[0][i]
            if faiss_idx != -1 and faiss_idx < len(self.chunks_dict):
                distance = distances[0][i]
                formatted.append({
                    'chunk_id': faiss_idx,
                    'text': self.chunks_dict[faiss_idx],
                    'distance': distance,
                    'similarity': float(distance)  # For cosine similarity, higher is better
                })
        
        return formatted
    
    def search_raw(self, query_embedding, top_k=5):
        """Search with raw embedding input - useful for advanced use cases"""
        return self._search_chunks(query_embedding, top_k)
    
    def save_index(self, file_path):
        """Save both FAISS index and metadata"""
        if self.index is None:
            raise ValueError("No index to save")
        
        # Save FAISS index
        faiss.write_index(self.index, f"{file_path}.faiss")
        
        # Save metadata (enhanced to include new attributes)
        metadata = {
            'chunks_dict': self.chunks_dict,
            'dimension': self.dimension,
            'total_vectors': self.total_vectors,
            'index_type': self.index_type,
            'docstore': self.docstore,
            'index_to_docstore_id': self.index_to_docstore_id,
            'documents': self.documents
        }
        
        with open(f"{file_path}_metadata.pkl", 'wb') as f:
            pickle.dump(metadata, f)
        
        print(f"[FAISS] Index and metadata saved to {file_path}")
    
    def load_index(self, file_path, embedder_model=None):
        """Load both FAISS index and metadata"""
        if not os.path.exists(f"{file_path}.faiss"):
            raise FileNotFoundError(f"Index file {file_path}.faiss not found")
        
        if not os.path.exists(f"{file_path}_metadata.pkl"):
            raise FileNotFoundError(f"Metadata file {file_path}_metadata.pkl not found")
        
        # Load FAISS index
        self.index = faiss.read_index(f"{file_path}.faiss")
        
        # Load metadata
        with open(f"{file_path}_metadata.pkl", 'rb') as f:
            metadata = pickle.load(f)
        
        self.chunks_dict = metadata['chunks_dict']
        self.dimension = metadata['dimension']
        self.total_vectors = metadata['total_vectors']
        self.index_type = metadata['index_type']
        
        # Load enhanced attributes if they exist (backward compatibility)
        self.docstore = metadata.get('docstore', None)
        self.index_to_docstore_id = metadata.get('index_to_docstore_id', None)
        self.documents = metadata.get('documents', None)
        
        # Set embedder model if provided
        if embedder_model:
            self.embedder_model = embedder_model
        
        print(f"[FAISS] Index loaded: {self.total_vectors} vectors, dim {self.dimension}")
        if self.docstore is not None:
            print(f"[FAISS] Enhanced docstore mode enabled")
        
        return self
    
    def set_embedder_model(self, embedder_model):
        """Set or update the embedder model"""
        self.embedder_model = embedder_model
        return self
    
    def get_stats(self):
        """Get index statistics"""
        return {
            'total_vectors': self.total_vectors,
            'dimension': self.dimension,
            'index_type': self.index_type,
            'has_embedder': self.embedder_model is not None,
            'has_docstore': self.docstore is not None,
            'has_documents': self.documents is not None
        }



### LLM Abstract Class

In [8]:
class BaseLLM(ABC):
    def __init__(self, model_name, cache_folder):
        self.model_name = model_name
        self.cache_folder = cache_folder
        self.device = (
            'cuda' if torch.cuda.is_available()
            else 'mps' if torch.backends.mps.is_available()
            else 'cpu'
        )

    @abstractmethod
    def load_model(self):
        pass


class OLLAMA_LLM(BaseLLM):
    def __init__(self, model_name, cache_folder):
        super().__init__(model_name, cache_folder)

    def load_model(self):
        model = Ollama(model=self.model_name, temperature=0.3, num_ctx=4096)
        return model


class Hugging_Face_LLM(BaseLLM):
    def __init__(self, model_name, cache_folder):
        super().__init__(model_name, cache_folder)

    def load_model(self):
        tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            cache_dir=self.cache_folder
        )
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            cache_dir=self.cache_folder,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
            device_map="auto"  
        )
        return model, tokenizer

## Strategy Pattern Design

In [9]:
class TaskStrategy(ABC):
    """Abstract base class defining the strategy interface."""
    
    @abstractmethod
    def run(self, *args, **kwargs):
        """Execute the strategy. Must be implemented by concrete strategies."""
        pass


#### Chatting Module

In [10]:

class ChattingStrategy(TaskStrategy):
    def __init__(self, llm, vector_store, embedder, top_k=5, return_sources=True):
        self.llm = llm
        self.vector_store = vector_store
        self.vector_store.set_embedder_model(embedder)
        self.top_k = top_k
        self.return_sources = return_sources
        self._build_chain()

    def format_docs(self, docs):
        return "\n\n".join(
            f"[Source {i} | PDF {doc.metadata.get('pdf_id', '?')}]: {doc.page_content}"
            for i, doc in enumerate(docs, 1)
        )

    def _build_chain(self):
        prompt_template = """You are a helpful assistant. Use the following context to answer the question.

            Context:
            {context}

            Question: {question}

            Please provide a comprehensive answer based on the context above. You MUST follow this exact format:

            RESPONSE:
            [Your main answer here]

            REASONING:
            [Explain your reasoning and how you used the context]

            SOURCES:
            [List the source numbers you referenced, for example: 1, 3, 5]
            """
        
        prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

        def retrieve_context(inputs):
            docs = self.vector_store.get_relevant_documents(inputs["question"], top_k=self.top_k)
            return self.format_docs(docs)

        self.chain = ({
                "context": RunnableLambda(retrieve_context), 
                "question": RunnablePassthrough()
            }
            | prompt
            | self.llm
            | StrOutputParser()
        )

    def parse_structured_response(self, response_text):
        cleaned_response = re.sub(r'<think>.*?</think>', '', response_text, flags=re.DOTALL)
        cleaned_response = re.sub(r'<[^>]+>', '', cleaned_response)
        cleaned_response = re.sub(r'\n\s*\n', '\n\n', cleaned_response.strip())

        sections = {'response': '', 'reasoning': '', 'sources': ''}
        current_section = None
        current_content = []

        lines = cleaned_response.split('\n')
        for line in lines:
            line = line.strip()
            if line.upper().startswith('RESPONSE:'):
                if current_section:
                    sections[current_section] = '\n'.join(current_content).strip()
                current_section = 'response'
                current_content = [line[9:].strip()]
            elif line.upper().startswith('REASONING:'):
                if current_section:
                    sections[current_section] = '\n'.join(current_content).strip()
                current_section = 'reasoning'
                current_content = [line[10:].strip()]
            elif line.upper().startswith('SOURCES:'):
                if current_section:
                    sections[current_section] = '\n'.join(current_content).strip()
                current_section = 'sources'
                current_content = [line[8:].strip()]
            elif current_section and line:
                current_content.append(line)

        if current_section:
            sections[current_section] = '\n'.join(current_content).strip()

        source_ids = [int(x) for x in re.findall(r'\d+', sections['sources'])] if sections['sources'] else []

        return {
            'answer': sections['response'],
            'reasoning': sections['reasoning'],
            'sources': source_ids,
            'raw_response': cleaned_response
        }

    def validate_input(self, question):
        """Validate that the question is a non-empty string."""
        return isinstance(question, str) and len(question.strip()) > 0

    def run(self, question):
        """Main method to run the chain and parse result."""
        if not self.validate_input(question):
            raise ValueError("Question must be a non-empty string")
        
        response = self.chain.invoke({"question": question})

        parsed = self.parse_structured_response(response)
        print(f"Parsed response: {parsed}")  

    
        source_docs = self.vector_store.get_relevant_documents(question, top_k=self.top_k)
        parsed['source_documents'] = source_docs
        parsed['source_texts'] = [doc.page_content for doc in source_docs]
        return parsed

#### Summerization Module

In [11]:
class SummarizationStrategy(TaskStrategy):
    def __init__(self, llm, template_file="/Users/maryamsaad/Documents/Document_Chatbot/config/prompts/summarization_prompts.yaml"):
        self.llm = llm
        self.load_templates(template_file)
    
    def load_templates(self, template_file):
        """Load templates from YAML file."""
        with open(template_file, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
        self.summary_templates = data.get('summary_templates', {})
        self.overview_templates = data.get('overview_templates', {})
        
    def validate_input(self, document):
        """Validate that the document is a non-empty string."""
        return isinstance(document, str) and len(document.strip()) > 0

    def run(self, document, length="medium", verbose=False, overview_level=None):
        """
        Summarize the given document with customizable length and optional reasoning,
        or create an overview at specified level and length.
        """
        if overview_level:
            return self._create_overview(document, overview_level, length, verbose)
        return self._create_summary(document, length, verbose)
    
    def _create_overview(self, document, overview_level, length, verbose=False):
        """Create an overview of the document at the specified level and length."""
        template_type = "with_reasoning" if verbose else "base"
        prompt_text = self.overview_templates[overview_level][length][template_type]
        
        prompt_template = ChatPromptTemplate.from_messages([("system", prompt_text)])
        formatted_prompt = prompt_template.format(context=document)
        
        result = self.llm.invoke(formatted_prompt)
  
        print(result)
        return result
    
    def _create_summary(self, document, length, verbose):
        """Create a regular summary of the document."""
        template_type = "with_reasoning" if verbose else "base"
        prompt_text = self.summary_templates[length][template_type]
        
        prompt_template = ChatPromptTemplate.from_messages([("system", prompt_text)])
        formatted_prompt = prompt_template.format(context=document)
        
        result = self.llm.invoke(formatted_prompt)
   
        print(result)
        return result



In [12]:
class Summarization_Rag_Strategy(TaskStrategy):
    def __init__(self, llm,retriever):
        self.llm = llm
        self.retriever=retriever
        self.prompt = PromptTemplate(
            input_variables=["user_prompt", "context"],
            template="""
            You are a helpful assistant.

            The user is interested in the topic: "{user_prompt}"

            Based on the following document excerpts, generate a structured summary.

            Only use the provided content—do not include prior knowledge or assumptions.

            == Document Excerpts ==
            {context}

            == Summary ==
            **Main Topic:** [Summarize the general theme of the retrieved content.]

            **Key Points:**
            - [Most relevant insight #1]
            - [Relevant insight #2]
            - [Relevant insight #3]

            **Supporting Details:** [Specific numbers, quotes, or facts.]

            **Conclusion:** [Key implication or recommendation.]
            """
        )
                

    def validate_input(self, documents):
        """Validate that the input is a non-empty list of Document objects."""
        return isinstance(documents, list) and all(isinstance(doc, Document) for doc in documents)

    def run(self, prompt):
        """Retrieve and summarize relevant chunks."""
        similar_chunks = self.retriever.get_relevant_documents(prompt)

        positively_correlated = [
            chunk for chunk in similar_chunks
            if chunk.metadata.get('similarity', 0) > 0.1
        ]

        if not positively_correlated:
            raise ValueError("No chunks above similarity threshold.")

        combined_text = "\n\n".join([
            f"[Chunk from page {doc.metadata.get('page', 'N/A')}]:\n{doc.page_content}" 
            for doc in positively_correlated
        ])


        formatted_prompt = self.prompt.format(user_prompt=prompt, context=combined_text)
        result = self.llm.invoke(formatted_prompt)

        print(result)
        return result

#### Question Module

In [13]:
class QuestionStrategy(TaskStrategy):
    def __init__(self, llm, complexity="medium"):
        self.llm = llm
        self.complexity = complexity
        self._set_prompt()
    
    def _set_prompt(self):
        complexity_instructions = {
            "easy": "Generate simple, basic questions that test understanding of key facts and definitions.",
            "medium": "Generate moderately challenging questions that require analysis and understanding of concepts.",
            "hard": "Generate complex questions that require critical thinking, analysis, and synthesis of information."
        }
        
        instruction = complexity_instructions.get(self.complexity, complexity_instructions["medium"])
        
        self.prompt = ChatPromptTemplate.from_template(f"""
        You are a helpful assistant tasked with generating question-answer pairs for study purposes.

        Text:
        {{context}}

        {instruction}
        Generate {{Questions}} meaningful questions based only on the above text. 

        IMPORTANT: Format your output exactly as shown below with no additional text, explanations, or formatting:

        Q1: [question text]
        Q2: [question text]
        Q3: [question text]
        """)
        self.qa_chain = self.prompt | self.llm | StrOutputParser()


    def set_complexity(self, complexity):
        """Change complexity level with synonym mapping and fuzzy matching."""
        import difflib
        
        complexity = complexity.lower().strip()
        
        # Handle synonyms first
        synonyms = {
            "challenging": "hard", "difficult": "hard", "tough": "hard",
            "simple": "easy", "basic": "easy", "beginner": "easy", 
            "moderate": "medium", "average": "medium", "normal": "medium"
        }
        
        if complexity in synonyms:
            self.complexity = synonyms[complexity]
            self._set_prompt()
            return
        
        # Check exact match
        valid_options = ["easy", "medium", "hard"]
        if complexity in valid_options:
            self.complexity = complexity
            self._set_prompt()
            return
        
        # Fuzzy matching against synonyms first
        all_options = list(synonyms.keys()) + valid_options
        matches = difflib.get_close_matches(complexity, all_options, n=1, cutoff=0.6)
        
        if matches:
            best_match = matches[0]
            similarity = difflib.SequenceMatcher(None, complexity, best_match).ratio()
            print(f"'{complexity}' matched to '{best_match}' ({similarity:.0%} confidence)")
            
            # Map to final complexity
            final_complexity = synonyms.get(best_match, best_match)
            self.complexity = final_complexity
            self._set_prompt()
        else:
            raise ValueError("Please use: 'easy', 'medium', 'hard', or synonyms like 'challenging', 'simple'")



    def parse_qa_pairs(self, qa_output):
        qa_pairs = []
        lines = qa_output.strip().split('\n')
        i = 0
        while i < len(lines):
            q_match = re.match(r'Q(\d+):\s*(.+)', lines[i])
            if q_match and i + 1 < len(lines):
                question = q_match.group(2).strip()
                a_match = re.match(f'A{q_match.group(1)}:\s*(.+)', lines[i + 1])
                if a_match:
                    answer = a_match.group(1).strip()
                    qa_pairs.append({'question': question, 'answer': answer})
                    i += 2
                else:
                    i += 1
            else:
                i += 1
        return qa_pairs

    def validate_input(self, doc):
        """Validate that the document is a Document instance with content."""
        return (isinstance(doc, Document) and 
                hasattr(doc, 'page_content') and 
                len(doc.page_content.strip()) > 0)
    
    def run(self, doc, questions, complexity='simple'):
        """Generate questions from the given document."""
        if not self.validate_input(doc):
            raise ValueError("Input must be a Document with non-empty page_content")
        
        # Update complexity if provided
        if complexity is not None:
            self.set_complexity(complexity)
            
        try:
            qa_output = self.qa_chain.invoke({"context": doc.page_content,"Questions":questions})
            parsed_qa = self.parse_qa_pairs(qa_output)
            print(qa_output)
            print(parsed_qa)

            return {
                "pdf_id": doc.metadata.get("pdf_id"),
                "chunk_id": doc.metadata.get("chunk_id"),
                "text": doc.page_content,
                "qa_output": qa_output,
                "parsed_qa": parsed_qa
            }

        except Exception as e:
            print(f"❌ QA generation failed for Document {doc.metadata}: {e}")
            return None
        


In [14]:
class TaskProcessor:
    """Context class that uses different task strategies."""
    
    def __init__(self, strategy=None):  
        self._strategy = strategy      
    
    @property
    def strategy(self):
        return self._strategy
         
    @strategy.setter
    def strategy(self, strategy):
        self._strategy = strategy
         
    def execute_task(self, *args, **kwargs):
        if self._strategy is None:      # ✅ Add this check
            raise ValueError("No strategy set")
        return self._strategy.run(*args, **kwargs)
         
    def switch_strategy(self, new_strategy):
        self.strategy = new_strategy

## Classes Testing

In [15]:
import time
# Testing cell
paths=['/Users/maryamsaad/Documents/Graduation_Proj/junk/chapter4_GT.json']
docs=JSONPreprocessor()
data=docs.process_documents_from_files(paths)
individual_documents = [ Document(page_content=pdf.page_content, metadata={"pdf_id": i})
    for i, pdf in enumerate(data) if pdf.page_content
]
chunked_docs=docs.chunk_documents(individual_documents)

✅ Total Chunks: 11


In [16]:
start=time.time()
multilingual_embedder=MultilingualEmbedder(model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32)
end=time.time()
print("Time Taken to process:  ", end-start)

Time Taken to process:   6.443945407867432


In [17]:
start=time.time()
llm=OLLAMA_LLM('llama3:8b','llm_cache').load_model()
end=time.time()
print("Time Taken to process:  ", end-start)

Time Taken to process:   0.0012359619140625


  model = Ollama(model=self.model_name, temperature=0.3, num_ctx=4096)


In [18]:
# start=time.time()
# basic_fais=FAISSBasic(multilingual_embedder)
# basic_fais.create_vector_store(chunked_docs)
# end=time.time()
# print("Time Taken to process:  ", end-start)

In [19]:
start=time.time()
fais_improved = FAISSImproved()
fais_improved.set_embedder_model(multilingual_embedder)
fais_improved.create_vector_store(chunked_docs)
end=time.time()
print("Time Taken to process:  ", end-start)

[FAISS] Created index with 11 vectors of dim 384
Time Taken to process:   20.425259113311768


#### Strategy implementation

In [20]:
start=time.time()
chatting_strategy = ChattingStrategy(llm, fais_improved, multilingual_embedder)
summarization_strategy = SummarizationStrategy(llm)
question_strategy = QuestionStrategy(llm)
rag_summary=Summarization_Rag_Strategy(llm,fais_improved)
processor = TaskProcessor()
end=time.time()
print("Time Taken to process:  ", end-start)

Time Taken to process:   0.012579917907714844


##### Arabic test

In [21]:
processor.strategy=chatting_strategy
processor.execute_task("ما موضوع هذا الملف")

[DEBUG] Query embedding final shape: (1, 384)
[DEBUG] Index dimension: 384
Parsed response: {'answer': 'The subject of this file is psychological and social harmony, coping mechanisms, and defense strategies.', 'reasoning': 'Based on the provided context, it appears that the main topic is related to psychology and sociology. The text mentions concepts such as "توافق" (harmony), "إحباط" (frustration), "صراع" (conflict), and various coping mechanisms like "الإعلان" (assertion), "التعويض" (compensation), "الإسقاط" (denial), " التقصص" (identification), and "التكوص" (regression). The context also touches upon the importance of social harmony, emotional regulation, and defense strategies.', 'sources': [1, 2, 3, 4], 'raw_response': 'RESPONSE:\nThe subject of this file is psychological and social harmony, coping mechanisms, and defense strategies.\n\nREASONING:\nBased on the provided context, it appears that the main topic is related to psychology and sociology. The text mentions concepts such

Process SpawnProcess-11:
Process SpawnProcess-10:
Process SpawnProcess-9:
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
  File "/Users/maryamsaad/Library/Python/3.9/lib/python/site-packages/sentence_transformers/__init__.py", line 10, in <module>
    from sentence_transformers.backend import (
  File "/Users/maryamsaad/Library/Python/3.9/lib/python/site-packages/sentence_transformers/backend.py", line 11, in <module>
    from sentence_transformers.util import disable_datasets_caching, is_datasets_available
  File "/Users/maryamsaad/Library/Python/3.9/lib/python/sit

KeyboardInterrupt: 

libc++abi: terminating due to uncaught exception of type std::__1::system_error: Broken pipe
libc++abi: terminating due to uncaught exception of type std::__1::system_error: Broken pipe
libc++abi: terminating due to uncaught exception of type std::__1::system_error: Broken pipe


In [None]:
processor.strategy=summarization_strategy 
processor.execute_task(individual_documents[0].page_content,length="short",verbose=True,overview_level='low_level')

Here is a brief low-level overview of the document:

**Key Details:**

* The most critical technical detail is the discussion on conflict resolution strategies, including direct and indirect methods.
* The most important data point is the various ways people cope with stress and conflict.

**Action Items:**

* Understand the different approaches to conflict resolution, including direct and indirect methods.
* Recognize the importance of coping mechanisms in dealing with stress and conflict.

The document discusses various aspects of personality, conflict, and coping mechanisms. It covers topics such as:

* Conflict resolution strategies (direct and indirect)
* Coping mechanisms (e.g., suppression, denial, rationalization)
* Personality theories (e.g., Freudian, behavioral)
* The role of social factors in shaping behavior
* The importance of understanding human emotions and behaviors

The document is written in a formal and academic tone, with a focus on providing an overview of the con

'Here is a brief low-level overview of the document:\n\n**Key Details:**\n\n* The most critical technical detail is the discussion on conflict resolution strategies, including direct and indirect methods.\n* The most important data point is the various ways people cope with stress and conflict.\n\n**Action Items:**\n\n* Understand the different approaches to conflict resolution, including direct and indirect methods.\n* Recognize the importance of coping mechanisms in dealing with stress and conflict.\n\nThe document discusses various aspects of personality, conflict, and coping mechanisms. It covers topics such as:\n\n* Conflict resolution strategies (direct and indirect)\n* Coping mechanisms (e.g., suppression, denial, rationalization)\n* Personality theories (e.g., Freudian, behavioral)\n* The role of social factors in shaping behavior\n* The importance of understanding human emotions and behaviors\n\nThe document is written in a formal and academic tone, with a focus on providing a

In [None]:
processor.strategy=rag_summary
processor.execute_task("حقوق")

[DEBUG] Query embedding final shape: (1, 384)
[DEBUG] Index dimension: 384
**Main Topic:** Rights and Conflict Resolution Strategies

**Key Points:**

* The importance of recognizing the difference between frustration and conflict
* The need to identify and address underlying causes of frustration and conflict
* The use of various strategies for resolving conflicts, including:
	+ Direct methods (e.g., replacing a goal with another one, using problem-solving skills)
	+ Indirect methods (e.g., suppression, denial, projection)

**Supporting Details:**

* Examples of direct methods include:
	+ Replacing a goal with another one
	+ Using problem-solving skills to overcome obstacles
	+ Seeking alternative solutions
* Examples of indirect methods include:
	+ Suppression: repressing or hiding one's emotions or desires
	+ Denial: refusing to acknowledge the existence of a problem or conflict
	+ Projection: attributing one's own thoughts, feelings, or motivations to someone else

**Conclusion:** 

"**Main Topic:** Rights and Conflict Resolution Strategies\n\n**Key Points:**\n\n* The importance of recognizing the difference between frustration and conflict\n* The need to identify and address underlying causes of frustration and conflict\n* The use of various strategies for resolving conflicts, including:\n\t+ Direct methods (e.g., replacing a goal with another one, using problem-solving skills)\n\t+ Indirect methods (e.g., suppression, denial, projection)\n\n**Supporting Details:**\n\n* Examples of direct methods include:\n\t+ Replacing a goal with another one\n\t+ Using problem-solving skills to overcome obstacles\n\t+ Seeking alternative solutions\n* Examples of indirect methods include:\n\t+ Suppression: repressing or hiding one's emotions or desires\n\t+ Denial: refusing to acknowledge the existence of a problem or conflict\n\t+ Projection: attributing one's own thoughts, feelings, or motivations to someone else\n\n**Conclusion:** Effective conflict resolution requires a comb

In [None]:
processor.strategy=question_strategy
processor.execute_task(individual_documents[0],10,complexity='simple')
# doc, questions, complexity='simple'

Q1: What are the direct methods to overcome conflicts and frustrations?

Q2: What is the concept of "replacement" in psychology?

Q3: What is the difference between psychological and social compatibility?

Q4: What is the definition of frustration according to the text?

Q5: What are some examples of defensive mechanisms mentioned in the text?

Q6: What is the purpose of studying personality types?

Q7: Can a person's personality be determined by their physical characteristics? Why or why not?

Q8: What is the difference between psychological and social compatibility?

Q9: What are some consequences of frustration and conflict?

Q10: What are some examples of direct methods to overcome conflicts and frustrations mentioned in the text?
[]


{'pdf_id': 0,
 'chunk_id': None,
 'text': 'الفصل الرابعأساليب حل الصراعات ومواجهة الاحتياطات-اولًا: الأساليب المباشرةيستطيع معظم الناس التغلب على مواقف الإحباط والصراع وما ينشأ عنها من عدم توافق باللجوء إلى الأساليب المباشرة ، والتي تستخدم في حل مشكلات التوافق حلا حاسماً وبنهائياً، وأهم هذه الطرق ما يأتي: أ- بدل الجهد لإزالة العائق والوصول إلى الهدف : إن أول طريق مباشرة للتغلب على مواقف الإحباط والصراع، وما يتضمنه من عوائق تحول دون إشباع الدواقع أو الوصول إلى الأهداف هي القيام بعمل جديّ، ومضاوعة الجهد لإزالة هذه العوائق، فالطالب الذي يرسبِّ فَى الإمتحان يحاول أن يزيد من مجهوده فى استتذكار دروسه حتى ينجح فى الامتحان عند إعادته ويتفوق فيه .– البحث عن طرق أخرى للوصول الى الهدف : إذا وجد الشخص أن الطريقة التي يستخدمها للوصول إلى الهدف لا تؤدى لى ذلك بالرغم مما يبدوُله من جهد ونشاط؛ فإنه بيبدأ في البحث عن طريقة أخرى تؤدى لى ذلك ، فالطالب فى المثال السابق قد يلجاْ إلى تغيير عاداته فى الاستتذكار ، وهذه الطريقة لا تصلح إلا إذ كان العائق خارجياً أو كان ناجماً عن عوبش شخصية يمكن تقادييها وعلاجها

In [None]:
# processor.strategy=chatting_strategy 
# processor.execute_task("Which translator charges users with credits?")
# end=time.time()
# print("Time Taken to process:  ", end-start)

In [None]:
fais_improved.get_relevant_documents("Editiors")

[DEBUG] Query embedding final shape: (1, 384)
[DEBUG] Index dimension: 384


[Document(metadata={'similarity': 0.28096646070480347}, page_content='This feature enables users to compare, review, and edit translations in real time, enhancing accuracy and usability for both professional and casual users. These recommended features address critical gaps in current tools, such as limited editing capabilit ies, lack of flexible OCR options, insufficient support for domain -specific or tone - adjusted translations, absence of advanced translation modes, and the need for intuitive review interfaces. Implementing these enhancements could significantly improve the functionality and market competitiveness of document translation tools.'),
 Document(metadata={'similarity': 0.19255179166793823}, page_content='• 5 editable file translations per user/month in tota l. • Upload files up to 10 MB • Tone /informal tone is available . Pro Advanced $28.74/user/ month (Paid annually ) For individuals & teams • 20 files/month . • 5 editable file translations per user/month in tota l.

In [None]:
print(processor.strategy)

None


In [None]:
processor.strategy=rag_summary
summary=processor.execute_task("Ai enhancments")

[DEBUG] Query embedding final shape: (1, 384)
[DEBUG] Index dimension: 384
**Main Topic:** AI Enhancements for PDF Translation and Management

**Key Points:**

* The document highlights various AI-powered features for enhancing PDF translation and management, including LLMs, automated summarization, question generation, tone customization, and domain-specific translation.
* The features are designed to cater to different user needs and budgets, with options for professional translation, paraphrasing, and split-view translation interfaces.
* The tools tested include Doctranslator, Doctranslate.io, TranslaDocs, SmallPDF, Doclingo, and DeepL, which demonstrated varying performance across key criteria such as OCR support, language support, and translation accuracy.

**Supporting Details:**

* Some AI-powered features include:
	+ LLMs for categorizing subscription plans to accommodate user needs and budgets
	+ Automated summarization of PDF content for quick grasping of key points
	+ Questi

In [None]:
print(processor.strategy)
summary=processor.execute_task("use cases ")

<__main__.Summarization_Rag_Strategy object at 0x37f09cf40>
[DEBUG] Query embedding final shape: (1, 384)
[DEBUG] Index dimension: 384
**Main Topic:** Use Cases for Translation Tools

**Key Points:**

* The effectiveness of translation tools varies across different criteria such as OCR support, language support, and performance.
* Some tools excel in specific areas, while others struggle with complex image tables or lack of Arabic OCR support.
* Pricing models differ widely, catering to various user segments from cost-conscious individuals to enterprises.

**Supporting Details:**

* The tested tools include Doctranslator, Doctranslate.io, TranslaDocs, SmallPDF, Doclingo, and DeepL.
* Key features critical to the target audience's needs were evaluated across Arabic, French, and English languages.
* The results comparison table highlights the strengths and weaknesses of each tool in terms of layout preservation, OCR support, pricing, and notes on specific issues.

**Conclusion:**

The us

In [None]:
processor.execute_task("Executaive summary")

[DEBUG] Query embedding final shape: (1, 384)
[DEBUG] Index dimension: 384
**Main Topic:** Executive Summary of Document Translation Tools

**Key Points:**

* The AI-powered document translation tool offers various process modes, including professional translation, paraphrasing, and split-view translation interface.
* The tool supports bi-directional conversion between Word, PowerPoint, Excel, and PDF formats, as well as image translation and selective OCR activation.
* Additional features include AI-powered summarization, question generation, tone customization, and domain-specific translation.

**Supporting Details:**

* The tool's professional translation mode can adopt different styles and formats based on the document's context, audience, and purpose.
* The split-view translation interface allows users to compare, review, and edit translations in real-time.
* The tool's conversion features enable seamless transitions between document types while preserving formatting and content i

"**Main Topic:** Executive Summary of Document Translation Tools\n\n**Key Points:**\n\n* The AI-powered document translation tool offers various process modes, including professional translation, paraphrasing, and split-view translation interface.\n* The tool supports bi-directional conversion between Word, PowerPoint, Excel, and PDF formats, as well as image translation and selective OCR activation.\n* Additional features include AI-powered summarization, question generation, tone customization, and domain-specific translation.\n\n**Supporting Details:**\n\n* The tool's professional translation mode can adopt different styles and formats based on the document's context, audience, and purpose.\n* The split-view translation interface allows users to compare, review, and edit translations in real-time.\n* The tool's conversion features enable seamless transitions between document types while preserving formatting and content integrity.\n\n**Conclusion:** This AI-powered document translat

In [None]:
processor.execute_task("Translation tools")

[DEBUG] Query embedding final shape: (1, 384)
[DEBUG] Index dimension: 384
**Main Topic:** Mention Translation tools

**Key Points:**

* The current document translation tools have limitations and gaps in their features, such as limited editing capabilities, lack of flexible OCR options, insufficient support for domain-specific translations, and absence of advanced translation modes.
* To improve the functionality and market competitiveness of document translation tools, several recommended features can be added, including PDF editing, PDF annotations, split PDF, process modes (professional translation and paraphrase), and a split-view translation interface.

**Supporting Details:**

* The analysis reveals that no single tool fully meets all requirements for accurate, efficient, and cost-effective document translation across Arabic, French, and English.
* Some tools have limitations in their OCR capabilities, layout preservation, and handling of mixed language content.
* The recommende

'**Main Topic:** Mention Translation tools\n\n**Key Points:**\n\n* The current document translation tools have limitations and gaps in their features, such as limited editing capabilities, lack of flexible OCR options, insufficient support for domain-specific translations, and absence of advanced translation modes.\n* To improve the functionality and market competitiveness of document translation tools, several recommended features can be added, including PDF editing, PDF annotations, split PDF, process modes (professional translation and paraphrase), and a split-view translation interface.\n\n**Supporting Details:**\n\n* The analysis reveals that no single tool fully meets all requirements for accurate, efficient, and cost-effective document translation across Arabic, French, and English.\n* Some tools have limitations in their OCR capabilities, layout preservation, and handling of mixed language content.\n* The recommended features aim to address the gaps identified in the tested too

In [None]:
processor.strategy=chatting_strategy
processor.execute_task("Translation platforms and their examples")


[DEBUG] Query embedding final shape: (1, 384)
[DEBUG] Index dimension: 384
Parsed response: {'answer': 'Translation platforms that can be considered based on the provided context are Doctranslator, Doctranslate.io, TranslaDocs, SmallPDF, Doclingo, and DeepL. These platforms offer various features such as translation, OCR, and editing capabilities.', 'reasoning': 'The context provides information about different document translation tools and their limitations. It highlights the need for a platform that can accurately translate documents in real-time, preserve layout, and handle OCR for scanned documents. The recommended features mentioned aim to address these gaps and improve functionality, user experience, and translation quality. By considering these platforms and their examples, we can identify the strengths and weaknesses of each tool.', 'sources': [1, 3, 4, 5], 'raw_response': 'RESPONSE:\nTranslation platforms that can be considered based on the provided context are Doctranslator,

{'answer': 'Translation platforms that can be considered based on the provided context are Doctranslator, Doctranslate.io, TranslaDocs, SmallPDF, Doclingo, and DeepL. These platforms offer various features such as translation, OCR, and editing capabilities.',
 'reasoning': 'The context provides information about different document translation tools and their limitations. It highlights the need for a platform that can accurately translate documents in real-time, preserve layout, and handle OCR for scanned documents. The recommended features mentioned aim to address these gaps and improve functionality, user experience, and translation quality. By considering these platforms and their examples, we can identify the strengths and weaknesses of each tool.',
 'sources': [1, 3, 4, 5],
 'raw_response': 'RESPONSE:\nTranslation platforms that can be considered based on the provided context are Doctranslator, Doctranslate.io, TranslaDocs, SmallPDF, Doclingo, and DeepL. These platforms offer vario

In [None]:
start_time = time.time()

for index, document in enumerate(individual_documents):
    doc_start_time = time.time()
    
    processor.execute_task(document,length='long',verbose=True)
    
    doc_end_time = time.time()
    print(f"Document[{index}] processing time: {doc_end_time - doc_start_time:.2f} seconds")

end_time = time.time()
print("Total processing time:", end_time - start_time, "seconds")

TypeError: run() got an unexpected keyword argument 'length'

In [None]:
processor.strategy=question_strategy
print(processor.strategy)

<__main__.QuestionStrategy object at 0x380610d30>


In [None]:
start=time.time()
for index, document in enumerate(individual_documents):
    doc_start_time = time.time()
    processor.execute_task(document,20)
    doc_end_time = time.time()
    print(f"Document[{index+1}] processing time: {doc_end_time - doc_start_time:.2f} seconds")
end=time.time()
print("Time Taken to process:  ", end-start)

Here are the 20 simple questions that test understanding of key facts and definitions based on the provided text:

Q1: What is the primary focus of this market research report?
Q2: Which languages are supported by the document translation tools evaluated in this report?
Q3: What are the five test cases used to evaluate the document translation tools?
Q4: Which tool lacks OCR capabilities?
Q5: What is the main issue with Doctranslator's handling of mixed language content?
Q6: Which tool provides OCR functionality but has slow processing times?
Q7: What feature does TranslaDocs lack?
Q8: Which tool offers limited translation support and no OCR capabilities?
Q9: What is the primary limitation of Doclingo's performance in English to Arabic translations?
Q10: Which tool excels in English OCR but does not support Arabic OCR?
Q11: What are the three recommended editing features for document translation tools?
Q12: What is the purpose of the "Split PDF" feature?
Q13: Which tool provides a spli

In [None]:
start=time.time()
for index, document in enumerate(individual_documents):
    doc_start_time = time.time()
    processor.execute_task(document,20,'challenshing')
    doc_end_time = time.time()
    print(f"Document[{index+1}] processing time: {doc_end_time - doc_start_time:.2f} seconds")
end=time.time()
print("Time Taken to process:  ", end-start)

'challenshing' matched to 'challenging' (87% confidence)
Here are the 20 complex questions based on the provided text:

Q1: What are the key features evaluated in this market research report to assess document translation tools?

Q2: Which tool offers free services with good layout preservation for English to Arabic translations and effective handling of text directionality?

Q3: How does Doctranslator handle mixed language content, and what limitations does it have?

Q4: What are the test cases conducted for each language in this report, and why are they important?

Q5: Which tool provides OCR functionality but suffers from slow processing times and poor layout preservation?

Q6: What is the main limitation of TranslaDocs, according to this report?

Q7: How does SmallPDF handle translation support, and what are its limitations?

Q8: What are the recommended features for enhancing document translation tools, as suggested in this report?

Q9: How can AI-powered features improve the func

In [None]:
start=time.time()
for index, document in enumerate(individual_documents):
    doc_start_time = time.time()
    processor.execute_task(document,20,'simple')
    doc_end_time = time.time()
    print(f"Document[{index+1}] processing time: {doc_end_time - doc_start_time:.2f} seconds")
end=time.time()
print("Time Taken to process:  ", end-start)

Here are 20 simple and basic questions that test understanding of key facts and definitions based on the provided text:

Q1: What is the main focus of this market research report?
Q2: Which languages are supported by the document translation tools evaluated in this report?
Q3: What are the key features evaluated in this report?
Q4: What is OCR (Optical Character Recognition) support, and which tool lacks it?
Q5: Which tool offers free services with good layout preservation for English to Arabic translations?
Q6: What is the main limitation of Doctranslator's OCR capabilities?
Q7: Which tool provides OCR functionality but suffers from slow processing times?
Q8: What is the primary issue with TranslaDocs' translation accuracy?
Q9: Which tool does not support Arabic language or OCR?
Q10: What is the pricing model for SmallPDF?
Q11: Which tool supports OCR with good performance for Arabic to English scanned documents?
Q12: What is the main limitation of Doclingo's OCR capabilities?
Q13: Wh