## Imports

In [1]:
import os
import re
import json
import pickle
import numpy as np
import pandas as pd
import torch
import faiss
from abc import ABC, abstractmethod
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore import InMemoryDocstore
from langchain.llms import Ollama
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.llm import LLMChain
from langchain_core.prompts import ChatPromptTemplate


## Abstract classes

### Preprocessing class

In [2]:
from abc import ABC, abstractmethod

class BasePreprocessor(ABC):
    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=200,
            chunk_overlap=50, 
            length_function=lambda x: len(x.split()),
            separators=["\n\n\n", "\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""],
            keep_separator=False,
            add_start_index=True,
            strip_whitespace=True
        )

    @abstractmethod
    def load_and_preprocess_data(self, file_path):
        pass

    @abstractmethod
    def process_documents_from_files(self, file_paths):
        pass


    def clean_text(self, text):
        return re.sub(r'\s+', ' ', re.sub(r'\n{3,}', '\n\n', str(text))).strip()



    def chunk_documents(self, individual_documents):
        chunked_docs = []
        for doc in individual_documents:
            chunks = self.text_splitter.split_text(doc.page_content)
            for i, chunk in enumerate(chunks):
                chunked_docs.append(
                    Document(
                        page_content=chunk,
                        metadata={
                            "pdf_id": doc.metadata["pdf_id"],
                            "chunk_id": i
                        }
                    )
                )
        print(f"✅ Total Chunks: {len(chunked_docs)}")
        return chunked_docs


In [3]:
class JSONPreprocessor(BasePreprocessor):
    def load_and_preprocess_data(self, file_path):
        with open(file_path, 'r') as f:
            raw_data = json.load(f)
        clean_texts = [self.clean_text(entry) for entry in raw_data if isinstance(entry, str)]
        return "\n".join(clean_texts)
    def process_documents_from_files(self, file_paths):
        documents = []

        for i, file_path in enumerate(file_paths):
            text = self.load_and_preprocess_data(file_path).strip()
            documents.append(
                Document(page_content=text, metadata={"pdf_id": i})
            )

        return documents


### Embeddings Abstract class

In [4]:
class Embedder(ABC): 
    def __init__(self, model_name, batch_size):
        self.model_name = model_name
        self.batch_size = batch_size
        
        self.device = (
            'cuda' if torch.cuda.is_available()
            else 'mps' if torch.backends.mps.is_available()
            else 'cpu'
        )
        self.embedding_model = HuggingFaceEmbeddings(model_name=model_name,model_kwargs={'device': self.device},encode_kwargs={'normalize_embeddings': True},multi_process=True,
                                                     show_progress=True,cache_folder='./embedder_model_cache')

    @abstractmethod
    def embed_documents(self, documents):
        pass

    @abstractmethod
    def batch_embed(self, texts, batch_size=None): 
        pass

class MultilingualEmbedder(Embedder): 
    def __init__(self, model_name, batch_size):
        super().__init__(model_name, batch_size)

    def embed_documents(self, documents):
        return self.batch_embed(documents, batch_size=self.batch_size)

    def batch_embed(self, texts, batch_size=None):
        if batch_size is None:
            batch_size = self.batch_size
        
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            batch_embeddings = self.embedding_model.embed_documents(batch)
            embeddings.extend(batch_embeddings)
        
        return np.array(embeddings, dtype=np.float32)


### Faiss Abstract class

In [5]:
class VectorStoreBase(ABC):
    @abstractmethod
    def create_vector_store(self, documents, embedder_model):
        pass
    
    @abstractmethod
    def get_relevant_documents(self, query, top_k=5):
        pass
    
    @abstractmethod
    def save_index(self, file_path):
        pass
    
    @abstractmethod
    def load_index(self, file_path):
        pass



In [6]:
class FAISSBasic(VectorStoreBase):
    def __init__(self, embedder_model=None):
        self.index = None
        self.chunks_dict = None
        self.dimension = None
        self.total_vectors = 0
        self.index_type = "IndexFlatIP"
        self.embedder_model = embedder_model
    
    def create_vector_store(self, documents, embedder_model=None):
        """Create vector store from documents"""
        if embedder_model:
            self.embedder_model = embedder_model
        
        if not self.embedder_model:
            raise ValueError("Embedder model is required")
        
        texts = [doc.page_content for doc in documents]
        embeddings = self.embedder_model.batch_embed(texts)
        embeddings = np.array(embeddings).astype("float32")
        
        # Ensure embeddings are 2D
        if embeddings.ndim == 1:
            embeddings = embeddings.reshape(1, -1)
        
        self.dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatIP(self.dimension)
        self.index.add(embeddings)
        
        # Store text chunks with their indices
        self.chunks_dict = {i: text for i, text in enumerate(texts)}
        self.total_vectors = self.index.ntotal
        
        print(f"[FAISS] Created index with {self.total_vectors} vectors of dim {self.dimension}")
        return self
    
    def get_relevant_documents(self, query, top_k=5):
        """Main retriever function - returns LangChain Document objects"""
        if self.index is None:
            raise ValueError("Index not created. Call create_vector_store() first.")
        
        if not self.embedder_model:
            raise ValueError("Embedder model not set")
        
        # Get query embedding
        if isinstance(query, str):
            query_embedding = self.embedder_model.batch_embed([query])
            if isinstance(query_embedding, list) and len(query_embedding) > 0:
                query_embedding = query_embedding[0]
            elif isinstance(query_embedding, np.ndarray) and query_embedding.ndim > 1:
                query_embedding = query_embedding[0]
        else:
            query_embedding = self.embedder_model.batch_embed(query)
        
        # Search and format results
        results = self._search_chunks(query_embedding, top_k)
        
        return [
            Document(page_content=res['text'], metadata={"similarity": res['similarity']})
            for res in results
        ]
    
    def _search_chunks(self, query_embedding, top_k=5):
        """Internal search function - returns raw results"""
        # Ensure query_embedding is properly shaped
        query_embedding = np.array(query_embedding).astype("float32")
        
        # Handle different input shapes
        if query_embedding.ndim == 1:
            query_embedding = query_embedding.reshape(1, -1)
        elif query_embedding.ndim > 2:
            query_embedding = query_embedding.reshape(1, -1)
        
        print(f"[DEBUG] Query embedding final shape: {query_embedding.shape}")
        print(f"[DEBUG] Index dimension: {self.dimension}")
        
        # Verify dimensions match
        if query_embedding.shape[1] != self.dimension:
            raise ValueError(f"Query embedding dimension {query_embedding.shape[1]} doesn't match index dimension {self.dimension}")
        
        # Search FAISS index
        distances, indices = self.index.search(query_embedding, top_k)
        
        # Format results
        formatted = []
        for i in range(top_k):
            faiss_idx = indices[0][i]
            if faiss_idx != -1 and faiss_idx < len(self.chunks_dict):
                distance = distances[0][i]
                formatted.append({
                    'chunk_id': faiss_idx,
                    'text': self.chunks_dict[faiss_idx],
                    'distance': distance,
                    'similarity': float(distance)  # For cosine similarity, higher is better
                })
        
        return formatted
    
    def search_raw(self, query_embedding, top_k=5):
        """Search with raw embedding input - useful for advanced use cases"""
        return self._search_chunks(query_embedding, top_k)
    
    def save_index(self, file_path):
        """Save both FAISS index and metadata"""
        if self.index is None:
            raise ValueError("No index to save")
        
        # Save FAISS index
        faiss.write_index(self.index, f"{file_path}.faiss")
        
        # Save metadata
        metadata = {
            'chunks_dict': self.chunks_dict,
            'dimension': self.dimension,
            'total_vectors': self.total_vectors,
            'index_type': self.index_type
        }
        
        with open(f"{file_path}_metadata.pkl", 'wb') as f:
            pickle.dump(metadata, f)
        
        print(f"[FAISS] Index and metadata saved to {file_path}")
    
    def load_index(self, file_path, embedder_model=None):
        """Load both FAISS index and metadata"""
        if not os.path.exists(f"{file_path}.faiss"):
            raise FileNotFoundError(f"Index file {file_path}.faiss not found")
        
        if not os.path.exists(f"{file_path}_metadata.pkl"):
            raise FileNotFoundError(f"Metadata file {file_path}_metadata.pkl not found")
        
        # Load FAISS index
        self.index = faiss.read_index(f"{file_path}.faiss")
        
        # Load metadata
        with open(f"{file_path}_metadata.pkl", 'rb') as f:
            metadata = pickle.load(f)
        
        self.chunks_dict = metadata['chunks_dict']
        self.dimension = metadata['dimension']
        self.total_vectors = metadata['total_vectors']
        self.index_type = metadata['index_type']
        
        # Set embedder model if provided
        if embedder_model:
            self.embedder_model = embedder_model
        
        print(f"[FAISS] Index loaded: {self.total_vectors} vectors, dim {self.dimension}")
        return self
    
    def set_embedder_model(self, embedder_model):
        """Set or update the embedder model"""
        self.embedder_model = embedder_model
        return self
    
    def get_stats(self):
        """Get index statistics"""
        return {
            'total_vectors': self.total_vectors,
            'dimension': self.dimension,
            'index_type': self.index_type,
            'has_embedder': self.embedder_model is not None
        }



In [7]:
class FAISSImproved(VectorStoreBase):
    def __init__(self, embedder_model=None):
        self.index = None
        self.chunks_dict = None
        self.dimension = None
        self.total_vectors = 0
        self.index_type = "IndexFlatIP"
        self.embedder_model = embedder_model
        # New attributes for enhanced functionality
        self.docstore = None
        self.index_to_docstore_id = None
        self.documents = None  # Store original Document objects
    
    def create_vector_store(self, documents, embedder_model=None):
        """Create vector store from documents"""
        if embedder_model:
            self.embedder_model = embedder_model
        
        if not self.embedder_model:
            raise ValueError("Embedder model is required")
        
        texts = [doc.page_content for doc in documents]
        embeddings = self.embedder_model.batch_embed(texts)
        embeddings = np.array(embeddings).astype("float32")
        
        # Ensure embeddings are 2D
        if embeddings.ndim == 1:
            embeddings = embeddings.reshape(1, -1)
        
        self.dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatIP(self.dimension)
        self.index.add(embeddings)
        
        # Store text chunks with their indices
        self.chunks_dict = {i: text for i, text in enumerate(texts)}
        self.total_vectors = self.index.ntotal
        
        print(f"[FAISS] Created index with {self.total_vectors} vectors of dim {self.dimension}")
        return self
    
    def create_vectorstore(self, docs, normalize_embeddings=True):
        """
        Create a FAISS vector store from a list of Document objects.
        Each document should have metadata like pdf_id, chunk_id, etc.
        
        Args:
            docs: List of Document objects
            normalize_embeddings: Whether to normalize embeddings for cosine similarity
        
        Returns:
            self: Returns the FAISS instance for method chaining
        """
        if not self.embedder_model:
            raise ValueError("Embedder model is required. Set it during initialization or call set_embedder_model()")
        
        # Extract texts from Document objects
        texts = [doc.page_content for doc in docs]
        
        # Generate embeddings
        embeddings = self.embedder_model.batch_embed(texts)
        embeddings = np.array(embeddings).astype("float32")
        
        # Ensure embeddings are 2D
        if embeddings.ndim == 1:
            embeddings = embeddings.reshape(1, -1)
        
        # Initialize FAISS Index
        self.dimension = embeddings.shape[1]
        self.index = faiss.IndexFlatIP(self.dimension)
        
        # Normalize embeddings for cosine similarity if requested
        if normalize_embeddings:
            faiss.normalize_L2(embeddings)
        
        self.index.add(embeddings)
        
        # Store original Document objects and create mappings
        self.documents = docs
        self.docstore = {str(i): doc for i, doc in enumerate(docs)}
        self.index_to_docstore_id = {i: str(i) for i in range(len(docs))}
        
        # Also maintain backward compatibility with chunks_dict
        self.chunks_dict = {i: doc.page_content for i, doc in enumerate(docs)}
        self.total_vectors = self.index.ntotal
        
        print(f"[FAISS] Created vectorstore with {self.total_vectors} documents of dim {self.dimension}")
        print(f"[FAISS] Normalization: {'enabled' if normalize_embeddings else 'disabled'}")
        
        return self
    
    def get_relevant_documents(self, query, top_k=5):
        """Main retriever function - returns LangChain Document objects"""
        if self.index is None:
            raise ValueError("Index not created. Call create_vector_store() or create_vectorstore() first.")
        
        if not self.embedder_model:
            raise ValueError("Embedder model not set")
        
        # Get query embedding
        if isinstance(query, str):
            # Use embed_query if available, otherwise fall back to batch_embed
            if hasattr(self.embedder_model, 'embed_query'):
                query_embedding = self.embedder_model.embed_query(query)
            else:
                query_embedding = self.embedder_model.batch_embed([query])
                if isinstance(query_embedding, list) and len(query_embedding) > 0:
                    query_embedding = query_embedding[0]
                elif isinstance(query_embedding, np.ndarray) and query_embedding.ndim > 1:
                    query_embedding = query_embedding[0]
        else:
            query_embedding = self.embedder_model.batch_embed(query)
        
        # Search and format results
        if self.docstore is not None:
            # Use enhanced docstore-based retrieval
            results = self._search_with_docstore(query_embedding, top_k)
        else:
            # Fall back to original chunk-based retrieval
            results = self._search_chunks(query_embedding, top_k)
            # Convert to Document objects for consistency
            results = [
                Document(page_content=res['text'], metadata={"similarity": res['similarity']})
                for res in results
            ]
        
        return results
    
    def _search_with_docstore(self, query_embedding, top_k=5):
        """Enhanced search function using docstore - returns Document objects"""
        # Ensure query_embedding is properly shaped
        query_embedding = np.array(query_embedding).astype("float32")
        
        # Handle different input shapes
        if query_embedding.ndim == 1:
            query_embedding = query_embedding.reshape(1, -1)
        elif query_embedding.ndim > 2:
            query_embedding = query_embedding.reshape(1, -1)
        
        # Verify dimensions match
        if query_embedding.shape[1] != self.dimension:
            raise ValueError(f"Query embedding dimension {query_embedding.shape[1]} doesn't match index dimension {self.dimension}")
        
        # Search FAISS index
        distances, indices = self.index.search(query_embedding, top_k)
        
        # Format results using docstore
        documents = []
        for i in range(top_k):
            faiss_idx = indices[0][i]
            if faiss_idx != -1 and faiss_idx in self.index_to_docstore_id:
                docstore_id = self.index_to_docstore_id[faiss_idx]
                if docstore_id in self.docstore:
                    doc = self.docstore[docstore_id]
                    similarity = float(distances[0][i])
                    
                    # Create a copy of the document with updated metadata
                    enhanced_metadata = doc.metadata.copy() if doc.metadata else {}
                    enhanced_metadata["similarity"] = similarity
                    enhanced_metadata["retrieval_index"] = faiss_idx
                    
                    enhanced_doc = Document(
                        page_content=doc.page_content,
                        metadata=enhanced_metadata
                    )
                    documents.append(enhanced_doc)
        
        return documents
    
    def _search_chunks(self, query_embedding, top_k=5):
        """Internal search function - returns raw results"""
        # Ensure query_embedding is properly shaped
        query_embedding = np.array(query_embedding).astype("float32")
        
        # Handle different input shapes
        if query_embedding.ndim == 1:
            query_embedding = query_embedding.reshape(1, -1)
        elif query_embedding.ndim > 2:
            query_embedding = query_embedding.reshape(1, -1)
        
        print(f"[DEBUG] Query embedding final shape: {query_embedding.shape}")
        print(f"[DEBUG] Index dimension: {self.dimension}")
        
        # Verify dimensions match
        if query_embedding.shape[1] != self.dimension:
            raise ValueError(f"Query embedding dimension {query_embedding.shape[1]} doesn't match index dimension {self.dimension}")
        
        # Search FAISS index
        distances, indices = self.index.search(query_embedding, top_k)
        
        # Format results
        formatted = []
        for i in range(top_k):
            faiss_idx = indices[0][i]
            if faiss_idx != -1 and faiss_idx < len(self.chunks_dict):
                distance = distances[0][i]
                formatted.append({
                    'chunk_id': faiss_idx,
                    'text': self.chunks_dict[faiss_idx],
                    'distance': distance,
                    'similarity': float(distance)  # For cosine similarity, higher is better
                })
        
        return formatted
    
    def search_raw(self, query_embedding, top_k=5):
        """Search with raw embedding input - useful for advanced use cases"""
        return self._search_chunks(query_embedding, top_k)
    
    def save_index(self, file_path):
        """Save both FAISS index and metadata"""
        if self.index is None:
            raise ValueError("No index to save")
        
        # Save FAISS index
        faiss.write_index(self.index, f"{file_path}.faiss")
        
        # Save metadata (enhanced to include new attributes)
        metadata = {
            'chunks_dict': self.chunks_dict,
            'dimension': self.dimension,
            'total_vectors': self.total_vectors,
            'index_type': self.index_type,
            'docstore': self.docstore,
            'index_to_docstore_id': self.index_to_docstore_id,
            'documents': self.documents
        }
        
        with open(f"{file_path}_metadata.pkl", 'wb') as f:
            pickle.dump(metadata, f)
        
        print(f"[FAISS] Index and metadata saved to {file_path}")
    
    def load_index(self, file_path, embedder_model=None):
        """Load both FAISS index and metadata"""
        if not os.path.exists(f"{file_path}.faiss"):
            raise FileNotFoundError(f"Index file {file_path}.faiss not found")
        
        if not os.path.exists(f"{file_path}_metadata.pkl"):
            raise FileNotFoundError(f"Metadata file {file_path}_metadata.pkl not found")
        
        # Load FAISS index
        self.index = faiss.read_index(f"{file_path}.faiss")
        
        # Load metadata
        with open(f"{file_path}_metadata.pkl", 'rb') as f:
            metadata = pickle.load(f)
        
        self.chunks_dict = metadata['chunks_dict']
        self.dimension = metadata['dimension']
        self.total_vectors = metadata['total_vectors']
        self.index_type = metadata['index_type']
        
        # Load enhanced attributes if they exist (backward compatibility)
        self.docstore = metadata.get('docstore', None)
        self.index_to_docstore_id = metadata.get('index_to_docstore_id', None)
        self.documents = metadata.get('documents', None)
        
        # Set embedder model if provided
        if embedder_model:
            self.embedder_model = embedder_model
        
        print(f"[FAISS] Index loaded: {self.total_vectors} vectors, dim {self.dimension}")
        if self.docstore is not None:
            print(f"[FAISS] Enhanced docstore mode enabled")
        
        return self
    
    def set_embedder_model(self, embedder_model):
        """Set or update the embedder model"""
        self.embedder_model = embedder_model
        return self
    
    def get_stats(self):
        """Get index statistics"""
        return {
            'total_vectors': self.total_vectors,
            'dimension': self.dimension,
            'index_type': self.index_type,
            'has_embedder': self.embedder_model is not None,
            'has_docstore': self.docstore is not None,
            'has_documents': self.documents is not None
        }

# Example usage:
"""
# Initialize the FAISS vectorstore
faiss_store = FAISS(embedder_model=your_embedding_model)

# Create vectorstore from Document objects (new method)
faiss_store.create_vectorstore(document_list, normalize_embeddings=True)

# Or use the original method
faiss_store.create_vector_store(document_list, your_embedding_model)

# Search for relevant documents
results = faiss_store.get_relevant_documents("your query", top_k=5)

# Save and load
faiss_store.save_index("my_index")
faiss_store.load_index("my_index", your_embedding_model)
"""

'\n# Initialize the FAISS vectorstore\nfaiss_store = FAISS(embedder_model=your_embedding_model)\n\n# Create vectorstore from Document objects (new method)\nfaiss_store.create_vectorstore(document_list, normalize_embeddings=True)\n\n# Or use the original method\nfaiss_store.create_vector_store(document_list, your_embedding_model)\n\n# Search for relevant documents\nresults = faiss_store.get_relevant_documents("your query", top_k=5)\n\n# Save and load\nfaiss_store.save_index("my_index")\nfaiss_store.load_index("my_index", your_embedding_model)\n'

### LLM Abstract Class

In [8]:
class BaseLLM(ABC):
    def __init__(self, model_name, cache_folder):
        self.model_name = model_name
        self.cache_folder = cache_folder
        self.device = ('cpu'
            # 'cuda' if torch.cuda.is_available()
            # else 'mps' if torch.backends.mps.is_available()
            # else 'cpu'
        )

    @abstractmethod
    def load_model(self):
        pass


class OLLAMA_LLM(BaseLLM):
    def __init__(self, model_name, cache_folder):
        super().__init__(model_name, cache_folder)

    def load_model(self):
        model = Ollama(model=self.model_name, temperature=0.3, num_ctx=4096)
        return model


class Hugging_Face_LLM(BaseLLM):
    def __init__(self, model_name, cache_folder):
        super().__init__(model_name, cache_folder)

    def load_model(self):
        tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            cache_dir=self.cache_folder
        )
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            cache_dir=self.cache_folder,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
            device_map="auto"  
        )
        return model, tokenizer

## Strategy Pattern Design

In [9]:
class TaskStrategy(ABC):
    """Abstract base class defining the strategy interface."""
    
    @abstractmethod
    def run(self, *args, **kwargs):
        """Execute the strategy. Must be implemented by concrete strategies."""
        pass


#### Chatting Module

In [10]:

class ChattingStrategy(TaskStrategy):
    def __init__(self, llm, vector_store, embedder, top_k=5, return_sources=True):
        self.llm = llm
        self.vector_store = vector_store
        self.vector_store.set_embedder_model(embedder)
        self.top_k = top_k
        self.return_sources = return_sources
        self._build_chain()

    def format_docs(self, docs):
        return "\n\n".join(
            f"[Source {i} | PDF {doc.metadata.get('pdf_id', '?')}]: {doc.page_content}"
            for i, doc in enumerate(docs, 1)
        )

    def _build_chain(self):
        prompt_template = """You are a helpful assistant. Use the following context to answer the question.

            Context:
            {context}

            Question: {question}

            Please provide a comprehensive answer based on the context above. You MUST follow this exact format:

            RESPONSE:
            [Your main answer here]

            REASONING:
            [Explain your reasoning and how you used the context]

            SOURCES:
            [List the source numbers you referenced, for example: 1, 3, 5]
            """
        
        prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

        def retrieve_context(inputs):
            docs = self.vector_store.get_relevant_documents(inputs["question"], top_k=self.top_k)
            return self.format_docs(docs)

        self.chain = ({
                "context": RunnableLambda(retrieve_context), 
                "question": RunnablePassthrough()
            }
            | prompt
            | self.llm
            | StrOutputParser()
        )

    def parse_structured_response(self, response_text):
        cleaned_response = re.sub(r'<think>.*?</think>', '', response_text, flags=re.DOTALL)
        cleaned_response = re.sub(r'<[^>]+>', '', cleaned_response)
        cleaned_response = re.sub(r'\n\s*\n', '\n\n', cleaned_response.strip())

        sections = {'response': '', 'reasoning': '', 'sources': ''}
        current_section = None
        current_content = []

        lines = cleaned_response.split('\n')
        for line in lines:
            line = line.strip()
            if line.upper().startswith('RESPONSE:'):
                if current_section:
                    sections[current_section] = '\n'.join(current_content).strip()
                current_section = 'response'
                current_content = [line[9:].strip()]
            elif line.upper().startswith('REASONING:'):
                if current_section:
                    sections[current_section] = '\n'.join(current_content).strip()
                current_section = 'reasoning'
                current_content = [line[10:].strip()]
            elif line.upper().startswith('SOURCES:'):
                if current_section:
                    sections[current_section] = '\n'.join(current_content).strip()
                current_section = 'sources'
                current_content = [line[8:].strip()]
            elif current_section and line:
                current_content.append(line)

        if current_section:
            sections[current_section] = '\n'.join(current_content).strip()

        source_ids = [int(x) for x in re.findall(r'\d+', sections['sources'])] if sections['sources'] else []

        return {
            'answer': sections['response'],
            'reasoning': sections['reasoning'],
            'sources': source_ids,
            'raw_response': cleaned_response
        }

    def validate_input(self, question):
        """Validate that the question is a non-empty string."""
        return isinstance(question, str) and len(question.strip()) > 0

    def run(self, question):
        """Main method to run the chain and parse result."""
        if not self.validate_input(question):
            raise ValueError("Question must be a non-empty string")
        
        response = self.chain.invoke({"question": question})

        parsed = self.parse_structured_response(response)
        print(f"Parsed response: {parsed}")  

    
        source_docs = self.vector_store.get_relevant_documents(question, top_k=self.top_k)
        parsed['source_documents'] = source_docs
        parsed['source_texts'] = [doc.page_content for doc in source_docs]
        return parsed

#### Summerization Module

In [11]:
class SummarizationStrategy(TaskStrategy):
    def __init__(self, llm):
        self.llm = llm
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", """Summarize the following document using this format:

            **Main Topic:** [One sentence describing what this document is about]

            **Key Points:**
            - [Most important point]
            - [Second most important point]  
            - [Third most important point]

            **Details:** [Supporting information, numbers, examples]

            **Conclusion:** [Main takeaway or implication]

            Document: {context}""")
                    ])

    def validate_input(self, document):
        """Validate that the document is a non-empty string."""
        return isinstance(document, str) and len(document.strip()) > 0

    def run(self, document):
        """Summarize the given document."""            
        # Format prompt manually
        formatted_prompt = self.prompt.format(context=document)
        
        # Directly invoke the LLM
        result = self.llm.invoke(formatted_prompt)
        
        print(result)
        return result


#### Question Module

In [12]:
class QuestionStrategy(TaskStrategy):
    def __init__(self, llm, complexity="medium"):
        self.llm = llm
        self.complexity = complexity
        self._set_prompt()
    
    def _set_prompt(self):
        complexity_instructions = {
            "easy": "Generate simple, basic questions that test understanding of key facts and definitions.",
            "medium": "Generate moderately challenging questions that require analysis and understanding of concepts.",
            "hard": "Generate complex questions that require critical thinking, analysis, and synthesis of information."
        }
        
        instruction = complexity_instructions.get(self.complexity, complexity_instructions["medium"])
        
        self.prompt = ChatPromptTemplate.from_template(f"""
        You are a helpful assistant tasked with generating question-answer pairs for study purposes.

        Text:
        {{context}}

        {instruction}
        Generate {{Questions}} meaningful questions based only on the above text. 

        IMPORTANT: Format your output exactly as shown below with no additional text, explanations, or formatting:

        Q1: [question text]
        Q2: [question text]
        Q3: [question text]
        """)
        self.qa_chain = self.prompt | self.llm | StrOutputParser()


    def set_complexity(self, complexity):
        """Change complexity level with synonym mapping and fuzzy matching."""
        import difflib
        
        complexity = complexity.lower().strip()
        
        # Handle synonyms first
        synonyms = {
            "challenging": "hard", "difficult": "hard", "tough": "hard",
            "simple": "easy", "basic": "easy", "beginner": "easy", 
            "moderate": "medium", "average": "medium", "normal": "medium"
        }
        
        if complexity in synonyms:
            self.complexity = synonyms[complexity]
            self._set_prompt()
            return
        
        # Check exact match
        valid_options = ["easy", "medium", "hard"]
        if complexity in valid_options:
            self.complexity = complexity
            self._set_prompt()
            return
        
        # Fuzzy matching against synonyms first
        all_options = list(synonyms.keys()) + valid_options
        matches = difflib.get_close_matches(complexity, all_options, n=1, cutoff=0.6)
        
        if matches:
            best_match = matches[0]
            similarity = difflib.SequenceMatcher(None, complexity, best_match).ratio()
            print(f"'{complexity}' matched to '{best_match}' ({similarity:.0%} confidence)")
            
            # Map to final complexity
            final_complexity = synonyms.get(best_match, best_match)
            self.complexity = final_complexity
            self._set_prompt()
        else:
            raise ValueError("Please use: 'easy', 'medium', 'hard', or synonyms like 'challenging', 'simple'")



    def parse_qa_pairs(self, qa_output):
        qa_pairs = []
        lines = qa_output.strip().split('\n')
        i = 0
        while i < len(lines):
            q_match = re.match(r'Q(\d+):\s*(.+)', lines[i])
            if q_match and i + 1 < len(lines):
                question = q_match.group(2).strip()
                a_match = re.match(f'A{q_match.group(1)}:\s*(.+)', lines[i + 1])
                if a_match:
                    answer = a_match.group(1).strip()
                    qa_pairs.append({'question': question, 'answer': answer})
                    i += 2
                else:
                    i += 1
            else:
                i += 1
        return qa_pairs

    def validate_input(self, doc):
        """Validate that the document is a Document instance with content."""
        return (isinstance(doc, Document) and 
                hasattr(doc, 'page_content') and 
                len(doc.page_content.strip()) > 0)
    
    def run(self, doc, questions, complexity='simple'):
        """Generate questions from the given document."""
        if not self.validate_input(doc):
            raise ValueError("Input must be a Document with non-empty page_content")
        
        # Update complexity if provided
        if complexity is not None:
            self.set_complexity(complexity)
            
        try:
            qa_output = self.qa_chain.invoke({"context": doc.page_content,"Questions":questions})
            parsed_qa = self.parse_qa_pairs(qa_output)
            print(qa_output)
            print(parsed_qa)

            return {
                "pdf_id": doc.metadata.get("pdf_id"),
                "chunk_id": doc.metadata.get("chunk_id"),
                "text": doc.page_content,
                "qa_output": qa_output,
                "parsed_qa": parsed_qa
            }

        except Exception as e:
            print(f"❌ QA generation failed for Document {doc.metadata}: {e}")
            return None
        


In [13]:
class TaskProcessor:
    """Context class that uses different task strategies."""
    
    def __init__(self, strategy=None):  
        self._strategy = strategy      
    
    @property
    def strategy(self):
        return self._strategy
         
    @strategy.setter
    def strategy(self, strategy):
        self._strategy = strategy
         
    def execute_task(self, *args, **kwargs):
        if self._strategy is None:      # ✅ Add this check
            raise ValueError("No strategy set")
        return self._strategy.run(*args, **kwargs)
         
    def switch_strategy(self, new_strategy):
        self.strategy = new_strategy

## Classes Testing

In [14]:
import time
# Testing cell
paths=["Market Research Report_extracted_text.json", 'PMS Market Research_extracted_text.json']
docs=JSONPreprocessor()
data=docs.process_documents_from_files(paths)
individual_documents = [ Document(page_content=pdf.page_content, metadata={"pdf_id": i})
    for i, pdf in enumerate(data) if pdf.page_content
]
chunked_docs=docs.chunk_documents(individual_documents)

✅ Total Chunks: 71


In [15]:
start=time.time()
multilingual_embedder=MultilingualEmbedder(model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32)
end=time.time()
print("Time Taken to process:  ", end-start)

Time Taken to process:   7.962954998016357


In [16]:
multilingual_embedder.batch_size

32

In [17]:
start=time.time()
llm=OLLAMA_LLM('llama3:8b','llm_cache').load_model()
end=time.time()
print("Time Taken to process:  ", end-start)

Time Taken to process:   0.004022121429443359


  model = Ollama(model=self.model_name, temperature=0.3, num_ctx=4096)


In [18]:
# start=time.time()
# basic_fais=FAISSBasic(multilingual_embedder)
# basic_fais.create_vector_store(chunked_docs)
# end=time.time()
# print("Time Taken to process:  ", end-start)

In [19]:
start=time.time()
fais_improved = FAISSImproved()
fais_improved.set_embedder_model(multilingual_embedder)
fais_improved.create_vector_store(chunked_docs)
end=time.time()
print("Time Taken to process:  ", end-start)

[FAISS] Created index with 71 vectors of dim 384
Time Taken to process:   58.90743017196655


#### Strategy implementation

In [20]:
processor = TaskProcessor()

In [21]:
start=time.time()
chatting_strategy = ChattingStrategy(llm, fais_improved, multilingual_embedder)
summarization_strategy = SummarizationStrategy(llm)
question_strategy = QuestionStrategy(llm)
processor = TaskProcessor()
end=time.time()
print("Time Taken to process:  ", end-start)

Time Taken to process:   0.000865936279296875


In [22]:
processor.strategy=chatting_strategy 
processor.execute_task("Which translator charges users with credits?")
end=time.time()
print("Time Taken to process:  ", end-start)

[DEBUG] Query embedding final shape: (1, 384)
[DEBUG] Index dimension: 384
Parsed response: {'answer': 'Doctranslate.io charges users with credits.', 'reasoning': 'I based my answer on the pricing plans provided in Source 1. According to the table, Doctranslate.io offers different subscription plans that charge users with translation credits. The plans include Topup -50, Topup -120, Topup -260, and Topup -750, which provide varying amounts of translation credits. This indicates that Doctranslate.io uses a credit-based system for charging its users.', 'sources': [1], 'raw_response': 'RESPONSE:\nDoctranslate.io charges users with credits.\n\nREASONING:\nI based my answer on the pricing plans provided in Source 1. According to the table, Doctranslate.io offers different subscription plans that charge users with translation credits. The plans include Topup -50, Topup -120, Topup -260, and Topup -750, which provide varying amounts of translation credits. This indicates that Doctranslate.io 

In [23]:
processor.strategy=summarization_strategy

In [24]:
print(processor.strategy)

<__main__.SummarizationStrategy object at 0x33d513880>


In [25]:
start_time = time.time()

for index, document in enumerate(individual_documents):
    doc_start_time = time.time()
    
    processor.execute_task(document)
    
    doc_end_time = time.time()
    print(f"Document[{index}] processing time: {doc_end_time - doc_start_time:.2f} seconds")

end_time = time.time()
print("Total processing time:", end_time - start_time, "seconds")


**Main Topic:** Market Research Report: Analysis of Document Translation Tools for Multilingual Document Translation

**Key Points:**

* The report evaluates leading document translation tools that support PDF, Word, Excel, and scanned images while preserving layout and formatting.
* The focus is on tools that handle Arabic, French, and English languages, catering to both B2B and B2C markets.
* The key features evaluated include layout preservation, Arabic support, OCR support, pricing model, and translation accuracy.

**Details:**

The report tested six document translation tools: Doctranslator, Doctranslate.io, TranslaDocs, SmallPDF, Doclingo, and DeepL. Each tool was subjected to a series of test cases, including text-based documents, scanned documents, tables (as text), tables (as images), and scanned documents with stamps or signatures.

The report highlights the strengths and weaknesses of each tool, including:

* Doctranslator: Offers free services with good layout preservation 

In [26]:
processor.strategy=question_strategy
print(processor.strategy)

<__main__.QuestionStrategy object at 0x33d513790>


In [27]:
start=time.time()
for index, document in enumerate(individual_documents):
    doc_start_time = time.time()
    processor.execute_task(document,20)
    doc_end_time = time.time()
    print(f"Document[{index+1}] processing time: {doc_end_time - doc_start_time:.2f} seconds")
end=time.time()
print("Time Taken to process:  ", end-start)

Here are 20 simple and basic questions that test understanding of key facts and definitions based on the provided text:

Q1: What is the main topic of this market research report?
Q2: Which languages are supported by the document translation tools evaluated in this report?
Q3: What are the key features evaluated for each tool in this report?
Q4: What is the focus of this report, and what markets does it cater to?
Q5: Which tool lacks OCR capabilities and struggles with mixed language content?
Q6: What is Doctranslator's strength regarding layout preservation?
Q7: Does Doctranslate.io support Arabic language?
Q8: What is TranslaDocs' limitation in terms of language support?
Q9: Is SmallPDF a free service?
Q10: Which tool excels in OCR performance for English but not for Arabic?
Q11: What is the recommended feature to enable users to add text, shapes, images, and freehand annotations to PDFs?
Q12: What is the purpose of the "Split PDF" feature?
Q13: Can users upload standalone images for

In [28]:
start=time.time()
for index, document in enumerate(individual_documents):
    doc_start_time = time.time()
    processor.execute_task(document,20,'challenshing')
    doc_end_time = time.time()
    print(f"Document[{index+1}] processing time: {doc_end_time - doc_start_time:.2f} seconds")
end=time.time()
print("Time Taken to process:  ", end-start)

'challenshing' matched to 'challenging' (87% confidence)
Here are the 20 complex questions based on the provided text:

Q1: What are the key features evaluated in the market research report to assess document translation tools?

Q2: Which tool lacks OCR capabilities and struggles with mixed language content?

Q3: How does Doctranslator handle Arabic numerals during translation?

Q4: What is the primary limitation of TranslaDocs, according to the report?

Q5: Can you describe the performance of Doclingo in translating complex image tables?

Q6: Why does DeepL not support Arabic OCR, despite its excellent English OCR capabilities?

Q7: How do the tools differ in their pricing models and what are the implications for users?

Q8: What is the significance of preserving layout and formatting during document translation?

Q9: Can you explain how Doctranslate.io handles mixed RTL/LTR directions during translation?

Q10: What are the recommended features to enhance document translation tools, a

In [29]:
start=time.time()
for index, document in enumerate(individual_documents):
    doc_start_time = time.time()
    processor.execute_task(document,20,'simple')
    doc_end_time = time.time()
    print(f"Document[{index+1}] processing time: {doc_end_time - doc_start_time:.2f} seconds")
end=time.time()
print("Time Taken to process:  ", end-start)

Here are 20 simple and basic questions that test understanding of key facts and definitions based on the provided text:

Q1: What is the main focus of this market research report?
Q2: Which languages are supported by the document translation tools evaluated in this report?
Q3: What are the key features evaluated for each tool in this report?
Q4: What is the purpose of OCR (Optical Character Recognition) support in document translation tools?
Q5: Which tool lacks OCR capabilities and struggles with mixed language content?
Q6: What is the main limitation of Doctranslator, according to this report?
Q7: How does Doctranslate.io handle Arabic numerals?
Q8: What is the primary advantage of Doclingo's OCR support?
Q9: Which tool does not support Arabic language and has no OCR capabilities?
Q10: What is the main limitation of DeepL's OCR support, according to this report?
Q11: How many tools were tested in this market research report?
Q12: What are the five recommended features for enhancing d