## Imports

In [1]:
import json
import pandas as pd
import re
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.llms import Ollama

from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
import numpy as np
import faiss
from langchain.docstore import InMemoryDocstore
import json
from langchain_community.vectorstores import FAISS
from abc import ABC, abstractmethod

## Abstract classes

### Preprocessing class

In [2]:
from abc import ABC, abstractmethod

class BasePreprocessor(ABC):
    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=200,
            chunk_overlap=50, 
            length_function=lambda x: len(x.split()),
            separators=["\n\n\n", "\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""],
            keep_separator=False,
            add_start_index=True,
            strip_whitespace=True
        )

    @abstractmethod
    def load_and_preprocess_data(self, file_path):
        pass

    @abstractmethod
    def process_documents_from_files(self, file_paths):
        pass


    def clean_text(self, text):
        return re.sub(r'\s+', ' ', re.sub(r'\n{3,}', '\n\n', str(text))).strip()



    def chunk_documents(self, individual_documents):
        chunked_docs = []
        for doc in individual_documents:
            chunks = self.text_splitter.split_text(doc.page_content)
            for i, chunk in enumerate(chunks):
                chunked_docs.append(
                    Document(
                        page_content=chunk,
                        metadata={
                            "pdf_id": doc.metadata["pdf_id"],
                            "chunk_id": i
                        }
                    )
                )
        print(f"✅ Total Chunks: {len(chunked_docs)}")
        return chunked_docs


In [3]:
class JSONPreprocessor(BasePreprocessor):
    def load_and_preprocess_data(self, file_path):
        with open(file_path, 'r') as f:
            raw_data = json.load(f)
        clean_texts = [self.clean_text(entry) for entry in raw_data if isinstance(entry, str)]
        return "\n".join(clean_texts)
    def process_documents_from_files(self, file_paths):
        documents = []

        for i, file_path in enumerate(file_paths):
            text = self.load_and_preprocess_data(file_path).strip()
            documents.append(
                Document(page_content=text, metadata={"pdf_id": i})
            )

        return documents


### Embeddings Abstract class

In [4]:
class Embedder(ABC): 
    def __init__(self, model_name, batch_size):
        self.model_name = model_name
        self.batch_size = batch_size
        
        self.device = (
            'cuda' if torch.cuda.is_available()
            else 'mps' if torch.backends.mps.is_available()
            else 'cpu'
        )
        self.embedding_model = HuggingFaceEmbeddings(model_name=model_name,model_kwargs={'device': self.device},encode_kwargs={'normalize_embeddings': True},multi_process=True,
                                                     show_progress=True,cache_folder='embedder_model_cache')

    @abstractmethod
    def embed_documents(self, documents):
        pass

    @abstractmethod
    def batch_embed(self, texts, batch_size=None): 
        pass

class MultilingualEmbedder(Embedder): 
    def __init__(self, model_name, batch_size):
        super().__init__(model_name, batch_size)

    def embed_documents(self, documents):
        return self.batch_embed(documents, batch_size=self.batch_size)

    def batch_embed(self, texts, batch_size=None):
        if batch_size is None:
            batch_size = self.batch_size
        
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            batch_embeddings = self.embedding_model.embed_documents(batch)
            embeddings.extend(batch_embeddings)
        
        return np.array(embeddings, dtype=np.float32)


### Faiss Abstract class

In [5]:
class VectorStoreBase(ABC):
    @abstractmethod
    def create_vector_store(self, documents, embedder_model):
        pass

    @abstractmethod
    def create_faiss_index(self, chunks_embed):
        pass

    @abstractmethod
    def search_faiss(self, faiss_index, index_mapping, query_embedding, top_k=5):
        pass

    @abstractmethod
    def setup_faiss_search(self, chunks, embeddings_dict):
        pass

    @abstractmethod
    def search_chunks(self, faiss_index, index_mapping, chunks_dict, query_embedding, top_k=5):
        pass

    @abstractmethod
    def save_faiss_index(self, faiss_index, file_index_name):
        pass

    @abstractmethod
    def load_faiss_index(self, file_index_name):
        pass


In [6]:
class FAISS(VectorStoreBase):
   def __init__(self):
       self.index = None
       self.index_mapping = None
       self.chunks_dict = None
       self.dimension = None
       self.index_type = "IndexFlatIP"  
       self.total_vectors = 0

   def create_vector_store(self, documents, embedder_model):
       texts = [doc.page_content for doc in documents]
       embeddings = embedder_model.batch_embed(texts)
       print(embeddings)

       embeddings = np.array(embeddings).astype("float32")
       self.dimension = embeddings.shape[1]

       self.index = faiss.IndexFlatIP(self.dimension)
       self.index.add(embeddings)

       self.chunks_dict = {i: text for i, text in enumerate(texts)}
       self.index_mapping = list(self.chunks_dict.keys())
       self.total_vectors = self.index.ntotal

       print(f"Created FAISS index with {self.index.ntotal} vectors of dimension {self.dimension}")
       return self.index, self.index_mapping, self.chunks_dict

   def create_faiss_index(self, chunks_embed):
       indices = list(chunks_embed.keys())
       embeddings = np.array([chunks_embed[idx] for idx in indices]).astype('float32')
       self.dimension = embeddings.shape[1]
       self.index = faiss.IndexFlatIP(self.dimension)
       self.index.add(embeddings)
       self.index_mapping = indices
       self.total_vectors = self.index.ntotal

       print(f"Created FAISS index with {self.index.ntotal} vectors of dimension {self.dimension}")
       return self.index, indices

   def search_faiss(self, faiss_index, index_mapping, query_embedding, top_k=5):
       query_embedding = np.array([query_embedding]).astype('float32')
       distances, indices = faiss_index.search(query_embedding, top_k)
       results = []
       for i in range(top_k):
           faiss_idx = indices[0][i]
           if faiss_idx != -1:
               your_idx = index_mapping[faiss_idx]
               distance = distances[0][i]
               results.append((your_idx, distance))
       return results

   def setup_faiss_search(self, chunks, embeddings_dict):
       self.chunks_dict = {i: chunk for i, chunk in enumerate(chunks)}
       faiss_index, index_mapping = self.create_faiss_index(embeddings_dict)
       return faiss_index, self.index_mapping, self.chunks_dict

   def search_chunks(self, faiss_index, chunks_dict, query_embedding, top_k=5):
       results = self.search_faiss(faiss_index, self.index_mapping, query_embedding, top_k)
       formatted_results = []
       for chunk_idx, distance in results:
           formatted_results.append({
               'chunk_id': chunk_idx,
               'text': chunks_dict[chunk_idx],
               'distance': distance,
               'similarity': 1 / (1 + distance)
           })
       return formatted_results

   def save_faiss_index(self, faiss_index, file_index_name):
       faiss.write_index(faiss_index, f"{file_index_name}.faiss")
       print(f"Index saved to {file_index_name}.faiss")

   def load_faiss_index(self, file_index_name):
       self.index = faiss.read_index(f"{file_index_name}.faiss")
       self.total_vectors = self.index.ntotal
       print(f"Index loaded from {file_index_name}.faiss")
       return self.index

## Classes Testing

In [7]:
# Testing cell
## Files is alist of Documents 
paths=["Market Research Report_extracted_text.json", 'PMS Market Research_extracted_text.json']
docs=JSONPreprocessor()
data=docs.process_documents_from_files(paths)
individual_documents = [ Document(page_content=pdf.page_content, metadata={"pdf_id": i})
    for i, pdf in enumerate(data) if pdf.page_content
]
chunked_docs=docs.chunk_documents(individual_documents)

✅ Total Chunks: 71


In [8]:
multilingual_embedder=MultilingualEmbedder(model_name="sentence-transformers/all-MiniLM-L6-v2", batch_size=32)

In [9]:
multilingual_embedder.batch_size

32

In [10]:

faiss_engine=FAISS()

In [11]:
index, index_mapping, chunks_dict=faiss_engine.create_vector_store(chunked_docs,multilingual_embedder)

[[-0.06576047  0.11895962 -0.04524058 ... -0.08136054  0.00303138
   0.01381639]
 [-0.04451338  0.04813152 -0.04466828 ... -0.0623714  -0.03471209
  -0.00216465]
 [ 0.00896442  0.0438443  -0.05153349 ... -0.01728812  0.035436
   0.0190528 ]
 ...
 [-0.12122848 -0.04116969 -0.02783496 ... -0.0202268   0.05930107
   0.02544023]
 [-0.12925795 -0.00389782 -0.05988017 ... -0.0438294   0.06989577
   0.0108128 ]
 [-0.01499175 -0.00426901 -0.02039175 ...  0.01145428  0.02771816
   0.02089966]]
Created FAISS index with 71 vectors of dimension 384


In [12]:
print(f"FAISS index contains {index.ntotal} vectors")
print(f"Index Mapping Length: {len(index_mapping)}")
print(f"Chunks Dict Length: {len(chunks_dict)}")



FAISS index contains 71 vectors
Index Mapping Length: 71
Chunks Dict Length: 71


In [13]:
query = "What is the document about?"
query_embedding = multilingual_embedder.batch_embed([query])[0]
query_embedding = np.array([query_embedding]).astype("float32")

D, I = index.search(query_embedding, k=3)
print("Search result indices:", I)
print("Distances:", D)

for i in I[0]:
    print(f"\nChunk {i}: {chunks_dict.get(i, '[Missing chunk]')[:300]}")


Search result indices: [[52 55  7]]
Distances: [[0.4538141  0.3971988  0.38543445]]

Chunk 52: Document Regis ter: Think of the Document Register as Aconex’s main repository . It's where all the project documents are stored, and it’s a must to upload any document here before it can be shared with others. When you upload a document, you add key details like: • Document number • Title • Revisio

Chunk 55: Mail in Aconex: The Mail module is for everyday communication on the project — answering questions, discussing ideas, and sharing quick updates. While you can attach documents to em ails, remember that these attachments don’t get uploaded to the document register. The file is simply attached for vie

Chunk 7: • Process Modes 1. Professional Translation: The AI automatically selects the optimal style and format for the translation based on the do cument’s context, audience, and purpose. For example, legal documents would adopt a formal tone with precise terminology, while marketing mater