# RAG Pipeline

## Imports and Data Loading

In [1]:
import pandas as pd
import numpy as np
import json
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json
import re
from sentence_transformers import SentenceTransformer
import torch
import faiss
import ollama



In [2]:
# Load the extracted report text (a JSON list with one string per entry).
# A context manager guarantees the file handle is closed, and the explicit
# UTF-8 encoding keeps non-ASCII characters (e.g. bullet glyphs) intact
# regardless of the platform's default encoding.
with open('Market Research Report_extracted_text.json', encoding='utf-8') as json_file:
    file = json.load(json_file)

print(len(file))

# One row per extracted entry; the raw text lives in the `text` column.
df = pd.DataFrame(file, columns=['text'])
df

9


Unnamed: 0,text
0,\n \nMARKET RESEARCH \nREPORT: ANALYSIS OF \...
1,\nIntroduction \nThis market research report...
2,\nExecutive Summary \nThis section provides ...
3,\n4. Merge PDF: Enable merging multiple PDFs...
4,\n• Process Modes \n1. Professional Transl...
5,\nResults \nComparison Table \n \n \n \n \...
6,\nPricing Plans for Paid Tool s \nTool Plan ...
7,\n \nDetailed Findings \nDoctranslator \n• ...
8,\nTranslaDocs \n• Cons: \no No OCR support...


## Text Preprocessing

In [3]:
# Display the raw DataFrame (one row per extracted text entry) before cleaning.
df

Unnamed: 0,text
0,\n \nMARKET RESEARCH \nREPORT: ANALYSIS OF \...
1,\nIntroduction \nThis market research report...
2,\nExecutive Summary \nThis section provides ...
3,\n4. Merge PDF: Enable merging multiple PDFs...
4,\n• Process Modes \n1. Professional Transl...
5,\nResults \nComparison Table \n \n \n \n \...
6,\nPricing Plans for Paid Tool s \nTool Plan ...
7,\n \nDetailed Findings \nDoctranslator \n• ...
8,\nTranslaDocs \n• Cons: \no No OCR support...


In [4]:
def clean_text(text):
    """Normalise whitespace in a piece of extracted text.

    Every run of whitespace (spaces, tabs AND newlines) is collapsed into a
    single space and the result is stripped, so the returned string contains
    no line breaks at all.

    Args:
        text: Value to clean; coerced with ``str()`` so non-string cells
            (e.g. ``None`` or numbers) do not raise.

    Returns:
        The whitespace-normalised string.
    """
    text = str(text)  # Ensure it's a string
    # NOTE: a previous `\n{3,}` -> `\n\n` substitution was removed here. It was
    # dead code, because the `\s+` collapse below also matches newlines and
    # flattened whatever paragraph breaks it had preserved.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Add a whitespace-normalised copy of every entry next to the raw text.
df['clean_text'] = df['text'].map(clean_text)


In [5]:
# Display the DataFrame again to compare `text` with the new `clean_text` column.
df

Unnamed: 0,text,clean_text
0,\n \nMARKET RESEARCH \nREPORT: ANALYSIS OF \...,MARKET RESEARCH REPORT: ANALYSIS OF DOCUMENT T...
1,\nIntroduction \nThis market research report...,Introduction This market research report analy...
2,\nExecutive Summary \nThis section provides ...,Executive Summary This section provides a high...
3,\n4. Merge PDF: Enable merging multiple PDFs...,4. Merge PDF: Enable merging multiple PDFs int...
4,\n• Process Modes \n1. Professional Transl...,• Process Modes 1. Professional Translation: T...
5,\nResults \nComparison Table \n \n \n \n \...,Results Comparison Table Tool Layout Preservat...
6,\nPricing Plans for Paid Tool s \nTool Plan ...,Pricing Plans for Paid Tool s Tool Plan Name C...
7,\n \nDetailed Findings \nDoctranslator \n• ...,Detailed Findings Doctranslator • Pros: o Free...
8,\nTranslaDocs \n• Cons: \no No OCR support...,TranslaDocs • Cons: o No OCR support . o No Ar...


## Chunking 

In [6]:
def _count_words(text):
    """Length function for the splitter: whitespace-separated word count."""
    return len(text.split())


# Split points handed to the splitter, listed from the coarsest
# (blank lines) down to the finest (single characters).
_SPLIT_SEPARATORS = ["\n\n\n", "\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""]

# Because the length function counts words, `chunk_size` and `chunk_overlap`
# below are expressed in words rather than characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=_SPLIT_SEPARATORS,
    length_function=_count_words,
    chunk_size=200,
    chunk_overlap=30,
    keep_separator=False,
    strip_whitespace=True,
    add_start_index=True,
)

In [7]:
# Stitch the cleaned entries into one document, one entry per line,
# so the splitter can work across entry boundaries.
cleaned_entries = df['clean_text'].tolist()
combined_text = "\n".join(cleaned_entries)
print(combined_text)

MARKET RESEARCH REPORT: ANALYSIS OF DOCUMENT TRANSLATION TOOLS Evaluating Leading Solutions for Multilingual Document Translation Mah inour Mohammad
Introduction This market research report analyzes competitors offering document translation tools that support PDF, Word, Excel, and scanned images while preserving layout and formatting. The focus is on tools that handle Arabic, French, and English languages, catering to both B2B and B2C markets. The key features evaluated include layout preservation, Arabic support and quality, translation accuracy and speed, pricing model, and Optical Character Recognition (OCR) support. To assess these tools, a series of test cases were conducted for each language, including: 1. Text -based documents: Evaluating basic translation accuracy, layout preservation, handling of number lists, bullet points, and right -to-left (RTL) and left -to-right (LTR) conversions. 2. Scanned documents: Testing OCR performance, particularly for Arabic, and preservation of

In [8]:
# Split the combined document into chunks using the splitter configured above.
chunks = text_splitter.split_text(combined_text)
# Report how many chunks were produced as a quick sanity check.
print(f"Created {len(chunks)} chunks")

Created 16 chunks


## Embeddings

In [9]:
# Pick the best available torch backend: Apple MPS first, then CUDA, else CPU.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Multilingual sentence-embedding model; weights are cached in a local folder.
model = SentenceTransformer(
    'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
    device=device,
    cache_folder='embedder_model_cache',
)

In [10]:
def embed_chunks(chunks, model):
    """Embed every chunk with ``model`` and index both texts and vectors.

    Args:
        chunks: Iterable of chunk strings.
        model: Object exposing ``encode(text, convert_to_numpy=...,
            normalize_embeddings=...)``.

    Returns:
        ``(chunks_embed, chunks_text)``: two dicts keyed by the chunk's
        position in ``chunks``, holding the normalised embedding and the
        original text respectively.
    """
    # Position -> text, in input order.
    chunks_text = dict(enumerate(chunks))

    # Position -> embedding; chunks are encoded one at a time, in order.
    chunks_embed = {
        position: model.encode(text, convert_to_numpy=True, normalize_embeddings=True)
        for position, text in chunks_text.items()
    }

    return chunks_embed, chunks_text
# Embed all chunks; both dicts are keyed by the chunk's position in `chunks`.
chunks_embed, chunks_text = embed_chunks(chunks, model)

In [11]:
# Inspect the chunk-index -> chunk-text mapping returned by embed_chunks.
chunks_text

{0: 'MARKET RESEARCH REPORT: ANALYSIS OF DOCUMENT TRANSLATION TOOLS Evaluating Leading Solutions for Multilingual Document Translation Mah inour Mohammad',
 1: 'Introduction This market research report analyzes competitors offering document translation tools that support PDF, Word, Excel, and scanned images while preserving layout and formatting. The focus is on tools that handle Arabic, French, and English languages, catering to both B2B and B2C markets. The key features evaluated include layout preservation, Arabic support and quality, translation accuracy and speed, pricing model, and Optical Character Recognition (OCR) support. To assess these tools, a series of test cases were conducted for each language, including: 1. Text -based documents: Evaluating basic translation accuracy, layout preservation, handling of number lists, bullet points, and right -to-left (RTL) and left -to-right (LTR) conversions. 2. Scanned documents: Testing OCR performance, particularly for Arabic, and pre

In [12]:
# Inspect the chunk-index -> embedding-vector mapping returned by embed_chunks.
chunks_embed

{0: array([-0.02015074,  0.0240155 , -0.01317234, -0.05232674, -0.05509624,
        -0.02333021, -0.06112286, -0.04024085, -0.00325157, -0.01213835,
        -0.02693729, -0.04826334,  0.07293251, -0.02291534,  0.02460909,
        -0.0544789 ,  0.01821917,  0.06841396, -0.01176495, -0.05845089,
         0.01552905,  0.04664472,  0.0293778 , -0.04112574, -0.03306042,
        -0.05064205,  0.01642821, -0.02469871,  0.02860245, -0.05207454,
         0.00932558,  0.14819932,  0.00628153,  0.03053717, -0.05200057,
         0.0946403 , -0.03636624,  0.06938993,  0.02015471,  0.00702015,
        -0.04522998,  0.00932128,  0.01902383,  0.02717949, -0.01786106,
         0.02085477, -0.03127649,  0.0565708 , -0.07617199,  0.0388506 ,
        -0.07956774, -0.02109335, -0.06533948,  0.044004  ,  0.01766661,
        -0.09049686,  0.04742852,  0.05969275,  0.02338422, -0.0207766 ,
         0.01948667,  0.04686711, -0.07097751,  0.02267978, -0.03179061,
        -0.03635461,  0.01067665,  0.0231537 , -

In [13]:
# Number of embedded chunks; expected to match the chunk count printed after splitting.
len(chunks_embed.keys())

16

## FAISS Implementation

In [14]:
class FAISS:
    """Convenience wrapper around a flat inner-product FAISS index.

    All search/save methods operate on the index object passed to them
    explicitly (as returned by ``create_faiss_index`` / ``load_faiss_index``),
    not on ``self.index``.

    The index type is ``IndexFlatIP``, so the scores returned by a search are
    inner products: HIGHER means MORE similar. When both the stored vectors
    and the query are L2-normalised (as done by the ``encode`` calls in this
    class), the inner product is the cosine similarity.
    """

    def __init__(self, dimension):
        """Remember the embedding dimension and allocate an empty index.

        Args:
            dimension: Length of each embedding vector.
        """
        self.dimension = dimension
        self.index = faiss.IndexFlatIP(dimension)

    def create_faiss_index(self, chunks_embed):
        """
        Create FAISS index from embeddings dictionary

        Args:
            chunks_embed: Dict mapping a chunk id to its embedding vector.

        Returns:
            ``(index, indices)`` where ``indices[row]`` is the chunk id stored
            at FAISS row ``row``.

        Raises:
            ValueError: If ``chunks_embed`` is empty.
        """
        # Convert dict to arrays; `indices` remembers which key sits in which row.
        indices = list(chunks_embed.keys())
        if not indices:
            # Fail with a clear message instead of an opaque IndexError on shape[1].
            raise ValueError("chunks_embed is empty; there is nothing to index")
        embeddings = np.array([chunks_embed[idx] for idx in indices]).astype('float32')

        # Create FAISS index sized from the actual embeddings
        dimension = embeddings.shape[1]
        index = faiss.IndexFlatIP(dimension)

        # Add embeddings
        index.add(embeddings)

        print(f"Created FAISS index with {index.ntotal} vectors of dimension {dimension}")
        return index, indices

    def search_faiss(self, faiss_index, index_mapping, query_embedding, top_k=5):
        """
        Search FAISS index

        Args:
            faiss_index: Index to query.
            index_mapping: Sequence mapping FAISS row number -> chunk id.
            query_embedding: A single query vector (1-D, or already shaped
                ``(1, dimension)``).
            top_k: Maximum number of hits to return.

        Returns:
            List of ``(chunk_id, score)`` tuples in the order returned by the
            index. ``score`` is the inner product (higher is more similar).
        """
        if top_k <= 0:
            return []

        # Always hand FAISS a (1, dimension) float32 matrix, whether the
        # caller passed a flat vector or a single-row matrix.
        query_matrix = np.asarray(query_embedding, dtype='float32').reshape(1, -1)
        distances, indices = faiss_index.search(query_matrix, top_k)

        results = []
        # Iterate over what the index actually returned rather than assuming
        # exactly `top_k` columns.
        for faiss_idx, score in zip(indices[0], distances[0]):
            # -1 marks an empty slot (fewer than top_k vectors in the index).
            if faiss_idx != -1:
                results.append((index_mapping[faiss_idx], score))

        return results

    def setup_faiss_search(self, chunks, embeddings_dict):
        """
        Set up FAISS search system

        Args:
            chunks: Sequence of chunk texts.
            embeddings_dict: Dict mapping chunk id -> embedding vector.

        Returns:
            ``(faiss_index, index_mapping, chunks_dict)`` where ``chunks_dict``
            maps each chunk's position in ``chunks`` to its text.
        """
        # Create chunks dictionary
        chunks_dict = dict(enumerate(chunks))

        # Create FAISS index
        faiss_index, index_mapping = self.create_faiss_index(embeddings_dict)

        return faiss_index, index_mapping, chunks_dict

    def search_chunks(self, faiss_index, index_mapping, chunks_dict, query_embedding, top_k=5):
        """
        Search for similar chunks

        Returns:
            List of dicts with keys ``chunk_id``, ``text``, ``distance`` and
            ``similarity``. ``distance`` is kept for backward compatibility
            and holds the raw inner-product score returned by the index;
            ``similarity`` is the same score as a plain ``float``.
        """
        # Search FAISS
        results = self.search_faiss(faiss_index, index_mapping, query_embedding, top_k)

        # Format results
        formatted_results = []
        for chunk_idx, distance in results:
            formatted_results.append({
                'chunk_id': chunk_idx,
                'text': chunks_dict[chunk_idx],
                'distance': distance,
                # The index is inner-product based, so the score already IS
                # the similarity. The previous `1 / (1 + distance)` treated it
                # as an L2 distance, which inverted the ranking and divided
                # by zero for a score of -1.
                'similarity': float(distance)
            })

        return formatted_results

    def search_with_text_query(self, faiss_index, index_mapping, chunks_dict, model, query_text, top_k=5):
        """
        Search using text query (encode query first)

        The query is encoded with ``normalize_embeddings=True`` so that its
        inner product with normalised chunk embeddings is a cosine similarity.
        """
        query_embedding = model.encode(query_text, convert_to_numpy=True, normalize_embeddings=True)
        return self.search_chunks(faiss_index, index_mapping, chunks_dict, query_embedding, top_k)

    def save_faiss_index(self, faiss_index, file_index_name):
        """Save FAISS index to disk as ``<file_index_name>.faiss``."""
        faiss.write_index(faiss_index, f"{file_index_name}.faiss")
        print(f"Index saved to {file_index_name}.faiss")

    def load_faiss_index(self, file_index_name):
        """Load FAISS index from ``<file_index_name>.faiss`` and return it."""
        index = faiss.read_index(f"{file_index_name}.faiss")
        print(f"Index loaded from {file_index_name}.faiss")
        return index

    def quick_search_test(self, faiss_index, index_mapping, chunks_dict, model, prompt):
        """Quick test to verify search works.

        Encodes ``prompt``, prints the top 5 hits and returns them as a list
        of ``(chunk_id, score)`` tuples.
        """
        query_embedding = model.encode(prompt, convert_to_numpy=True, normalize_embeddings=True)
        results = self.search_faiss(faiss_index, index_mapping, query_embedding, top_k=5)

        print(f"Search results for '{prompt}':")
        # The value labelled "distance" below is the inner-product score
        # (higher = more similar), not a metric distance.
        for chunk_id, distance in results:
            chunk_text = chunks_dict[chunk_id]
            print(f"Chunk {chunk_id} (distance: {distance:.4f}): {chunk_text[:800]}...")

        return results

In [15]:
# Print the chunk-index -> embedding dict as plain text before building the index.
print(chunks_embed)

{0: array([-0.02015074,  0.0240155 , -0.01317234, -0.05232674, -0.05509624,
       -0.02333021, -0.06112286, -0.04024085, -0.00325157, -0.01213835,
       -0.02693729, -0.04826334,  0.07293251, -0.02291534,  0.02460909,
       -0.0544789 ,  0.01821917,  0.06841396, -0.01176495, -0.05845089,
        0.01552905,  0.04664472,  0.0293778 , -0.04112574, -0.03306042,
       -0.05064205,  0.01642821, -0.02469871,  0.02860245, -0.05207454,
        0.00932558,  0.14819932,  0.00628153,  0.03053717, -0.05200057,
        0.0946403 , -0.03636624,  0.06938993,  0.02015471,  0.00702015,
       -0.04522998,  0.00932128,  0.01902383,  0.02717949, -0.01786106,
        0.02085477, -0.03127649,  0.0565708 , -0.07617199,  0.0388506 ,
       -0.07956774, -0.02109335, -0.06533948,  0.044004  ,  0.01766661,
       -0.09049686,  0.04742852,  0.05969275,  0.02338422, -0.0207766 ,
        0.01948667,  0.04686711, -0.07097751,  0.02267978, -0.03179061,
       -0.03635461,  0.01067665,  0.0231537 , -0.12753163,  

In [16]:
# Build the index from the chunk embeddings, round-trip it through disk,
# then run a smoke-test query against the reloaded index.
index_name = "market_research_report_index"
test_prompt = "What are the recommended features to be added to the project?"

embedding_dim = chunks_embed[0].shape[0]
faiss_instance = FAISS(dimension=embedding_dim)
faiss_index, index_mapping, chunks_dict = faiss_instance.setup_faiss_search(chunks, chunks_embed)

faiss_instance.save_faiss_index(faiss_index, index_name)
faiss_index = faiss_instance.load_faiss_index(index_name)

# Last expression of the cell: the (chunk_id, score) list is shown as output.
faiss_instance.quick_search_test(faiss_index, index_mapping, chunks_dict, model, test_prompt)

Created FAISS index with 16 vectors of dimension 384
Index saved to market_research_report_index.faiss
Index loaded from market_research_report_index.faiss
Search results for 'What are the recommended features to be added to the project?':
Chunk 4 (distance: 0.4394): Based on the evaluation, several opportunities exist to enhance document translation tools to better serve B2B and B2C markets. The following recommended features aim to address ga ps identified in the tested tools and improve functionality, user experience, and translation quality: • Editing Capabilities 1. PDF Editing: Enable users to add text, shapes, images, and freehand annotations to PDFs, facilitating document customization and correction post -translation. 2. PDF Annotations: Provide tools to write, draw, and highlight directly on PDFs, enhancing collaboration and docu ment review processes. 3. Split PDF: Allow users to split a PDF into multiple PDFs, with each page saved as a separate file. Include advanced option

[(4, 0.43941298),
 (8, 0.38262424),
 (1, 0.35619342),
 (10, 0.3311379),
 (6, 0.2651277)]

## LLM Integration

In [17]:
class RAG:
    """Retrieval-augmented question answering on top of a FAISS chunk index.

    Retrieves the most similar chunks for a question, asks an Ollama-served
    LLM to answer using only those chunks in a fixed RESPONSE / REASONING /
    SOURCES layout, and parses that layout into a dict.
    """

    # Section headers the prompt asks the model to emit -> key in the parsed result.
    _SECTION_HEADERS = (
        ('RESPONSE:', 'response'),
        ('REASONING:', 'reasoning'),
        ('SOURCES:', 'sources'),
    )

    def __init__(self, model_name="qwen3:8b"):
        """
        Args:
            model_name: Name of the Ollama model to query. Defaults to the
                model this notebook was developed against.
        """
        self.model_name = model_name

    def get_relevant_chunks(self, query, faiss_index, index_mapping, chunks_dict, faiss_instance, embedding_model, top_k=3):
        """Get relevant chunks for a query.

        Returns:
            Whatever ``faiss_instance.search_with_text_query`` returns: a list
            of dicts that include at least ``chunk_id`` and ``text``.
        """
        return faiss_instance.search_with_text_query(
            faiss_index,
            index_mapping,
            chunks_dict,
            embedding_model,
            query,
            top_k
        )

    def clean_response(self, response_text):
        """Remove <think> blocks and stray markup tags from the model output.

        Only tag-shaped tokens (``<name>``, ``</name>``, ``<name/>``) are
        stripped. Angle brackets used as comparison operators in ordinary
        text (e.g. ``x < 10 and y > 5``) are left untouched.
        """
        # Drop complete reasoning blocks, including their content.
        cleaned = re.sub(r'<think>.*?</think>', '', response_text, flags=re.DOTALL)
        # Drop leftover tags. A tag must start with a letter right after
        # `<` or `</`, so "< 10 ... >" in prose is not mistaken for markup.
        cleaned = re.sub(r'</?[A-Za-z][A-Za-z0-9_-]*\s*/?>', '', cleaned)
        # Normalise any blank-line run (possibly containing spaces) to one blank line.
        cleaned = re.sub(r'\n\s*\n', '\n\n', cleaned.strip())

        return cleaned

    def generate_structured_answer(self, query, relevant_chunks):
        """Generate structured answer with response, reasoning, and sources.

        Args:
            query: The user's question.
            relevant_chunks: Retrieved chunk dicts with ``chunk_id`` and ``text``.

        Returns:
            The dict produced by ``parse_structured_response``.
        """
        # Label every chunk with its id so the model can cite it by number.
        context_with_sources = "".join(
            f"[Source {chunk['chunk_id']}]: {chunk['text']}\n\n"
            for chunk in relevant_chunks
        )

        prompt = f"""You are a helpful assistant. Analyze the context and provide a structured response.

        Context:
        {context_with_sources}

        Question: {query}

        Please provide your response in exactly this format:

        RESPONSE:
        [Your direct, concise answer to the question]

        REASONING:
        [Brief explanation of how you arrived at this answer using the sources]

        SOURCES:
        [List the source numbers that support your answer, e.g., 1, 2, 3]

        Important: Do not include any <think> tags or internal reasoning. Be direct and concise."""

        llm_output = ollama.generate(
            model=self.model_name,
            prompt=prompt,
            options={
                'temperature': 0.3,
                # Context window size in tokens; it has to hold the
                # instructions, the retrieved chunks, the question and the answer.
                'num_ctx': 4096,
            }
        )

        response_text = self.clean_response(llm_output['response'])
        return self.parse_structured_response(response_text, relevant_chunks)

    def parse_structured_response(self, response_text, relevant_chunks):
        """Parse the structured response into separate components.

        Lines are assigned to the most recent RESPONSE:/REASONING:/SOURCES:
        header (case-insensitive); text before the first header is ignored.
        If the model emitted no recognised header at all, the whole text is
        returned as the response instead of an empty answer.

        Returns:
            Dict with ``response`` and ``reasoning`` (str), ``sources``
            (unique cited ids in citation order), ``source_texts`` (texts of
            the retrieved chunks that were cited) and ``raw_sources``
            (``"Chunk <id>"`` labels for those same chunks).
        """
        sections = {'response': '', 'reasoning': '', 'sources': ''}
        collected = {}          # section key -> list of content lines
        current_section = None

        for raw_line in response_text.split('\n'):
            line = raw_line.strip()
            upper_line = line.upper()

            header_hit = next(
                ((header, key) for header, key in self._SECTION_HEADERS
                 if upper_line.startswith(header)),
                None,
            )
            if header_hit is not None:
                header, current_section = header_hit
                # Text on the header line itself is the first content line.
                # A repeated header restarts (overwrites) its section.
                collected[current_section] = [line[len(header):].strip()]
            elif current_section and line:
                collected[current_section].append(line)

        for key, content_lines in collected.items():
            sections[key] = '\n'.join(content_lines).strip()

        if not collected:
            # The model ignored the requested layout: surface its text rather
            # than silently returning an empty answer.
            sections['response'] = response_text.strip()

        # Unique source ids, in the order the model cited them.
        source_ids = list(dict.fromkeys(
            int(number) for number in re.findall(r'\d+', sections['sources'])
        ))
        cited = set(source_ids)

        # Only chunks that were actually retrieved can be resolved to text.
        cited_chunks = [chunk for chunk in relevant_chunks if chunk['chunk_id'] in cited]

        return {
            'response': sections['response'],
            'reasoning': sections['reasoning'],
            'sources': source_ids,
            'source_texts': [chunk['text'] for chunk in cited_chunks],
            'raw_sources': [f"Chunk {chunk['chunk_id']}" for chunk in cited_chunks]
        }

    def generate_answer(self, query, relevant_chunks):
        """Original method - now returns structured response"""
        return self.generate_structured_answer(query, relevant_chunks)

    def ask_rag_question(self, query, faiss_index, index_mapping, chunks_dict, faiss_instance, embedding_model, top_k=3):
        """Enhanced RAG question answering with structured response.

        Retrieves ``top_k`` chunks for ``query``, generates a structured
        answer from them and returns it together with retrieval metadata.
        """
        relevant_chunks = self.get_relevant_chunks(
            query, faiss_index, index_mapping, chunks_dict,
            faiss_instance, embedding_model, top_k
        )

        result = self.generate_structured_answer(query, relevant_chunks)

        return {
            'answer': result['response'],           # Clean, direct answer
            'reasoning': result['reasoning'],       # Explanation
            'sources': result['sources'],           # Source IDs [1, 2, 3]
            'source_texts': result['source_texts'], # Actual source content
            'chunks_used': len(relevant_chunks),
            'raw_sources': result['raw_sources']    # ["Chunk 1", "Chunk 2"]
        }

    def ask_simple_question(self, query, faiss_index, index_mapping, chunks_dict, faiss_instance, embedding_model, top_k=3):
        """Simple version that returns only the clean answer"""
        result = self.ask_rag_question(query, faiss_index, index_mapping, chunks_dict, faiss_instance, embedding_model, top_k)

        return result['answer']

In [18]:
# Ask one question end-to-end: retrieve chunks, query the LLM, unpack the result.
question = "What are the recommended features to be added to the project?"

rag = RAG()
result = rag.ask_rag_question(
    question, faiss_index, index_mapping, chunks_dict, faiss_instance, model
)

# Pull the individual fields out of the structured result.
answer, reasoning, sources, source_texts = (
    result[field] for field in ('answer', 'reasoning', 'sources', 'source_texts')
)

print("ANSWER:", answer)
print("REASONING:", reasoning)
print("SOURCES:", sources)

ANSWER: The recommended features include PDF editing/annotations, real-time translation comparison, flexible OCR options, domain-specific translation support, layout preservation, RTL/LTR handling, and table alignment/OCR for scanned documents.
REASONING: The sources highlight gaps in current tools, such as limited editing capabilities (Source 4), lack of real-time review (Source 8), insufficient OCR flexibility (Sources 1 and 8), and challenges with layout preservation and RTL/LTR conversions (Source 1). These features address these gaps.
SOURCES: [1, 4, 8]


In [19]:
# Display the full structured result dict, including the cited source texts.
result

{'answer': 'The recommended features include PDF editing/annotations, real-time translation comparison, flexible OCR options, domain-specific translation support, layout preservation, RTL/LTR handling, and table alignment/OCR for scanned documents.',
 'reasoning': 'The sources highlight gaps in current tools, such as limited editing capabilities (Source 4), lack of real-time review (Source 8), insufficient OCR flexibility (Sources 1 and 8), and challenges with layout preservation and RTL/LTR conversions (Source 1). These features address these gaps.',
 'sources': [1, 4, 8],
 'source_texts': ["Based on the evaluation, several opportunities exist to enhance document translation tools to better serve B2B and B2C markets. The following recommended features aim to address ga ps identified in the tested tools and improve functionality, user experience, and translation quality: • Editing Capabilities 1. PDF Editing: Enable users to add text, shapes, images, and freehand annotations to PDFs, f