## Imports

In [None]:
import json
import pandas as pd
import re
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.llms import Ollama
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
import numpy as np
import faiss
from langchain.docstore import InMemoryDocstore

from langchain_community.vectorstores import FAISS

## RAG Pipeline with LangChain

In [None]:
class LangChainRAGPipeline:
    """Retrieval-augmented question answering over a JSON text dump.

    The pipeline splits the source text into word-counted chunks, embeds the
    chunks with a HuggingFace sentence-transformer, indexes them in a FAISS
    inner-product index, and answers questions with an Ollama-served LLM that
    is prompted to reply in a ``RESPONSE / REASONING / SOURCES`` layout.

    Typical use::

        pipeline = LangChainRAGPipeline()
        docs = pipeline.load_and_preprocess_data("report.json")
        pipeline.create_vectorstore(docs, save_path="store")
        pipeline.setup_qa_chain()
        result = pipeline.ask_question("...")

    Attributes:
        llm_model_name: Name of the Ollama model (kept separate from the
            client object so ``_setup_components`` can be re-run safely).
        LLM_Model: The Ollama LLM client built by ``_setup_components``.
        top_k: Number of chunks retrieved per question.
        normalize_embeddings: Whether embeddings are L2-normalised; this is
            what makes the inner-product index behave as cosine similarity.
        device: Torch device string used by the embedding model.
    """

    # Recognises one section header per line, case-insensitively, with optional
    # markdown decoration: "RESPONSE: x", "## Reasoning:", "**Sources:** 1, 2",
    # "**Sources**: 1, 2". The back-reference only consumes a closing emphasis
    # marker when an opening one was present, so bold text in the body survives.
    _SECTION_HEADER = re.compile(
        r'^[#>\s]*(?P<em>\*\*|__)?\s*(?P<name>response|reasoning|sources)\s*'
        r'(?:(?P=em)\s*:|:\s*(?P=em)?)\s*(?P<body>.*)$',
        re.IGNORECASE,
    )

    def __init__(
        self,
        llm_model='qwen3:8b',
        embedder_model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        top_k=5,
    ):
        """Create the pipeline and eagerly build splitter, embedder and LLM.

        Args:
            llm_model: Ollama model name to generate answers with.
            embedder_model: HuggingFace model id used for embeddings.
            top_k: Number of chunks to retrieve for each question.
        """
        self.text_splitter = None
        self.embeddings = None
        self.llm_model_name = llm_model
        self.LLM_Model = None
        self.vectorstore = None
        self.qa_chain = None
        # Retrieval and generation halves of the QA chain, kept separately so
        # ask_question() can retrieve once and reuse the same documents.
        self._retriever = None
        self._answer_chain = None
        self.top_k = top_k
        self.normalize_embeddings = True
        self.device = self._select_device()
        self._setup_components(embedder_model)

    @staticmethod
    def _select_device():
        """Return the preferred torch device: 'mps', then 'cuda', then 'cpu'."""
        if torch.backends.mps.is_available():
            return 'mps'
        if torch.cuda.is_available():
            return 'cuda'
        return 'cpu'

    def _setup_components(self, embedder_model="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        """Initialize the text splitter, the embedding model and the LLM client.

        Safe to call more than once (e.g. to switch the embedder): the LLM is
        always rebuilt from ``self.llm_model_name`` rather than from
        ``self.LLM_Model``, which holds the client object after the first call.

        Args:
            embedder_model: HuggingFace model id used for embeddings.
        """
        # chunk_size / chunk_overlap are measured in whitespace-separated
        # words because of the custom length_function.
        # NOTE(review): the default MiniLM embedder reportedly truncates inputs
        # at 128 tokens, so 200-word chunks may be only partially embedded --
        # confirm against the model card before tuning chunk_size.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=200,
            chunk_overlap=30,
            length_function=lambda x: len(x.split()),
            separators=["\n\n\n", "\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""],
            keep_separator=False,
            add_start_index=True,
            strip_whitespace=True
        )

        self.embeddings = HuggingFaceEmbeddings(
            model_name=embedder_model,
            model_kwargs={'device': self.device},
            cache_folder='embedder_model_cache',
            encode_kwargs={'normalize_embeddings': self.normalize_embeddings}
        )

        self.LLM_Model = Ollama(
            model=self.llm_model_name,
            temperature=0.3,
            num_ctx=4096
        )

    def load_and_preprocess_data(self, file_path):
        """Load a JSON file and wrap its cleaned text in one LangChain Document.

        The JSON payload is loaded into a DataFrame with a ``text`` column,
        every entry is cleaned with ``_clean_text``, and the entries are joined
        with newlines into a single document.

        Args:
            file_path: Path of the JSON file to read.

        Returns:
            A one-element list holding a ``Document`` whose metadata records
            ``file_path`` under the ``source`` key.
        """
        # Explicit UTF-8: the platform default encoding is not guaranteed to
        # decode non-ASCII characters (bullets, dashes) found in extracted text.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        df = pd.DataFrame(data, columns=['text'])
        df['clean_text'] = df['text'].apply(self._clean_text)
        combined_text = "\n".join(df['clean_text'].tolist())
        return [Document(page_content=combined_text, metadata={"source": file_path})]

    def _clean_text(self, text):
        """Return ``text`` as a string with all whitespace runs collapsed.

        Every run of whitespace, newlines included, becomes a single space and
        the result is stripped. (A separate newline-collapsing step would be
        redundant, since this substitution already consumes newlines.)
        """
        return re.sub(r'\s+', ' ', str(text)).strip()

    def create_vectorstore(self, documents, save_path=None):
        """Chunk, embed and index ``documents`` in a FAISS inner-product store.

        Args:
            documents: LangChain documents to split and index.
            save_path: Optional directory to persist the store to.

        Returns:
            The FAISS vectorstore, also stored on ``self.vectorstore``.

        Raises:
            ValueError: If splitting the documents yields no chunks.
        """
        chunks = self.text_splitter.split_documents(documents)
        print(f"Created {len(chunks)} chunks")
        if not chunks:
            # Fail early with a clear message instead of an IndexError on the
            # shape of an empty embedding matrix.
            raise ValueError("No chunks were produced from the supplied documents; nothing to index.")

        # Local import: only this class needs DistanceStrategy.
        from langchain_community.vectorstores.utils import DistanceStrategy

        texts = [chunk.page_content for chunk in chunks]
        embeddings_matrix = np.asarray(self.embeddings.embed_documents(texts), dtype=np.float32)

        # Inner product equals cosine similarity when the vectors are
        # L2-normalised (see self.normalize_embeddings).
        index = faiss.IndexFlatIP(embeddings_matrix.shape[1])
        index.add(embeddings_matrix)

        doc_ids = [str(position) for position in range(len(chunks))]
        self.vectorstore = FAISS(
            embedding_function=self.embeddings,
            index=index,
            docstore=InMemoryDocstore(dict(zip(doc_ids, chunks))),
            index_to_docstore_id=dict(enumerate(doc_ids)),
            # Tell the wrapper that larger scores are better; its default is
            # Euclidean distance, which would invert score-threshold filtering
            # and relevance-score conversion for this index.
            distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
        )

        if save_path:
            self.vectorstore.save_local(save_path)
            print(f"Vectorstore saved to {save_path}")
        return self.vectorstore

    def load_vectorstore(self, load_path, *, allow_dangerous_deserialization=False):
        """Load a vectorstore previously written by ``create_vectorstore``.

        Args:
            load_path: Directory passed earlier to ``save_local``.
            allow_dangerous_deserialization: Must be set to ``True`` to load
                the pickled docstore. SECURITY: unpickling can execute
                arbitrary code, so only enable this for files you created or
                otherwise trust.

        Returns:
            The loaded FAISS vectorstore, also stored on ``self.vectorstore``.
        """
        # Local import: only this class needs DistanceStrategy.
        from langchain_community.vectorstores.utils import DistanceStrategy

        self.vectorstore = FAISS.load_local(
            load_path,
            self.embeddings,
            allow_dangerous_deserialization=allow_dangerous_deserialization,
            # Assumes the saved index is the inner-product index built by
            # create_vectorstore(); the strategy itself is not persisted.
            distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
        )
        print(f"Vectorstore loaded from {load_path}")
        return self.vectorstore

    def setup_qa_chain(self):
        """Build the retrieval + generation chain with the structured prompt.

        Returns:
            The runnable QA chain (question string in, raw LLM text out), also
            stored on ``self.qa_chain``.

        Raises:
            ValueError: If no vectorstore has been created or loaded yet.
        """
        if self.vectorstore is None:
            raise ValueError("Vectorstore not initialized. Call create_vectorstore() or load_vectorstore() first.")

        retriever = self.vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": self.top_k}
        )

        print("RETRIEVER is :", retriever)
        prompt_template = """You are a helpful assistant. Analyze the context and provide a structured response.

                Context:
                {context}

                Question: {question}

                Please provide your response in exactly this format:

                RESPONSE:
                [Your direct, concise answer to the question]

                REASONING:
                [Brief explanation of how you arrived at this answer using the sources]

                SOURCES:
                [List the source numbers that support your answer, e.g., 1, 2, 3]

                Important: Do not include any <think> tags or internal reasoning. Be direct and concise."""

        PROMPT = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "question"]
        )

        # Generation half: expects {"context": str, "question": str}.
        self._retriever = retriever
        self._answer_chain = PROMPT | self.LLM_Model | StrOutputParser()

        # Public end-to-end chain: takes the bare question string.
        self.qa_chain = (
            {
                "context": retriever | self._format_docs,
                "question": RunnablePassthrough()
            }
            | self._answer_chain
        )

        return self.qa_chain

    def _format_docs(self, docs):
        """Render documents as numbered ``[Source N]: ...`` context blocks.

        Numbering starts at 1 and follows retrieval order, which is what the
        ``SOURCES`` ids in the LLM reply refer to.
        """
        return "".join(
            f"[Source {number}]: {doc.page_content}\n\n"
            for number, doc in enumerate(docs, 1)
        )

    def ask_question(self, question: str, return_sources: bool = True) -> dict:
        """Answer ``question`` and return the parsed, structured reply.

        Documents are retrieved exactly once and that same list is both shown
        to the LLM and returned to the caller, so the numeric ``sources`` ids
        always index into ``source_documents`` (id 1 is the first document).

        Args:
            question: Natural-language question.
            return_sources: When true, include the retrieved documents and
                their texts in the result.

        Returns:
            Dict with ``answer``, ``reasoning``, ``sources`` and
            ``raw_response``; plus ``source_documents`` and ``source_texts``
            when ``return_sources`` is true.

        Raises:
            ValueError: If ``setup_qa_chain`` has not been called.
        """
        if self.qa_chain is None or self._answer_chain is None or self._retriever is None:
            raise ValueError("QA chain not initialized. Call setup_qa_chain() first.")

        source_docs = self._retriever.invoke(question)
        response = self._answer_chain.invoke(
            {"context": self._format_docs(source_docs), "question": question}
        )
        parsed_response = self._parse_structured_response(response)

        if return_sources:
            parsed_response['source_documents'] = source_docs
            parsed_response['source_texts'] = [doc.page_content for doc in source_docs]

        return parsed_response

    def _parse_structured_response(self, response_text: str) -> dict:
        """Split the LLM reply into answer, reasoning and cited source ids.

        ``<think>...</think>`` blocks and any remaining angle-bracket tags are
        removed first. Lines are then assigned to the most recent
        ``RESPONSE`` / ``REASONING`` / ``SOURCES`` header; text before the
        first header is ignored, and if a header repeats the last occurrence
        wins. When the model ignores the format entirely (no header found),
        the whole cleaned reply is returned as the answer instead of an empty
        string.

        Args:
            response_text: Raw text produced by the LLM.

        Returns:
            Dict with keys ``answer`` (str), ``reasoning`` (str), ``sources``
            (list of unique ints in first-seen order) and ``raw_response``
            (the cleaned text).
        """
        cleaned_response = re.sub(r'<think>.*?</think>', '', response_text, flags=re.DOTALL)
        cleaned_response = re.sub(r'<[^>]+>', '', cleaned_response)
        cleaned_response = re.sub(r'\n\s*\n', '\n\n', cleaned_response.strip())

        collected = {'response': [], 'reasoning': [], 'sources': []}
        active = None
        found_header = False

        for raw_line in cleaned_response.split('\n'):
            line = raw_line.strip()
            header = self._SECTION_HEADER.match(line)
            if header:
                found_header = True
                active = header.group('name').lower()
                # Restart the section so a repeated header overrides earlier text.
                collected[active] = [header.group('body').strip()]
            elif active and line:
                collected[active].append(line)

        sections = {name: '\n'.join(parts).strip() for name, parts in collected.items()}

        answer = sections['response']
        if not found_header:
            answer = cleaned_response

        # dict.fromkeys de-duplicates while preserving first-seen order.
        source_ids = list(dict.fromkeys(
            int(number) for number in re.findall(r'\d+', sections['sources'])
        ))

        return {
            'answer': answer,
            'reasoning': sections['reasoning'],
            'sources': source_ids,
            'raw_response': cleaned_response
        }

    def similarity_search(self, query, k=None):
        """Return the ``k`` most similar chunks for ``query`` (no LLM call).

        Args:
            query: Text to search for.
            k: Number of results; falls back to ``self.top_k`` when omitted.

        Raises:
            ValueError: If no vectorstore has been created or loaded yet.
        """
        if self.vectorstore is None:
            raise ValueError("Vectorstore not initialized.")

        return self.vectorstore.similarity_search(query, k=k or self.top_k)

    def similarity_search_with_score(self, query, k=None):
        """Return ``(document, score)`` pairs for ``query`` and print the scores.

        Scores are the raw inner products from the FAISS index, i.e. cosine
        similarities when embeddings are normalised (higher is more similar).

        Args:
            query: Text to search for.
            k: Number of results; falls back to ``self.top_k`` when omitted.

        Raises:
            ValueError: If no vectorstore has been created or loaded yet.
        """
        if self.vectorstore is None:
            raise ValueError("Vectorstore not initialized.")

        results = self.vectorstore.similarity_search_with_score(query, k=k or self.top_k)

        print(f"Similarity scores for query '{query}':")
        for rank, (_doc, score) in enumerate(results, 1):
            print(f"  Result {rank}: Score = {score:.4f}")

        return results

In [36]:
# Build the pipeline end to end: load the extracted report text, index it
# (persisting the FAISS store to disk), then wire up the QA chain.
pipeline = LangChainRAGPipeline()
documents = pipeline.load_and_preprocess_data('Market Research Report_extracted_text.json')
vectorstore = pipeline.create_vectorstore(documents, save_path="market_research_vectorstore")
qa_chain = pipeline.setup_qa_chain()
# Ask once and keep the parsed result; a second identical call would only
# repeat the retrieval and the LLM generation for no benefit.
result = pipeline.ask_question("What are the recommended features to be added to the project?")


Created 16 chunks
Vectorstore saved to market_research_vectorstore
RETRIEVER is : tags=['FAISS', 'HuggingFaceEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x429dff1f0> search_kwargs={'k': 5}


In [37]:
# Display the parsed answer dictionary returned by ask_question() above.
result

{'answer': 'The recommended features include PDF editing/annotations, real-time translation review, domain-specific translation, tone customization, AI summarization, and enhanced OCR support.',
 'reasoning': 'The sources highlight gaps in current tools, such as limited editing capabilities (Sources 1, 2), lack of domain-specific/tone adjustments (Sources 2, 5), insufficient OCR options (Source 3), and the need for AI-driven features like summarization (Source 5). These recommendations aim to improve functionality and user experience.',
 'sources': [1, 2, 3, 5],
 'raw_response': 'RESPONSE:  \nThe recommended features include PDF editing/annotations, real-time translation review, domain-specific translation, tone customization, AI summarization, and enhanced OCR support.  \n\nREASONING:  \nThe sources highlight gaps in current tools, such as limited editing capabilities (Sources 1, 2), lack of domain-specific/tone adjustments (Sources 2, 5), insufficient OCR options (Source 3), and the 

In [38]:
# Inspect the 3 closest chunks for the question via vector search only (no LLM call).
pipeline.similarity_search("What are the recommended features to be added to the project?", k=3)

[Document(metadata={'source': 'Market Research Report_extracted_text.json', 'start_index': 3115}, page_content="Based on the evaluation, several opportunities exist to enhance document translation tools to better serve B2B and B2C markets. The following recommended features aim to address ga ps identified in the tested tools and improve functionality, user experience, and translation quality: • Editing Capabilities 1. PDF Editing: Enable users to add text, shapes, images, and freehand annotations to PDFs, facilitating document customization and correction post -translation. 2. PDF Annotations: Provide tools to write, draw, and highlight directly on PDFs, enhancing collaboration and docu ment review processes. 3. Split PDF: Allow users to split a PDF into multiple PDFs, with each page saved as a separate file. Include advanced options, such as specifying page ranges or programmatic splitting (similar to Python's split() function), to offer fl exibility."),
 Document(metadata={'source': 

In [39]:
# Same search with k=5, also printing each hit's similarity score.
pipeline.similarity_search_with_score("What are the recommended features to be added to the project?", k=5)

Similarity scores for query 'What are the recommended features to be added to the project?':
  Result 1: Score = 0.4394
  Result 2: Score = 0.3826
  Result 3: Score = 0.3562
  Result 4: Score = 0.3311
  Result 5: Score = 0.2651


[(Document(metadata={'source': 'Market Research Report_extracted_text.json', 'start_index': 3115}, page_content="Based on the evaluation, several opportunities exist to enhance document translation tools to better serve B2B and B2C markets. The following recommended features aim to address ga ps identified in the tested tools and improve functionality, user experience, and translation quality: • Editing Capabilities 1. PDF Editing: Enable users to add text, shapes, images, and freehand annotations to PDFs, facilitating document customization and correction post -translation. 2. PDF Annotations: Provide tools to write, draw, and highlight directly on PDFs, enhancing collaboration and docu ment review processes. 3. Split PDF: Allow users to split a PDF into multiple PDFs, with each page saved as a separate file. Include advanced options, such as specifying page ranges or programmatic splitting (similar to Python's split() function), to offer fl exibility."),
  0.43941298),
 (Document(met

In [41]:
# Ask a specific factual question; the result includes the retrieved source documents.
pipeline.ask_question("Which translator charges users with credits?")

{'answer': 'Doctranslate.io',
 'reasoning': 'The pricing plans for Doctranslate.io (Source 1 and Source 5) explicitly mention credit-based pricing, such as "translation credits," "credit expiration," and "Topup" options. Other tools like DeepL Pro and Doclingo use per-user/month or character-based pricing without a credit system.',
 'sources': [1, 5],
 'raw_response': 'RESPONSE:  \nDoctranslate.io  \n\nREASONING:  \nThe pricing plans for Doctranslate.io (Source 1 and Source 5) explicitly mention credit-based pricing, such as "translation credits," "credit expiration," and "Topup" options. Other tools like DeepL Pro and Doclingo use per-user/month or character-based pricing without a credit system.  \n\nSOURCES:  \n1, 5',
 'source_documents': [Document(metadata={'source': 'Market Research Report_extracted_text.json', 'start_index': 8069}, page_content='Pricing Plans for Paid Tool s Tool Plan Name Cost Key Features Benefits Notes Doctranslate.io Topup -50 $4.99 50 translation credit • 10

In [42]:
# Ask a broad question to see how the pipeline summarises the retrieved chunks.
pipeline.ask_question("What is the main topic?")

{'answer': 'The main topic is the evaluation and enhancement of document translation tools for multilingual support and improved functionality.',
 'reasoning': "The context focuses on analyzing competitors' tools (Source 1) and recommending features to address gaps in translation quality, editing capabilities, OCR support, and domain-specific adaptations (Sources 2–5). These sources collectively emphasize improving document translation tools for B2B/B2C markets.",
 'sources': [1, 2, 3, 4, 5],
 'raw_response': "RESPONSE:  \nThe main topic is the evaluation and enhancement of document translation tools for multilingual support and improved functionality.  \n\nREASONING:  \nThe context focuses on analyzing competitors' tools (Source 1) and recommending features to address gaps in translation quality, editing capabilities, OCR support, and domain-specific adaptations (Sources 2–5). These sources collectively emphasize improving document translation tools for B2B/B2C markets.  \n\nSOURCES: 