## Imports

In [1]:
import json
import pandas as pd
import re
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.llms import Ollama
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
import numpy as np
import faiss
from langchain.docstore import InMemoryDocstore

from langchain_community.vectorstores import FAISS



## RAG Pipeline with LangChain

In [13]:
class LangChainRAGPipeline:
    def __init__(self):

        self.text_splitter = None
        self.embeddings = None
        self.LLM_Model = 'qwen3:8b'
        self.vectorstore = None
        self.qa_chain = None
        self.top_k = 5
        self.normalize_embeddings = True
        self.device = 'mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu'
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=200,
            chunk_overlap=30, length_function=lambda x: len(x.split()),
            separators=["\n\n\n", "\n\n", "\n", ". ", "! ", "? ", "; ", ", ", " ", ""],
            keep_separator=False,
            add_start_index=True,
            strip_whitespace=True
        )
        self.embeddings = HuggingFaceEmbeddings( model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", model_kwargs={'device': self.device},
            cache_folder='embedder_model_cache', encode_kwargs={'normalize_embeddings': self.normalize_embeddings}
        )
        self.LLM_Model = Ollama( model=self.LLM_Model, temperature=0.3, num_ctx=4096)



    def load_and_preprocess_data(self, file_paths):
        combined_texts = []
        """Load and preprocess data from multiple JSON files."""
        for file_path in file_paths:
            """Load JSON lines data and create a single LangChain Document with cleaned content."""
            with open(file_path, 'r') as f:
                raw_data = json.load(f)

            clean_texts = [self._clean_text(entry) for entry in raw_data if isinstance(entry, str)]

            # Join all cleaned text into a single document
            combined_text = "\n".join(clean_texts)
            combined_texts.append(combined_text)
        # Create LangChain Documents
        documents = [Document(page_content=text) for text in combined_texts if text.strip()]
        return documents


         
        
    

    def _batch_embed(self, texts, batch_size=64):
        embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            batch_embeddings = self.embeddings.embed_documents(batch)
            embeddings.extend(batch_embeddings)
        return np.array(embeddings, dtype=np.float32)

    
    
    def _clean_text(self, text):
        """Clean text data"""
        text = str(text)
        text = re.sub(r'\n{3,}', '\n\n', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()
    
    
    def create_vectorstore(self, documents, save_path=None):
        """Create and optionally save FAISS vectorstore with cosine similarity"""
        chunks = self.text_splitter.split_documents(documents)
        print(f"Created {len(chunks)} chunks")

        # Get embeddings for all chunks
        texts = [chunk.page_content for chunk in chunks]
        embeddings_matrix = self._batch_embed(texts, batch_size=128)
        

        # Create FAISS index with Inner Product (which equals cosine similarity for normalized vectors)
        dimension = embeddings_matrix.shape[1]
        index = faiss.IndexFlatIP(dimension)
        index.add(embeddings_matrix)

        

        # Create docstore and index mapping more efficiently
        docstore = InMemoryDocstore({str(i): chunk for i, chunk in enumerate(chunks)})
        index_to_docstore_id = {i: str(i) for i in range(len(chunks))}
        
        # Create FAISS vectorstore
        self.vectorstore = FAISS(
            embedding_function=self.embeddings,
            index=index,
            docstore=docstore,
            index_to_docstore_id=index_to_docstore_id
        )
        
        if save_path:
            self.vectorstore.save_local(save_path)
            print(f"Vectorstore saved to {save_path}")
        
        return self.vectorstore

    
    def load_vectorstore(self, load_path):
        """Load existing vectorstore"""
        self.vectorstore = FAISS.load_local(load_path, self.embeddings)
        print(f"Vectorstore loaded from {load_path}")
        return self.vectorstore
    
    
    def setup_qa_chain(self):
        """Setup QA chain with custom prompts"""
        retriever = self.vectorstore.as_retriever(
            search_type="similarity",
            search_kwargs={"k": self.top_k}
        )
        
        print("RETRIEVER is :", retriever)
        prompt_template = """You are a helpful assistant. Analyze the context and provide a structured response.

                Context:
                {context}

                Question: {question}

                Please provide your response in exactly this format:

                RESPONSE:
                [Your direct, concise answer to the question]

                REASONING:
                [Brief explanation of how you arrived at this answer using the sources]

                SOURCES:
                [List the source numbers that support your answer, e.g., 1, 2, 3]

                Important: Do not include any <think> tags or internal reasoning. Be direct and concise."""

        PROMPT = PromptTemplate(
                template=prompt_template,
                input_variables=["context", "question"]
            )
            
        self.qa_chain = (
                {
                    "context": retriever | self._format_docs,
                    "question": RunnablePassthrough()
                }
                | PROMPT
                | self.LLM_Model
                | StrOutputParser()
            )
        
        return self.qa_chain
    


    def _format_docs(self, docs):
        """Format documents for context"""
        formatted = ""
        for i, doc in enumerate(docs, 1):
            formatted += f"[Source {i}]: {doc.page_content}\n\n"
        return formatted
    

    
    def ask_question(self, question, return_sources=True):
        """Ask a question and get structured response"""
        if not self.qa_chain:
            raise ValueError("QA chain not initialized. Call setup_qa_chain() first.")
        response = self.qa_chain.invoke(question)
        parsed_response = self._parse_structured_response(response)
        
        if return_sources:
            retriever = self.vectorstore.as_retriever(
                search_type="similarity",
                search_kwargs={"k": self.top_k}
            )
            source_docs = retriever.get_relevant_documents(question)
            parsed_response['source_documents'] = source_docs
            parsed_response['source_texts'] = [doc.page_content for doc in source_docs]
        
        return parsed_response
    
    

    def _parse_structured_response(self, response_text):
        """Parse the structured response"""
        cleaned_response = re.sub(r'<think>.*?</think>', '', response_text, flags=re.DOTALL)
        cleaned_response = re.sub(r'<[^>]+>', '', cleaned_response)
        cleaned_response = re.sub(r'\n\s*\n', '\n\n', cleaned_response.strip())
        
        sections = {'response': '', 'reasoning': '', 'sources': ''}
        current_section = None
        current_content = []
        
        lines = cleaned_response.split('\n')
        
        for line in lines:
            line = line.strip()
            
            if line.upper().startswith('RESPONSE:'):
                if current_section:
                    sections[current_section] = '\n'.join(current_content).strip()
                current_section = 'response'
                current_content = [line[9:].strip()]
                
            elif line.upper().startswith('REASONING:'):
                if current_section:
                    sections[current_section] = '\n'.join(current_content).strip()
                current_section = 'reasoning'
                current_content = [line[10:].strip()]
                
            elif line.upper().startswith('SOURCES:'):
                if current_section:
                    sections[current_section] = '\n'.join(current_content).strip()
                current_section = 'sources'
                current_content = [line[8:].strip()]
                
            elif current_section and line:
                current_content.append(line)
        
        if current_section:
            sections[current_section] = '\n'.join(current_content).strip()
        
        source_ids = []
        if sections['sources']:
            source_text = sections['sources']
            source_ids = [int(x) for x in re.findall(r'\d+', source_text)]
        
        return {
            'answer': sections['response'],
            'reasoning': sections['reasoning'],
            'sources': source_ids,
            'raw_response': cleaned_response
        }
    
    def similarity_search_with_score(self, query, k=None):
        """Perform similarity search with scores (now properly cosine similarity)"""
        if not self.vectorstore:
            raise ValueError("Vectorstore not initialized.")
        
        k = k or self.top_k
        results = self.vectorstore.similarity_search_with_score(query, k=k)
    
        print(f"Similarity scores for query '{query}':")
        for i, (doc, score) in enumerate(results):
            print(f"  Result {i+1}: Score = {score:.4f}")
            
        return results

In [22]:
pipeline = LangChainRAGPipeline()
documents = pipeline.load_and_preprocess_data(["Market Research Report_extracted_text.json",'PMS Market Research_extracted_text.json'])
vectorstore = pipeline.create_vectorstore(documents, save_path="vectorstore")
qa_chain = pipeline.setup_qa_chain()



Created 70 chunks
Vectorstore saved to vectorstore
RETRIEVER is : tags=['FAISS', 'HuggingFaceEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x386799190> search_kwargs={'k': 5}


In [16]:
result

{'answer': 'Role-based assignments, flexible approval workflows, financial oversight, reporting, notifications, and project archiving.',
 'reasoning': 'The sources emphasize the importance of role-based task assignment (Source 1), automated approval workflows with hierarchies and rules (Sources 1, 4, 5), financial oversight (Sources 1, 5), customizable reporting (Sources 2, 5), real-time notifications (Sources 2, 5), and project archiving (Source 2). These features are critical for streamlining processes, ensuring accountability, and maintaining project transparency.',
 'sources': [1, 2, 4, 5],
 'raw_response': 'RESPONSE:  \nRole-based assignments, flexible approval workflows, financial oversight, reporting, notifications, and project archiving.  \n\nREASONING:  \nThe sources emphasize the importance of role-based task assignment (Source 1), automated approval workflows with hierarchies and rules (Sources 1, 4, 5), financial oversight (Sources 1, 5), customizable reporting (Sources 2, 

In [23]:
pipeline.similarity_search_with_score("What are the recommended features to be added to the project?", k=5)

Similarity scores for query 'What are the recommended features to be added to the project?':
  Result 1: Score = 0.5638
  Result 2: Score = 0.5156
  Result 3: Score = 0.5065
  Result 4: Score = 0.5023
  Result 5: Score = 0.4954


[(Document(metadata={'start_index': 25195}, page_content='Key Benefits & Features: • Role -Based Assignments: You define who does what. Whether it’s the Project Manager approving budgets or the Cost Controller reviewing change requests, each task is automatically sent to the right person. • Flexible Process Design: Build your workflow exactly as your team works it —sequencing steps like Budget Approval, Change Order Review, and Invoice Processing, inserting decision points for “approve” or “reject,” and even allowing parallel reviews when multiple stakeholders need to weigh in. • Smart Approval Rules: Tie approvals to financial thresholds and organizational hierarchies so that small expenses breeze through quickly, while larger commitments rise to senior leadership for sign -off. • Approval hierarchies: PMWeb’s approval hierarchies ensure that every document or subm ission follows a clear, pre -defined review path. Once a user uploads a file and clicks “Save,” the system automatically 

In [18]:
pipeline.ask_question("Which translator charges users with credits?")

{'answer': 'Doctranslate.io',
 'reasoning': 'Sources 1, 3, and 5 explicitly mention Doctranslate.io\'s pricing model based on translation credits (e.g., "50 translation credit," "120 translation credits," and "4000 credits/year"). Other tools like Doclingo use character limits or file-based pricing, not credit systems.',
 'sources': [1, 3, 5],
 'raw_response': 'RESPONSE:  \nDoctranslate.io  \n\nREASONING:  \nSources 1, 3, and 5 explicitly mention Doctranslate.io\'s pricing model based on translation credits (e.g., "50 translation credit," "120 translation credits," and "4000 credits/year"). Other tools like Doclingo use character limits or file-based pricing, not credit systems.  \n\nSOURCES:  \n1, 3, 5',
 'source_documents': [Document(metadata={'start_index': 8069}, page_content='Pricing Plans for Paid Tool s Tool Plan Name Cost Key Features Benefits Notes Doctranslate.io Topup -50 $4.99 50 translation credit • 10 cent/ page . • Credit expiration : permanent . • Perfect for quick, one

In [24]:
pipeline.ask_question("Analyze which combination of tools would be most effective, " \
"justify the cost implications, and identify the critical gaps that would still need to " \
"be addressed through custom development or additional third-party integrations.")

{'answer': 'PMWeb as the core platform, supplemented by Procore for submittal automation and AI-driven workflows, and Aconex for structured document control and communication, would be most effective.',
 'reasoning': 'PMWeb provides comprehensive construction-specific modules (contracts, workflows, financial integration) and role-based task management (Sources 1, 2, 5). Procore enhances submittal and drawing management (Source 3), while Aconex excels in formal transmittals and document control (Source 3, 4). Together, they cover core needs, though cost implications include higher licensing and integration complexity. Critical gaps include limited low-code automation (Source 4) and task management refinements, requiring custom development or third-party tools.',
 'sources': [1, 2, 3, 4, 5],
 'raw_response': 'RESPONSE:  \nPMWeb as the core platform, supplemented by Procore for submittal automation and AI-driven workflows, and Aconex for structured document control and communication, woul

In [21]:
pipeline.similarity_search_with_score("Analyze which combination of tools would be most effective, " \
"justify the cost implications, and identify the critical gaps that would still need to " \
"be addressed through custom development or additional third-party integrations.")

Similarity scores for query 'Analyze which combination of tools would be most effective, justify the cost implications, and identify the critical gaps that would still need to be addressed through custom development or additional third-party integrations.':
  Result 1: Score = 0.5535
  Result 2: Score = 0.5176
  Result 3: Score = 0.5102
  Result 4: Score = 0.4828
  Result 5: Score = 0.4781


[(Document(metadata={'start_index': 19653}, page_content='Construction -Focused PMS s: Havi ng explored versatile, general -purpose tools like Monday.com and Wrike, it’s clear that while they offer tremendous flexibility, they often require heavy customization to meet the nuanced needs of construction projects. Now, let’s shift our focus to purpose -built, construction -centric platforms — PMWeb, Aconex, and Procore. These solutions come with industry -tailored modules out of the box (think robust contract management, formal transmittals, and site -driven workflows), so you can hit the gro und ru nning without reinventing core processes . 3- PMWeb PMW eb is a purpose -built, cloud -based Project Lifecycle Management platform crafted specifically for capital construction projects —government infrastructure, large facilities, and expansive EPC endeavors. Acting as the “central brain” of your project, PMWeb u nifies budgets, contracts, schedules, RFIs, submittals, and close -out activitie

In [26]:
answer=pipeline.ask_question(" If you had to build a hybrid solution using two platforms from the research, which combination would you choose for a $50M solar project, \
                      and how would you handle the integration challenges, particularly around \
                      the 12 core features identified in the research?")

In [31]:
answer

{'answer': 'Choose PMWeb for project management workflows and Procore for construction-specific tools.',
 'reasoning': 'PMWeb (Source 4) excels in role-based task assignment, approval hierarchies, and version control, aligning with the 12 core features (Sources 2, 5). Procore (Source 5) provides specialized construction workflows like submittal tracking and AI-driven drawing management, which are critical for large solar projects. Integration would use APIs to synchronize document approvals, versioning, and task assignments between platforms.',
 'sources': [4, 5, 2],
 'raw_response': 'RESPONSE:  \nChoose PMWeb for project management workflows and Procore for construction-specific tools.  \n\nREASONING:  \nPMWeb (Source 4) excels in role-based task assignment, approval hierarchies, and version control, aligning with the 12 core features (Sources 2, 5). Procore (Source 5) provides specialized construction workflows like submittal tracking and AI-driven drawing management, which are criti

In [25]:
pipeline.similarity_search_with_score(" If you had to build a hybrid solution using two platforms from the research, which combination would you choose for a $50M solar project, \
                      and how would you handle the integration challenges, particularly around \
                      the 12 core features identified in the research?")

Similarity scores for query ' If you had to build a hybrid solution using two platforms from the research, which combination would you choose for a $50M solar project,                       and how would you handle the integration challenges, particularly around                       the 12 core features identified in the research?':
  Result 1: Score = 0.6337
  Result 2: Score = 0.5184
  Result 3: Score = 0.4492
  Result 4: Score = 0.3677
  Result 5: Score = 0.3497


[(Document(metadata={'start_index': 42733}, page_content='Renewable and Solar -Focused Platforms During our research, we also explored platforms that are specifically designed for the renewable energy sector and others that are solar PV -focused . At first, they seemed promising because they’re built with energy systems in mind. However, after testing and reviewing them, we found that they don’t meet the type of project management needs we’re aiming for , although they market them selves as they have project management tools. Platforms like Ra Power Management (RaPM) , SenseHawk, and Payac a are examples of solar - focused systems. These tools are mainly designed for monitoring the performance of solar plants — such as tracking electricity production, system health, faults, and maintenance alerts. While they’re excellent for operations and post -installation monitoring , they do not support document approvals, workflows, submittals, or collaboration between stakeholde rs like contracto