In [2]:
pip install langchain-text-splitters




In [14]:
pip install langchain-community  

Note: you may need to restart the kernel to use updated packages.


In [12]:
pip install langchain pypdf chromadb sentence-transformers




In [13]:
pip install unstructured lxml




In [1]:
pip install pypdf pathlib

Collecting pathlib
  Downloading pathlib-1.0.1-py3-none-any.whl.metadata (5.1 kB)
Downloading pathlib-1.0.1-py3-none-any.whl (14 kB)
Installing collected packages: pathlib
Successfully installed pathlib-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
 
import os
from dotenv import load_dotenv, find_dotenv
from pathlib import Path

# Load .env (keeps API keys out of source files)
filepath = r"C:\Users\Nakul\rag_api.env"
load_dotenv(find_dotenv(filepath)) 

# ========== Configuration (update these) ==========
PDF_FOLDER_PATH = r"D:\\Investment docs"          # folder with PDF prospectuses
CHROMA_PERSIST_DIR = "./chromadb_store"            # where Chroma will persist
CHROMA_COLLECTION = "financial_prospectuses"       # Chroma collection name
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"          # L6-mini
CHUNK_SIZE = 700
CHUNK_OVERLAP = 150
BATCH_EMBED = 64
TOP_K = 4

# LLM settings (Gemini via LangChain or local fallback)
GEMINI_MODEL = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")       # ensure this exists for Gemini via langchain_google_genai
LOCAL_TGI_ENDPOINT = os.getenv("LOCAL_TGI_ENDPOINT", "http://localhost:8080/v1/models/gemin:predict")

# Create persistent directories if missing
Path(CHROMA_PERSIST_DIR).mkdir(parents=True, exist_ok=True)

# Quick print to confirm
print("Configuration summary:")
print(f"  PDF_FOLDER_PATH = {PDF_FOLDER_PATH}")
print(f"  CHROMA_PERSIST_DIR = {CHROMA_PERSIST_DIR}")
print(f"  EMBEDDING_MODEL_NAME = {EMBEDDING_MODEL_NAME}")
print(f"  GEMINI_MODEL = {GEMINI_MODEL}")
print(f"  GOOGLE_API_KEY set? {'YES' if bool(GOOGLE_API_KEY) else 'NO'}")
if GOOGLE_API_KEY:
    print("API key loaded successfully:",GOOGLE_API_KEY)
else:
    print("API key not loaded.")


In [6]:
import os
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# --- Configuration ---
# ⚠️ UPDATE THIS PATH with the fixed, accessible folder containing your PDFs
PDF_FOLDER_PATH = r'D:\Investment docs'
# Define where the vector database will be stored locally
CHROMA_PERSIST_DIR = './chroma_db'
# Use a high-quality, general-purpose Sentence Transformer model
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2' 

# --- A. Document Loading & OCR (Uses PyPDFLoader for text extraction) ---
print(f"--- Step A: Loading documents from {PDF_FOLDER_PATH} ---")
all_documents = []

# Use LangChain's directory loader to handle all PDFs in the folder
try:
    for file_path in Path(PDF_FOLDER_PATH).glob("*.pdf"):
        # PyPDFLoader extracts text and preserves basic metadata (source, page)
        loader = PyPDFLoader(str(file_path))
        documents = loader.load()
        all_documents.extend(documents)
    
    if not all_documents:
        print("Error: No PDF files found or could be loaded. Check PDF_FOLDER_PATH.")
        exit()

except Exception as e:
    print(f"An error occurred during file loading: {e}")
    exit()

print(f"✅ Loaded {len(all_documents)} pages across all documents.")

--- Step A: Loading documents from D:\Investment docs ---
✅ Loaded 188 pages across all documents.


In [None]:
# Assuming 'all_documents' from Step A is loaded

# --- B. Intelligent Chunking ---
# NOTE: The import below is corrected to 'langchain_text_splitters'
from langchain_text_splitters import RecursiveCharacterTextSplitter 

print("--- Step B: Performing Intelligent Chunking ---")
text_splitter = RecursiveCharacterTextSplitter(
    # Chunk size: ~700 characters is a good starting point for LLM context
    chunk_size=700,
    # Overlap: Ensures sentences aren't cut mid-context
    chunk_overlap=150,
    length_function=len 
)

# Split the loaded pages into smaller chunks
chunks = text_splitter.split_documents(all_documents)
print(f"✅ Split into {len(chunks)} chunks for indexing.")

 


# ---------- 1) Recursive chunking (document-level) ----------
print("Step 1: Chunking with RecursiveCharacterTextSplitter")
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len
)

# `split_documents` expects a list of LangChain Document objects
chunks = text_splitter.split_documents(all_documents)  # <- uses your loaded pages/documents
print(f"Produced {len(chunks)} chunks from {len(all_documents)} input documents/pages.")

--- Step B: Performing Intelligent Chunking ---
✅ Split into 1427 chunks for indexing.
Step 1: Chunking with RecursiveCharacterTextSplitter
Produced 1427 chunks from 188 input documents/pages.


In [8]:
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import math
import numpy as np
from tqdm.auto import tqdm
import time
import uuid

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

 
# --- C. Text Embedding ---
print(f"--- Step C: Initializing Embedding Model ({EMBEDDING_MODEL_NAME}) ---")

# HuggingFaceEmbeddings loads the specified model ('all-MiniLM-L6-v2') 
# from the Hugging Face Hub (or local cache) and provides the function 
# to convert text chunks into numerical vectors.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

print("✅ Embedding model loaded successfully.")

# --- D. Vector Storage ---
print(f"--- Step D: Creating and Persisting Vector Store (ChromaDB) ---")

 
vector_store = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=CHROMA_PERSIST_DIR,
    collection_name="financial_prospectuses"
)

# Save the index to disk for later retrieval
vector_store.persist()
print(f"✅ ChromaDB index created and saved to {CHROMA_PERSIST_DIR}")
print("\n--- INGESTION COMPLETE ---")

--- Step C: Initializing Embedding Model (all-MiniLM-L6-v2) ---


  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


✅ Embedding model loaded successfully.
--- Step D: Creating and Persisting Vector Store (ChromaDB) ---
✅ ChromaDB index created and saved to ./chroma_db

--- INGESTION COMPLETE ---


  vector_store.persist()


In [10]:
import os
# Components for loading the index
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

# Core LangChain components for the RAG chain
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate


# --- Configuration (Must match the Indexing script) ---
CHROMA_PERSIST_DIR = './chroma_db'
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2' 
# LLM_MODEL will be defined in the next section

def initialize_retriever():
    """Loads the vector store and creates a retriever object."""
    print("--- Step E.1: Loading Retriever ---")
    
    # 1. Load the same embedding model used for indexing
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

    # 2. Load the persisted vector store (ChromaDB)
    vector_store = Chroma(
        persist_directory=CHROMA_PERSIST_DIR, 
        embedding_function=embeddings,
        collection_name="financial_prospectuses"
    )

    # 3. Convert the vector store into a Retriever
    # search_kwargs={"k": 5} tells it to fetch the top 5 most relevant documents
    retriever = vector_store.as_retriever(search_kwargs={"k": 5})
    print("✅ Retriever initialized from ChromaDB index.")
    
    return retriever

# Run the initialization
retriever = initialize_retriever()

--- Step E.1: Loading Retriever ---
✅ Retriever initialized from ChromaDB index.


  vector_store = Chroma(


In [11]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

print("--- Step F: Building RAG Prompt, Formatter, and Chain ---")

# --- 1. RAG Prompt Template ---
rag_template = """
You are a precise and reliable financial analyst.
Answer the user's question **ONLY** using the information provided in the CONTEXT.
If the answer is not present in the context, reply:
"I don't know (not in document)."

Follow these rules:
- Be concise and factual (4–6 sentences).
- NEVER invent numbers or facts.
- At the end, provide a PROVENANCE section listing:
  Source (file), Page number, Chunk ID.

QUESTION:
{question}

CONTEXT:
{context}

ANSWER:
"""

rag_prompt = ChatPromptTemplate.from_template(rag_template)
print("✓ RAG prompt template created.")





--- Step F: Building RAG Prompt, Formatter, and Chain ---
✓ RAG prompt template created.


In [14]:
# Context formatting helper (Documents → clean text block) ---

def format_docs(docs):
    """
    Converts retriever results (list of Document objects)
    into a human-readable context block with provenance.
    """
    formatted = []
    for i, d in enumerate(docs):
        md = d.metadata
        src = md.get("source", "unknown_source")
        page = md.get("page_num", md.get("page", "?"))
        chunk_id = md.get("chunk_id", f"chunk_{i}")

        header = f"[Chunk {i+1}] Source: {src} | Page: {page} | ChunkID: {chunk_id}"
        formatted.append(f"{header}\n{d.page_content}")

    return "\n\n---\n\n".join(formatted)

format_docs_runnable = RunnableLambda(format_docs)
print("✓ Document formatter created.")

✓ Document formatter created.


In [20]:
# Function Definition (Needs 3 arguments)
def build_rag_chain(llm_object, retriever_object, prompt_object): # <-- ADDED prompt_object
    """Assembles the final RAG chain (Retrieval + Generation)."""
    rag_chain = (
        {
            # Note: Assuming 'format_docs_runnable' is correctly defined elsewhere
            "context": retriever_object | format_docs_runnable,  
            "question": RunnablePassthrough()                      
        }
        | prompt_object # <--- Used here
        | llm_object    
        | StrOutputParser()
    )
    print("✓ RAG chain assembled successfully.")
    return rag_chain


In [21]:
# Simple LLM initialization (Gemini only)

from langchain_google_genai import ChatGoogleGenerativeAI

# Make sure GOOGLE_API_KEY is already exported or set via dotenv
GEMINI_MODEL = "gemini-2.5-flash"

print("Initializing Gemini LLM...")

llm = ChatGoogleGenerativeAI(
    model=GEMINI_MODEL,
    temperature=0.1   # stable, factual answers
)

print(f"✓ Gemini model loaded: {GEMINI_MODEL}")


Initializing Gemini LLM...
✓ Gemini model loaded: gemini-2.5-flash


In [22]:
final_rag_chain = build_rag_chain(llm, retriever, rag_prompt)


✓ RAG chain assembled successfully.


In [25]:
user_question = "What is the investment objective?"
print(f"\n--- Running RAG Query for: {user_question} ---")

try:
    # 2. INVOKE the method on the created chain object
    answer = final_rag_chain.invoke(user_question)
    
    # Run this print statement cleanly
    print(f"\n--- Final Answer ---\n{answer}") 
    
except Exception as e:
    print(f"\nERROR running the chain: {e}")


--- Running RAG Query for: What is the investment objective? ---

--- Final Answer ---
The investment objective of the scheme is to seek to generate regular returns and growth of capital. This is achieved by investing in a diversified portfolio of debt and money market securities. These securities must mature on or before the term of the Scheme. It is important to note that the Scheme does not guarantee or indicate any returns, and there is no assurance that its objectives will be achieved.

PROVENANCE:
Source: D:\Investment docs\KIM - ABSL Interval Income Fund-QP-S I - 311023.pdf, Page: 2, ChunkID: chunk_0


In [12]:
# --- 4. Assemble the RAG Chain using LCEL ---

def assemble_rag_chain(llm_object, retriever_object, prompt_object):
    """Assembles the final RAG chain (Retrieval + Generation)."""
    rag_chain = (
        # 1. Context: Use the retriever to fetch docs and format them
        {"context": retriever_object | RunnableLambda(format_docs), 
         # 2. Question: Pass the original user question through
         "question": RunnablePassthrough()}
        | prompt_object # 3. Inject context and question into the prompt
        | llm_object    # 4. Send the prompt to the LLM
        | StrOutputParser() # 5. Parse the output as a simple string
    )
    print("✅ RAG Chain pipeline assembled.")
    return rag_chain

# Assemble the chain using the components initialized above
final_rag_chain = assemble_rag_chain(llm, retriever, rag_prompt)


# --- 5. Run the Query ---

user_question = "What is the investment objective for the Hartford Small Cap Value Fund and what is the latest total expense ratio?"

print(f"\n--- Running RAG Query ---")
print(f"User Question: {user_question}")

try:
    # Invoke the chain with the user's question
    response = final_rag_chain.invoke(user_question)
    
    print(f"\n--- Final Answer ---\n{response}")
    
except Exception as e:
    print(f"\nERROR: Could not run the RAG chain. Ensure your LLM API key is correctly configured. Error details: {e}")

✅ RAG Chain pipeline assembled.

--- Running RAG Query ---
User Question: What is the investment objective for the Hartford Small Cap Value Fund and what is the latest total expense ratio?

--- Final Answer ---
The information regarding the investment objective for the Hartford Small Cap Value Fund and its latest total expense ratio is not available in the provided documents.
