In [1]:
!pip install langchain langchain-openai langchain-community chromadb sentence-transformers beautifulsoup4 requests langchain sentence-transformers faiss-cpu euri



ERROR: Could not find a version that satisfies the requirement euri (from versions: none)
ERROR: No matching distribution found for euri


In [2]:
import os, io
from PyPDF2 import PdfReader
import pandas as pd
import numpy as np
import pytesseract
from PIL import Image
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from transformers import CLIPModel, CLIPProcessor
import torch

# Read secrets from a local .env file so the key never has to be typed into the notebook.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # os.environ only accepts str values: assigning None below would fail with an
    # unhelpful "str expected, not NoneType", so stop here with an actionable message.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in the environment or in a .env file "
        "before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
# Route OpenAI-compatible clients to the Euron gateway instead of api.openai.com.
os.environ["OPENAI_BASE_URL"] = "https://api.euron.one/api/v1/euri"
# Input document and on-disk location of the vector index built from it.
PDF_PATH = r"C:/Users\Data Science/Euron/Chatbots/Bajaj Finserv/kb/bajaj_finserv_factsheet_Oct.pdf"
FAISS_INDEX_PATH = "faiss_index_bajaj"


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def extract_text_pypdf2(pdf_path):
    """Read a PDF with PyPDF2 and return one record per page.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        A list of dicts in page order, each with ``"page"`` (1-based page
        number) and ``"text"`` (the page's extracted text with surrounding
        whitespace stripped; ``""`` when extraction yields nothing).
    """
    document = PdfReader(pdf_path)
    return [
        # extract_text() may return a falsy value, hence the `or ""` fallback
        {"page": number, "text": (pdf_page.extract_text() or "").strip()}
        for number, pdf_page in enumerate(document.pages, start=1)
    ]

# Parse the configured fact sheet once; `pages_data` (one dict per page) is the
# input that the chunking cell passes to create_chunks().
pages_data = extract_text_pypdf2(PDF_PATH)
print(f"âœ… Extracted text from {len(pages_data)} pages.")


âœ… Extracted text from 56 pages.


In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def clean_text(txt):
    """Normalise whitespace: every run (newlines included) becomes one space.

    Leading/trailing whitespace is dropped; a falsy input such as ``None`` or
    ``""`` yields ``""``.
    """
    if not txt:
        return ""
    # str.split() with no separator already treats "\n" as whitespace
    words = txt.split()
    return " ".join(words)

def create_chunks(pages, chunk_size=1000, overlap=200):
    """Split each page's cleaned text into overlapping chunks.

    Args:
        pages: Iterable of dicts carrying ``"page"`` and ``"text"`` keys.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        A flat list of dicts with keys ``"text"``, ``"page"``, ``"chunk_id"``
        (``p<page>_c<index>``, index restarting at 0 on every page) and
        ``"type"`` (always ``"text"``). Pages whose text produces no chunks
        contribute nothing.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    return [
        {
            "text": piece,
            "page": page["page"],
            "chunk_id": f"p{page['page']}_c{idx}",
            "type": "text",
        }
        for page in pages
        for idx, piece in enumerate(splitter.split_text(clean_text(page["text"])))
    ]

# Chunk every page with the function defaults (chunk_size=1000, overlap=200);
# `text_chunks` is later concatenated with the table chunks before embedding.
text_chunks = create_chunks(pages_data)
print(f"âœ… Created {len(text_chunks)} text chunks.")


âœ… Created 309 text chunks.


In [5]:
import pdfplumber
def extract_tables(pdf_path):
    """Extract every table pdfplumber detects in a PDF as a pandas DataFrame.

    The first row of each detected table becomes the column header and the
    remaining rows become the data.

    Args:
        pdf_path: Location of the PDF, passed to ``pdfplumber.open``.

    Returns:
        A list of dicts in document order, each with ``"page"`` (1-based page
        number) and ``"table"`` (the DataFrame).

    Notes:
        Extraction stays best-effort: a table with no rows is skipped, and a
        table that cannot be converted is skipped with a ``UserWarning`` naming
        the page and table index, so dropped data is visible instead of
        vanishing silently.
    """
    import warnings  # local import keeps this definition self-contained

    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            for table_index, rows in enumerate(page.extract_tables()):
                if not rows:
                    # No header row to build columns from.
                    continue
                try:
                    df = pd.DataFrame(rows[1:], columns=rows[0])
                except Exception as exc:
                    # Typically a header/data width mismatch; report it and keep going.
                    warnings.warn(
                        f"Skipping table {table_index} on page {page_number}: {exc}",
                        stacklevel=2,
                    )
                    continue
                tables.append({"page": page_number, "table": df})
    return tables

# Second pass over the same PDF, this time with pdfplumber, to pull out tables;
# `tables` feeds tables_to_chunks() in the next cell.
tables = extract_tables(PDF_PATH)
print(f"âœ… Extracted {len(tables)} tables.")

âœ… Extracted 222 tables.


In [6]:
def tables_to_chunks(tables):
    """Serialise extracted tables to CSV text chunks.

    Args:
        tables: Iterable of dicts with ``"page"`` and ``"table"`` keys, where
            ``"table"`` exposes ``to_csv`` (a pandas DataFrame).

    Returns:
        A list of dicts, one per table and in input order, with keys ``"text"``
        (CSV without the index column), ``"page"``, ``"chunk_id"`` and
        ``"type"`` (always ``"table"``).

    ``chunk_id`` is ``p<page>_table<k>`` where ``k`` counts tables on that page
    from 0. The counter keeps IDs unique when a page holds several tables; a
    bare ``p<page>_table`` would be shared by all of them.
    """
    seen_on_page = {}  # page number -> how many tables already emitted for it
    table_chunks = []
    for t in tables:
        page = t["page"]
        position = seen_on_page.get(page, 0)
        seen_on_page[page] = position + 1
        table_chunks.append({
            "text": t["table"].to_csv(index=False),
            "page": page,
            "chunk_id": f"p{page}_table{position}",
            "type": "table"
        })
    return table_chunks

# Turn each extracted DataFrame into a CSV text chunk so tables can be embedded
# alongside the prose chunks.
table_chunks = tables_to_chunks(tables)
print(f"âœ… Converted {len(table_chunks)} tables into chunks.")


âœ… Converted 222 tables into chunks.


In [7]:
# ---- OpenAI text embeddings ----
# Embedding client used for every text and table chunk below.
text_embedder = OpenAIEmbeddings(model="text-embedding-3-small")

# ---- CLIP model for images (optional) ----
# NOTE(review): clip_model / clip_processor are loaded here but not referenced by
# any other cell visible in this notebook (embed_images_clip below is a stub).
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def embed_images_clip(dummy_pages):
    """Placeholder image embedder kept for future OCR/image support.

    The argument is ignored (the PyPDF2 path yields no page images); the
    result is always an empty array with 0 rows and 512 columns.
    """
    empty_shape = (0, 512)
    return np.empty(shape=empty_shape)

# Prose chunks first, then table chunks -- both are embedded as plain text.
all_text_chunks = [*text_chunks, *table_chunks]
# The chunk dicts themselves double as the per-vector metadata records.
metadatas = all_text_chunks
texts = [chunk["text"] for chunk in metadatas]
# One embedding request batch over all chunk texts; rows align with `texts`.
text_vectors = np.array(text_embedder.embed_documents(texts))
print(f"âœ… Generated {text_vectors.shape[0]} text embeddings.")


  text_embedder = OpenAIEmbeddings(model="text-embedding-3-small")
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Fetching 1 files: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1/1 [00:00<?, ?it/s]


âœ… Generated 531 text embeddings.


In [13]:
from langchain.vectorstores import FAISS

# Pair every chunk text with the vector already computed for it, so building the
# index does not trigger a second round of embedding calls.
text_embedding_pairs = list(zip(texts, text_vectors))

# Build FAISS index
db = FAISS.from_embeddings(
    text_embedding_pairs,   # (text, embedding) tuples
    text_embedder,          # embedding model reference (used to embed queries later)
    metadatas=metadatas     # optional metadata per chunk
)

# Save index locally, under the path configured next to PDF_PATH, so the save and
# load cells cannot drift apart.
db.save_local(FAISS_INDEX_PATH)

print(f"âœ… FAISS index saved successfully with {len(text_embedding_pairs)} entries.")


âœ… FAISS index saved successfully with 531 entries.


In [14]:
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Load embeddings (same model that produced the stored vectors, so query vectors
# live in the same space)
text_embedder = OpenAIEmbeddings(model="text-embedding-3-small")

# Load FAISS index from local storage, using the same configured path the save
# cell wrote to.
# SECURITY: allow_dangerous_deserialization=True lets the loader unpickle the stored
# docstore. That is acceptable only because this notebook produced the file itself;
# never point this at an index obtained from an untrusted source.
db = FAISS.load_local(
    FAISS_INDEX_PATH,
    text_embedder,
    allow_dangerous_deserialization=True
)

print("âœ… FAISS index loaded successfully!")


âœ… FAISS index loaded successfully!


In [15]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

# Retriever over the FAISS index: each question pulls the 4 nearest chunks.
retriever = db.as_retriever(search_kwargs={"k": 4})

# Chat model that writes the final answer; temperature 0 for repeatable output.
llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")

# Retrieval-augmented QA chain; source documents are returned with the answer so
# the page/chunk provenance can be displayed.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    return_source_documents=True,
)

print("âœ… RetrievalQA chain ready!")


âœ… RetrievalQA chain ready!


In [16]:
query = "What is the 3-year CAGR for Bajaj Growth Fund?"
# Use .invoke(): calling the chain object directly (qa_chain({...})) is deprecated
# in LangChain and emits a deprecation warning on every run.
response = qa_chain.invoke({"query": query})

print("ðŸ§  Answer:")
print(response["result"])

# Show where the answer came from: page number and chunk id of each retrieved chunk.
print("\nðŸ“„ Sources:")
for doc in response["source_documents"]:
    meta = doc.metadata
    print(f"- Page: {meta.get('page')} | Chunk: {meta.get('chunk_id')}")


  response = qa_chain({"query": query})


PermissionDeniedError: Error code: 403 - {'error': {'message': 'Daily token limit of 103471 reached. Please try again tomorrow.', 'type': 'forbidden', 'code': 'permission_denied'}, 'success': False, 'statusCode': 403}