In [17]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
import json


import fitz
import logging
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_ollama import OllamaLLM, OllamaEmbeddings
from langchain.schema import Document
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.document_loaders import PDFPlumberLoader
from tqdm.notebook import tqdm
logger = logging.getLogger(__name__)





In [2]:
folder_path = "docsJuly"
pdf_files = list(Path(folder_path).glob("*.pdf"))
[(f, round(f.stat().st_size/1024**2, 2)) for f in pdf_files]

[(WindowsPath('docsJuly/Official Statement posted 05-20-2025 (6.4 MB).pdf'),
  6.39),
 (WindowsPath('docsJuly/P11871362-P11431791-P11876664.pdf'), 5.63)]

In [3]:
metadata_path = "./vector_db/metadata.json"
metadata_path = Path(metadata_path)
metadata_path.parent.mkdir(parents=True, exist_ok=True)
with open(metadata_path, 'r', encoding='utf-8') as f:
    metadata = json.load(f)


In [15]:

text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

In [30]:
loader = PDFPlumberLoader("docsJuly\Official Statement posted 05-20-2025 (6.4 MB).pdf")

  loader = PDFPlumberLoader("docsJuly\Official Statement posted 05-20-2025 (6.4 MB).pdf")


In [32]:
docs = []
for pdf_file in tqdm(pdf_files):
    print(pdf_file)
    loader = PDFPlumberLoader(pdf_file)
    _doc = loader.load()
    print(f"Extracted {len(_doc)} pages from {pdf_file}")
    _doc = text_splitter.split_documents(_doc)
    print(f"Split into {len(_doc)} chunks")
    docs.extend(_doc)


  0%|          | 0/2 [00:00<?, ?it/s]

docsJuly\Official Statement posted 05-20-2025 (6.4 MB).pdf
Extracted 115 pages from docsJuly\Official Statement posted 05-20-2025 (6.4 MB).pdf
Split into 476 chunks
docsJuly\P11871362-P11431791-P11876664.pdf
Extracted 556 pages from docsJuly\P11871362-P11431791-P11876664.pdf
Split into 2357 chunks


In [35]:
docs[0].metadata

{'source': 'docsJuly\\Official Statement posted 05-20-2025 (6.4 MB).pdf',
 'file_path': 'docsJuly\\Official Statement posted 05-20-2025 (6.4 MB).pdf',
 'page': 0,
 'total_pages': 115,
 'Author': 'Trenise L. Lewis',
 'CreationDate': "D:20250519171640-05'00'",
 'Creator': 'Microsoft® Word for Microsoft 365',
 'ModDate': "D:20250519171912-05'00'",
 'Producer': 'Microsoft® Word for Microsoft 365',
 'Title': ''}

In [18]:
embeddings = OllamaEmbeddings(model="nomic-embed-text:latest")
vectorstore = FAISS.from_documents(docs, embeddings)
vectorstore.save_local("vector_db")

In [22]:
bm25_retriever = BM25Retriever.from_documents(docs)
bm25_retriever.k = 4

In [23]:
hybrid_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, vectorstore.as_retriever()],
    weights=[0.5, 0.5]
)

In [24]:
hybrid_retriever.invoke("find use of proceeds")

[Document(metadata={'source': 'docsJuly\\P11871362-P11431791-P11876664.pdf', 'file_path': 'docsJuly\\P11871362-P11431791-P11876664.pdf', 'page': 434, 'total_pages': 556, 'CreationDate': "D:20250702091603-04'00'", 'Creator': 'PScript5.dll Version 5.2.2', 'ModDate': "D:20250702190747-04'00'", 'Producer': 'Acrobat Distiller 25.0 (Windows)', 'Title': ''}, page_content='Arbitrage. The Authority will make no use of the proceeds of the Bonds or of any\nother amounts or property, regardless of the source, and will not take or omit any action, that would cause\nany Tax-Exempt Bonds to be “arbitrage bonds” within the meaning of Section 148 of the Code;\nFederal Guarantee. The Authority will make no use of the proceeds of any Tax-\nExempt Bonds and not take or omit to take any action that would cause such Tax-Exempt Bonds to be\n“federally guaranteed” within the meaning of Section 149(b) of the Code;\nInformation Reporting. The Authority will take or cause to be taken all necessary\naction to com

In [None]:

def extract_text_from_pdf( pdf_path: str) -> str:
    """Extract text from a single PDF file."""
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        doc.close()
        return text
    except Exception as e:
        logger.error(f"Error extracting text from {pdf_path}: {e}")
        return ""

In [None]:

def vectorize_pdf(pdf_file):
    text = extract_text_from_pdf(pdf_file)
    documents = []
    if text.strip():
        chunks = text_splitter.split_text(text)
        for i, chunk in tqdm(enumerate(chunks), desc="chunks of PDF"):
            doc = Document(
                page_content=chunk,
                metadata={
                    # "source": str(pdf_file),
                    "filename": pdf_file.name,
                    "chunk_id": i
                }
            )
        documents.append(doc)
    return documents

all_documents = []
for pdf_file in tqdm(pdf_files, desc="PDFs"):
    documents = vectorize_pdf(pdf_file)
    all_documents.extend(documents)


PDFs:   0%|          | 0/2 [00:00<?, ?it/s]

chunks of PDF: 0it [00:00, ?it/s]

chunks of PDF: 0it [00:00, ?it/s]

In [50]:
all_documents[-1]

Document(metadata={'filename': 'P11871362-P11431791-P11876664.pdf', 'chunk_id': 2228}, page_content='securities laws and official interpretations thereof. \nTHE METROPOLITAN WATER DISTRICT \nOF SOUTHERN CALIFORNIA \nBy: ___________________________________ \nKatano Kasaine \nAssistant General Manager/ \nChief Financial Officer \nAPPROVED AS TO FORM: \nMARCIA SCULLY, General Counsel \nBy: ___________________________________ \n \n \n \n[ THIS PAGE INTENTIONALLY LEFT BLANK ]')

In [None]:
vector_db = FAISS.from_documents(all_documents, embeddings)

[]