In [10]:
import os
from unstructured.partition.pdf import partition_pdf
import json
from typing import Dict
from dotenv import load_dotenv
load_dotenv()
from tqdm import tqdm
load_dotenv()

True

In [13]:

dataset_path = "dataset/pdfs"
datasets = [os.path.join(dataset_path, i) for i in os.listdir(dataset_path) if i.endswith(".pdf")]
datasets

['dataset/pdfs/sbi.pdf', 'dataset/pdfs/icici.pdf', 'dataset/pdfs/HDFC.pdf']

In [14]:
def extract_pdf_text_by_page(pdf_path: str) -> dict:
    """
    Extracts text from a PDF file page by page using Unstructured's partition_pdf.

    Args:
        pdf_path (str): Full path to the PDF file.

    Returns:
        dict: A dictionary where keys are page numbers (int) and values are strings of extracted text.

    Raises:
        FileNotFoundError: If the provided file path does not exist.
        ValueError: If extraction fails or no elements are returned.
        Exception: For any other unexpected errors.
    """
    import os

    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"File not found: {pdf_path}")

    try:
        elements = partition_pdf(
            filename=pdf_path,
            strategy=os.getenv("strategy") ,# faster, less accurate (use "hi_res" for better layout)
            infer_table_structure=os.getenv("infer_table_structure")
        )
        if not elements:
            raise ValueError(f"No extractable content found in {pdf_path}")

        pagewise_text = {}

        for el in elements:
            page_num = el.metadata.page_number or 0

            if page_num not in pagewise_text:
                pagewise_text[page_num] = []

            if el.text:
                clean_text = el.text.strip()
                if clean_text:  # skip empty lines
                    pagewise_text[page_num].append(clean_text)

        # Convert list of lines per page to single string
        return {
            page: "\n".join(lines)
            for page, lines in sorted(pagewise_text.items())
        }

    except Exception as e:
        raise Exception(f"Failed to extract text from {pdf_path}: {str(e)}")
    
    


def create_json(file_path: str, year: int, company: str, data: Dict[int, str]) -> str:
    """
    Creates a structured JSON file from page-wise extracted PDF text.

    Args:
        file_path (str): Path to the original PDF file.
        year (int): The year associated with the document.
        company (str): The company name associated with the document.
        data (Dict[int, str]): Dictionary mapping page numbers to text content.

    Returns:
        str: Path to the saved JSON file.

    Raises:
        ValueError: If data is not a dictionary or contains invalid content.
        Exception: For any file writing or JSON serialization errors.
    """
    if not isinstance(data, dict):
        raise ValueError("Expected `data` to be a dictionary of page_num -> text")

    try:
        final_text = []
        for page_num, text in data.items():
            if not isinstance(page_num, int):
                raise ValueError(f"Invalid page number: {page_num}")
            if not isinstance(text, str):
                raise ValueError(f"Invalid text for page {page_num}")
            final_text.append({
                "page_num": page_num,
                "content": text,
                "year": year,
                "company": company
            })

        # Create output directory if it doesn't exist
        output_dir = "dataset/json"
        os.makedirs(output_dir, exist_ok=True)

        # Build JSON file name from original PDF name
        output_path = os.path.join(
            output_dir,
            f"{os.path.basename(file_path).split('.')[0]}.json"
        )

        # Write to file
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(final_text, f, indent=4, ensure_ascii=False)

        print(f"✅ JSON saved to: {output_path}")
        return output_path

    except Exception as e:
        raise Exception(f"Failed to write JSON for {file_path}: {str(e)}")



def process_pdfs(datasets, year=2024):
    for path in tqdm(datasets):
        data = extract_pdf_text_by_page(path)
        company_name = os.path.basename(path).split(".")[0]
        create_json(path, year, company_name, data)
        

In [15]:

        
process_pdfs(datasets)


 33%|███▎      | 1/3 [01:18<02:37, 78.99s/it]

✅ JSON saved to: dataset/json/sbi.json


 67%|██████▋   | 2/3 [02:31<01:15, 75.11s/it]

✅ JSON saved to: dataset/json/icici.json


100%|██████████| 3/3 [03:26<00:00, 68.76s/it]

✅ JSON saved to: dataset/json/HDFC.json





# cleaning data

In [5]:
import os 
dataset_path = "dataset/json"
datasets = [os.path.join(dataset_path, i) for i in os.listdir(dataset_path) if i.endswith(".json")]
datasets

['dataset/json/icici_2024.json',
 'dataset/json/sbi_2024.json',
 'dataset/json/HDFC_2024.json']

In [8]:
import re

import re

def clean_pdf_json_content(data: list) -> list:
    """
    Cleans the 'content' field in a list of dictionaries extracted from PDFs
    and adds a new key 'clean_content' with the cleaned version.

    Cleaning operations include:
    - Removing hyphenated line breaks
    - Collapsing multiple spaces
    - Stripping table borders
    - Removing page numbers
    - Normalizing whitespace

    Parameters:
        data (list): List of dictionaries, each expected to have a 'content' key with string value.

    Returns:
        list: The same list with an additional 'clean_content' key in each dictionary.
    """
    def clean_text(text: str) -> str:
        text = re.sub(r'-\n(\w+)', r'\1', text)                     # Fix hyphenated words
        text = re.sub(r'[ ]{2,}', ' ', text)                        # Collapse multiple spaces
        text = re.sub(r'[─═╚╩╝╔╦╗╠╣╬]+', '', text)                  # Remove table borders
        text = re.sub(r'^\s*(Page|PAGE)?\s*\d+\s*$', '', text, flags=re.MULTILINE)  # Remove page numbers
        text = re.sub(r'\n{2,}', '\n', text)                        # Remove excessive line breaks
        text = re.sub(r'\s+', ' ', text)                            # Normalize whitespace
        return text.strip()
    
    for idx, item in enumerate(data):
        try:
            if 'content' in item and isinstance(item['content'], str):
                item['clean_content'] = clean_text(item['content'])
            else:
                print(f"[WARN] Skipping index {idx}: Missing or non-string 'content'")
        except Exception as e:
            print(f"[ERROR] Failed to process index {idx}: {e}")

    return data



import json
from tqdm import tqdm
for files in tqdm(datasets):
    with open(files, 'r') as f:
        data = json.load(f)

    updated_data = clean_pdf_json_content(data)

    with open(files, 'w') as f:
        json.dump(updated_data, f, indent=2)



100%|██████████| 3/3 [00:00<00:00,  8.97it/s]


In [5]:
import os
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
load_dotenv()

class SummaryOutput(BaseModel):
    """Structured output format for the summarizer."""
    summary: str = Field(..., description="Concise summary of the input content.")

# Setup LLM with structured output
llm = ChatOpenAI(
    model=os.getenv("model_name"),
    base_url=os.getenv("base_url"),
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=float(os.getenv("TEMPERATURE", 0))
).with_structured_output(SummaryOutput)

# System message prompt with an input variable
system_prompt = """You are a precise and concise summarization agent.

Your goal is to summarize **any kind of text** — whether it’s a formal financial report, business update, meeting note, press release, or generic content. Your summaries should always be crisp, context-aware, and free of filler.

Rules:
1. If the input includes numbers (financial data, metrics, dates, percentages), **include them exactly** in the summary.
2. If the input contains financial insights, strategy, risks, or leadership commentary — **highlight those clearly**.
3. If the input is administrative or doesn't contain meaningful content, return:
   {{ "summary": "No substantive content available to summarize." }}
4. Do NOT infer or fabricate numbers, people, or insights that are not clearly present.
5. Always respond ONLY in the following JSON format:
   {{ "summary": "..." }}
"""

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input_text}")  # Add human input block here
])

input_text = """HDFC BANK LIMITED Registered Office: HDFC Bank House, Senapati Bapat Marg, Lower Parel (W), Mumbai 400 013. [CIN: L65920MH1994PLC080618] [E-Mail: shareholder.grievances@hdfcbank.com] [Website: www.hdfcbank.com] [Tel. Nos.: 022 6631 6000] NOTICE IS HEREBY GIVEN THAT THE THIRTIETH (30TH) ANNUAL GENERAL MEETING (AGM) OF THE MEMBERS OF HDFC BANK LIMITED (THE “BANK”) WILL BE HELD ON FRIDAY, AUGUST 9, 2024 AT 02:30 P.M. INDIAN STANDARD TIME (“IST”). THE AGM SHALL BE HELD BY MEANS OF VIDEO CONFERENCING (“VC”) / OTHER AUDIO-VISUAL MEANS (“OAVM”) IN ACCORDANCE WITH THE RELEVANT CIRCULARS ISSUED BY THE MINISTRY OF CORPORATE AFFAIRS, TO TRANSACT THE FOLLOWING BUSINESS: ORDINARY BUSINESS: 1. To receive, consider and adopt the audited financial statements (standalone) of the Bank for the financial year ended March 31, 2024 along with the Reports of the Board of Directors and Auditors thereon. 2. To receive, consider and adopt the audited financial statements (consolidated) of the Bank for the financial year ended March 31, 2024 along with the Report of Auditors thereon. 3. To consider declaration of dividend on Equity Shares. 4. To appoint a Director in place of Mr. Bhavesh Zaveri (DIN: 01550468), who retires by rotation and being eligible, offers himself for re-appointment. rules made thereunder and pursuant to Section 30 of the Banking Regulation Act, 1949 and the guidelines for Appointment of Statutory Central Auditors (SCAs)/ Statutory Auditors (SAs) of Commercial Banks (excluding RRBs), UCBs and NBFCs (including HFCs) dated April 27, 2021 (“Guidelines”) issued by the Reserve Bank of India (RBI) including any amendments, modifications, variations or re-enactments thereof (collectively “Applicable Laws”) and pursuant to the approval of the RBI dated May 30, 2024, M/s. Batliboi & Purohit, Chartered Accountants, (ICAI Firm Registration No. 101048W) (“Batliboi & Purohit”), who have offered themselves for appointment and have confirmed their eligibility to be appointed as one of the Joint Statutory Auditors in terms of Section 141 of the Act and applicable rules made thereunder and the Guidelines, be and are hereby appointed as one of the Joint Statutory Auditors of the Bank, to hold office for a period of 3 (three) years with effect from FY 2024-25 till and including FY 2026-27, subject to the approval of the RBI as and when required during this tenure, for the purpose of audit including reporting on internal financial controls of the Bank’s accounts at its head office, branches and other offices, with power to the Board of Directors (hereinafter referred to as the “Board”, which term shall be deemed to include any Committee(s) of the Board or any other persons to whom powers are delegated by the Board as permitted under the Act and/or rules made thereunder), to alter and vary the terms and conditions of appointment, and such other things including but not limited to reason of necessity on account of conditions as may be stipulated by the RBI and / or any other authority. 5. To appoint a director in place of Mr. Keki Mistry (DIN: 00008886), who retires by rotation and, being eligible, offers himself for re-appointment. 6. To appoint M/s. Batliboi & Purohit, Chartered Accountants as Joint Statutory Auditors and to fix the overall remuneration of the Joint Statutory Auditors and in this regard, to consider and if thought fit, to pass, the following resolution, as an Ordinary Resolution: “RESOLVED THAT, pursuant to the provisions of Sections 139, 141 and other applicable provisions, if any, of the Companies Act, 2013 (the “Act”) and the relevant RESOLVED FURTHER THAT subject to applicable laws and regulations including the relevant Guidelines and circulars of the RBI (as may be amended, restated, modified or, replaced from time to time) and pursuant to approval of the RBI in this regard received on May 30, 2024, M/s. Price Waterhouse LLP, Chartered Accountants (ICAI Firm Registration No. 301112E/ E300264) (‘Price Waterhouse LLP’) who were already appointed as one of the Joint Statutory Auditors of the Bank at the 28th 0 1"""

# Build and run the chain
chain = prompt | llm
response = chain.invoke({"input_text": input_text})

# Output
print(response)  # Since response is now a Pydantic model (SummaryOutput), no `.choices[0]` needed


summary='HDFC Bank Limited is scheduled to hold its 30th Annual General Meeting on August 9, 2024. The meeting will be conducted via video conferencing and will include several business matters such as adopting audited financial statements, considering dividend declarations, appointing directors, and selecting statutory auditors. Batliboi & Purohit and Price Waterhouse LLP have been appointed as joint statutory auditors for a three-year term starting from FY 2024-25. The meeting will also consider resolutions related to these appointments.'


In [2]:
import os
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
load_dotenv()

class SummaryOutput(BaseModel):
    """Structured output format for the summarizer."""
    summary: str = Field(..., description="Concise summary of the input content.")

# Setup LLM with structured output
llm = ChatOpenAI(
    model="llama3.2",
    base_url=os.getenv("base_url"),
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=float(os.getenv("TEMPERATURE", 0))
).with_structured_output(SummaryOutput)

# System message prompt with an input variable
system_prompt = """You are a precise and concise summarization agent.

Your goal is to summarize **any kind of text** — whether it’s a formal financial report, business update, meeting note, press release, or generic content. Your summaries should always be crisp, context-aware, and free of filler.

Rules:
1. If the input includes numbers (financial data, metrics, dates, percentages), **include them exactly** in the summary.
2. If the input contains financial insights, strategy, risks, or leadership commentary — **highlight those clearly**.
3. If the input is administrative or doesn't contain meaningful content, return:
   {{ "summary": "No substantive content available to summarize." }}
4. Do NOT infer or fabricate numbers, people, or insights that are not clearly present.
5. Always respond ONLY in the following JSON format:
   {{ "summary": "..." }}
"""

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input_text}")  # Add human input block here
])

input_text = """HDFC BANK LIMITED Registered Office: HDFC Bank House, Senapati Bapat Marg, Lower Parel (W), Mumbai 400 013. [CIN: L65920MH1994PLC080618] [E-Mail: shareholder.grievances@hdfcbank.com] [Website: www.hdfcbank.com] [Tel. Nos.: 022 6631 6000] NOTICE IS HEREBY GIVEN THAT THE THIRTIETH (30TH) ANNUAL GENERAL MEETING (AGM) OF THE MEMBERS OF HDFC BANK LIMITED (THE “BANK”) WILL BE HELD ON FRIDAY, AUGUST 9, 2024 AT 02:30 P.M. INDIAN STANDARD TIME (“IST”). THE AGM SHALL BE HELD BY MEANS OF VIDEO CONFERENCING (“VC”) / OTHER AUDIO-VISUAL MEANS (“OAVM”) IN ACCORDANCE WITH THE RELEVANT CIRCULARS ISSUED BY THE MINISTRY OF CORPORATE AFFAIRS, TO TRANSACT THE FOLLOWING BUSINESS: ORDINARY BUSINESS: 1. To receive, consider and adopt the audited financial statements (standalone) of the Bank for the financial year ended March 31, 2024 along with the Reports of the Board of Directors and Auditors thereon. 2. To receive, consider and adopt the audited financial statements (consolidated) of the Bank for the financial year ended March 31, 2024 along with the Report of Auditors thereon. 3. To consider declaration of dividend on Equity Shares. 4. To appoint a Director in place of Mr. Bhavesh Zaveri (DIN: 01550468), who retires by rotation and being eligible, offers himself for re-appointment. rules made thereunder and pursuant to Section 30 of the Banking Regulation Act, 1949 and the guidelines for Appointment of Statutory Central Auditors (SCAs)/ Statutory Auditors (SAs) of Commercial Banks (excluding RRBs), UCBs and NBFCs (including HFCs) dated April 27, 2021 (“Guidelines”) issued by the Reserve Bank of India (RBI) including any amendments, modifications, variations or re-enactments thereof (collectively “Applicable Laws”) and pursuant to the approval of the RBI dated May 30, 2024, M/s. Batliboi & Purohit, Chartered Accountants, (ICAI Firm Registration No. 101048W) (“Batliboi & Purohit”), who have offered themselves for appointment and have confirmed their eligibility to be appointed as one of the Joint Statutory Auditors in terms of Section 141 of the Act and applicable rules made thereunder and the Guidelines, be and are hereby appointed as one of the Joint Statutory Auditors of the Bank, to hold office for a period of 3 (three) years with effect from FY 2024-25 till and including FY 2026-27, subject to the approval of the RBI as and when required during this tenure, for the purpose of audit including reporting on internal financial controls of the Bank’s accounts at its head office, branches and other offices, with power to the Board of Directors (hereinafter referred to as the “Board”, which term shall be deemed to include any Committee(s) of the Board or any other persons to whom powers are delegated by the Board as permitted under the Act and/or rules made thereunder), to alter and vary the terms and conditions of appointment, and such other things including but not limited to reason of necessity on account of conditions as may be stipulated by the RBI and / or any other authority. 5. To appoint a director in place of Mr. Keki Mistry (DIN: 00008886), who retires by rotation and, being eligible, offers himself for re-appointment. 6. To appoint M/s. Batliboi & Purohit, Chartered Accountants as Joint Statutory Auditors and to fix the overall remuneration of the Joint Statutory Auditors and in this regard, to consider and if thought fit, to pass, the following resolution, as an Ordinary Resolution: “RESOLVED THAT, pursuant to the provisions of Sections 139, 141 and other applicable provisions, if any, of the Companies Act, 2013 (the “Act”) and the relevant RESOLVED FURTHER THAT subject to applicable laws and regulations including the relevant Guidelines and circulars of the RBI (as may be amended, restated, modified or, replaced from time to time) and pursuant to approval of the RBI in this regard received on May 30, 2024, M/s. Price Waterhouse LLP, Chartered Accountants (ICAI Firm Registration No. 301112E/ E300264) (‘Price Waterhouse LLP’) who were already appointed as one of the Joint Statutory Auditors of the Bank at the 28th 0 1"""

# Build and run the chain
chain = prompt | llm
response = chain.invoke({"input_text": input_text})

# Output
print(response)  # Since response is now a Pydantic model (SummaryOutput), no `.choices[0]` needed


summary="HDFC Bank Limited's 30th Annual General Meeting (AGM) will be held on Friday, August 9, 2024, at 02:30 PM IST via video conferencing. Key agenda items include adopting audited financial statements for FY 2024, declaring dividend on equity shares, appointing a director in place of Mr. Bhavesh Zaveri and Mr. Keki Mistry, and appointing M/s. Batliboi & Purohit as Joint Statutory Auditors with an overall remuneration to be determined."


In [3]:
response.summary

"HDFC Bank Limited's 30th Annual General Meeting (AGM) will be held on Friday, August 9, 2024, at 02:30 PM IST via video conferencing. Key agenda items include adopting audited financial statements for FY 2024, declaring dividend on equity shares, appointing a director in place of Mr. Bhavesh Zaveri and Mr. Keki Mistry, and appointing M/s. Batliboi & Purohit as Joint Statutory Auditors with an overall remuneration to be determined."

# Faiss

In [None]:
from sentence_transformers import SentenceTransformer
import faiss, pickle
datasets = [os.path.join(dataset_path, i) for i in os.listdir(dataset_path) if i.endswith(".json")]
all_chunks = []
for fpath in datasets:
    with open(fpath) as f:
        all_chunks.extend(json.load(f))

model = SentenceTransformer("all-MiniLM-L6-v2")

texts = [chunk["summarized"] for chunk in all_chunks]
metas = all_chunks

embeddings = model.encode(texts, show_progress_bar=True)

index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

faiss.write_index(index, "vector_db/financial.index")
with open("vector_db/financial_meta.pkl", "wb") as f:
    pickle.dump(metas, f)


  from .autonotebook import tqdm as notebook_tqdm


KeyError: 'summary'

In [None]:
from pinecone import Pinecone

pc = Pinecone(api_key=os.getenv("pincone_api"))
index = pc.Index("financial-rag")

In [37]:
from pinecone import Pinecone
import pinecone
from sentence_transformers import SentenceTransformer
import uuid
import logging
from typing import List, Dict
import os
import json

dataset_path = "dataset/json"
datasets = [os.path.join(dataset_path, i) for i in os.listdir(dataset_path) if i.endswith(".json")]
all_chunks = []
for fpath in datasets:
    with open(fpath) as f:
        all_chunks.extend(json.load(f))

pc = Pinecone(api_key=os.getenv("pincone_api"), host = os.getenv("pincone_host"))

def create_index_if_not_exists(index_name: str, dimension: int):
    # Check if the index exists and create it if not
    if index_name not in pc.list_indexes():
        print(f"Index '{index_name}' not found. Creating index...")
        pc.create_index(
    name=index_name,
    dimension=384,
    metric="cosine",
    spec={
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    }
)
        print(f"Index '{index_name}' created successfully.")
    else:
        print(f"Index '{index_name}' already exists.")



def upsert_to_pinecone(
    chunks: List[Dict],

    index_name: str = "financial-rag",
    embedding_model_name: str = "all-MiniLM-L6-v2"
) -> None:
    """
    Embeds and upserts chunked data to Pinecone with metadata.

    Args:
        chunks (List[Dict]): List of dicts with keys like 'company', 'year', 'page', 'summary', and 'clean_content'.
        pinecone_api_key (str): Your Pinecone API key.
        pinecone_env (str): Your Pinecone environment region (e.g., "gcp-starter").
        index_name (str): Name of the Pinecone index to use (default: "financial-rag").
        embedding_model_name (str): Sentence transformer model to use for embedding (default: MiniLM).

    Returns:
        None
    """

    try:
        print(pc.list_indexes())
        # Load index
            # First, ensure the index exists
        create_index_if_not_exists(index_name, dimension=348)  # Match the dimension of the embedding model


        index = pc.Index(index_name)

        # Load model
        model = SentenceTransformer(embedding_model_name)

        vectors = []
        for chunk in tqdm(chunks):
            try:
                vector = model.encode(chunk["summarized"]).tolist()

                metadata = {
                    "company": chunk.get("company", chunk['company']),
                    "year": chunk.get("year", chunk['year']),
                    "page": chunk.get("page",chunk['page_num']),
                    "summarized": chunk.get("summarized", ""),
                    "content": chunk.get("clean_content", "")
                }

                vectors.append((str(uuid.uuid4()), vector, metadata))
            except Exception as e:
                print(f"Failed to process chunk on page {chunk.get('page', '?')}: {e}")

        # Batch upsert (in chunks of 100 for large datasets)
        batch_size = 100
        for i in range(0, len(vectors), batch_size):
            batch = vectors[i:i + batch_size]
            index.upsert(vectors=batch)
            print(f"Upserted {i + len(batch)}/{len(vectors)} vectors into Pinecone")

        logging.info("✅ All vectors successfully upserted into Pinecone.")

    except Exception as e:
        print(f"❌ Pinecone upsert failed: {e}")
        raise e

upsert_to_pinecone(all_chunks)

[]
Index 'financial-rag' not found. Creating index...
Index 'financial-rag' created successfully.


100%|██████████| 1557/1557 [00:19<00:00, 80.13it/s]


Upserted 100/1557 vectors into Pinecone
Upserted 200/1557 vectors into Pinecone
Upserted 300/1557 vectors into Pinecone
Upserted 400/1557 vectors into Pinecone
Upserted 500/1557 vectors into Pinecone
Upserted 600/1557 vectors into Pinecone
Upserted 700/1557 vectors into Pinecone
Upserted 800/1557 vectors into Pinecone
Upserted 900/1557 vectors into Pinecone
Upserted 1000/1557 vectors into Pinecone
Upserted 1100/1557 vectors into Pinecone
Upserted 1200/1557 vectors into Pinecone
Upserted 1300/1557 vectors into Pinecone
Upserted 1400/1557 vectors into Pinecone
Upserted 1500/1557 vectors into Pinecone
Upserted 1557/1557 vectors into Pinecone


In [None]:
from  pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from typing import List, Dict
import os
pc = Pinecone(api_key=os.getenv("pincone_api"), host = os.getenv("pincone_host"))
def query_pinecone(
    query: str,
    index_name: str = "financial-rag",
    top_k: int = 5,
    filter_by: Dict = None,
    embedding_model: str = "all-MiniLM-L6-v2"
) -> List[Dict]:
    """
    Query Pinecone vector DB using semantic similarity and optional filters.

    Args:
        query (str): User's natural language question.
        api_key (str): Pinecone API key.
        host (str): Pinecone serverless host URL.
        index_name (str): Pinecone index name.
        top_k (int): Number of results to retrieve.
        filter_by (Dict): Optional metadata filter (e.g., {"company": "HDFC", "year": "2023-2024"}).
        embedding_model (str): SentenceTransformer model to use.

    Returns:
        List[Dict]: Retrieved results with metadata and similarity score.
    """

    try:
        # Initialize Pinecone
        
        index = pc.Index(index_name)

        # Load embedding model
        model = SentenceTransformer(embedding_model)
        query_vector = model.encode(query).tolist()

        # Build query payload
        query_args = {
            "vector": query_vector,
            "top_k": top_k,
            "include_metadata": True
        }
        if filter_by:
            query_args["filter"] = filter_by

        # Execute query
        results = index.query(**query_args)

        return results.get("matches", [])

    except Exception as e:
        print(f"❌ Query failed: {e}")
        return []

In [39]:

query = "What is SBI’s capital adequacy ratio in 2024?"
data = query_pinecone(query=query)
for r in data:
    print(f"\n✅ Match | Score: {r['score']:.3f}")
    print(f"Company: {r['metadata']['company']}, Year: {r['metadata']['year']}, Page: {r['metadata']['page']}")
    print(f"Snippet:\n{r['metadata']['content'][:500]}...")


✅ Match | Score: 0.681
Company: sbi, Year: 2024.0, Page: 320.0
Snippet:
Company Overview Responsible Approach Governance Statutory Reports Financial Statements (e) Common Equity Tier 1, Tier 1 and Total Capital Ratios: For the top consolidated group; and For significant bank subsidiaries (stand alone or sub-consolidated depending on how the Framework is applied) Capital adequacy ratios as on 31st March 2024 CET 1 (%) Tier 1 (%) Total (%) SBI Group 10.57 12.06 14.38 State Bank of India 10.36 11.93 14.28 SBI (Mauritius) Ltd. 19.24 19.24 20.28 State Bank of India (Cana...

✅ Match | Score: 0.596
Company: sbi, Year: 2024.0, Page: 400.0
Snippet:
SBI’s Economic Performance FY 24 (INR Crore) FY 23 (INR Crore) FY 22 (INR Crore) Payments to government (net cash outgo on amount of corporate income tax) 25,173 12,677 3,529 Community investment Total economic value distributed 1,55,663 1,20,821 96,049 Economic value retained Total economic value retained 3,11,150 2,47,898 2,19,972 Bank’s Financia

In [42]:

query = "How much  equity shares  paid to employees paid by HDFC in 2024?"
data = query_pinecone(query=query)
for r in data:
    print(f"\n✅ Match | Score: {r['score']:.3f}")
    print(f"Company: {r['metadata']['company']}, Year: {r['metadata']['year']}, Page: {r['metadata']['page']}")
    print(f"Snippet:\n{r['metadata']['content'][:500]}...")


✅ Match | Score: 0.683
Company: HDFC, Year: 2024.0, Page: 542.0
Snippet:
Overview Introduction Our Performance How We Create Value Our Strategy Responsible Business Statutory Reports and Financial Statements 41,05,375 folios comprising of 758,92,48,751 equity shares forming 99. 90% of the share capital are in demat form. 16,440 folios comprising of 76,61,911 equity shares forming 0.10 % of the share capital are in physical form. The shares of the Bank are widely traded on the stock exchanges. SHARES LYING ACCOUNT Particulars IN UNCLAIMED SUSPENSE Records / No. of sha...

✅ Match | Score: 0.682
Company: HDFC, Year: 2024.0, Page: 272.0
Snippet:
Overview Introduction Our Performance How We Create Value Our Strategy Responsible Business Statutory Reports and Financial Statements During the year ended March 31, 2024, the fees paid to the Joint Statutory Auditor(s) and their respective network firms on aggregated basis are as follows: (` In crores) Fees (excluding taxes)* Statutory Audit Ce

In [41]:

query = "Bank of announces ESOS 2024 plan to issue up to 9,50,00,000 fully paid-up equit ?"
data = query_pinecone(query=query)
for r in data:
    print(f"\n✅ Match | Score: {r['score']:.3f}")
    print(f"Company: {r['metadata']['company']}, Year: {r['metadata']['year']}, Page: {r['metadata']['page']}")
    print(f"Snippet:\n{r['metadata']['content'][:500]}...")


✅ Match | Score: 0.747
Company: HDFC, Year: 2024.0, Page: 20.0
Snippet:
service, future potential contribution and conduct of the employee and such other relevant factors as may be deemed appropriate by it. holders to subscribe to an aggregate of 9,50,00,000 (Nine Crore and Fifty Lakhs) fully paid-up equity shares of Bank of the face value of ` 1/- (Rupee One) each. I. MAXIMUM NUMBER OF ESOPs TO BE OFFERED / ISSUED PER EMPLOYEE AND IN AGGREGATE Maximum of 9,50,00,000 (Nine Crore and Fifty Lakhs) ESOPs or such adjusted numbers for any bonus issues of Equity Shares or...

✅ Match | Score: 0.673
Company: HDFC, Year: 2024.0, Page: 21.0
Snippet:
Employees, or as per the code of conduct of the Bank or as per the employment contracts/terms, in accordance with the RBI Guidelines on Compensation of Whole Time Directors / Chief Executive Officers / Material Risk Takers and Control Function Staff dated November 4, 2019, as may be amended, replaced, substituted, restated from time to time. Mr. Sa