In [None]:
import os
from unstructured.partition.pdf import partition_pdf
import json
from typing import Dict
from dotenv import load_dotenv
load_dotenv()
from tqdm import tqdm
load_dotenv()

True

In [13]:

# Gather every PDF that sits directly inside the input folder.
# The resulting order is whatever os.listdir returns.
dataset_path = "dataset/pdfs"
datasets = [
    os.path.join(dataset_path, file_name)
    for file_name in os.listdir(dataset_path)
    if file_name.endswith(".pdf")
]
datasets

['dataset/pdfs/sbi.pdf', 'dataset/pdfs/icici.pdf', 'dataset/pdfs/HDFC.pdf']

In [14]:
def _env_flag(name: str, default: bool = False) -> bool:
    """Read environment variable `name` as a boolean ("1", "true", "yes", "on" => True)."""
    raw = os.getenv(name)
    if raw is None or not raw.strip():
        return default
    return raw.strip().lower() in {"1", "true", "yes", "on"}


def extract_pdf_text_by_page(pdf_path: str) -> dict:
    """
    Extracts text from a PDF file page by page using Unstructured's partition_pdf.

    Two environment variables configure the extraction:
      * ``strategy``: forwarded as the partitioning strategy; falls back to
        "auto" when unset or empty.
      * ``infer_table_structure``: parsed as a boolean flag ("1", "true",
        "yes", "on", case-insensitive => True; anything else => False).

    Args:
        pdf_path (str): Full path to the PDF file.

    Returns:
        dict: A dictionary where keys are page numbers (int) and values are strings of
            extracted text (elements joined with newlines). Elements without a page
            number are collected under key 0. Keys are inserted in ascending order.

    Raises:
        FileNotFoundError: If the provided file path does not exist.
        ValueError: If no elements are returned (or partition_pdf itself raises ValueError).
        Exception: For any other unexpected error; the original error is chained as
            ``__cause__``.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"File not found: {pdf_path}")

    try:
        elements = partition_pdf(
            filename=pdf_path,
            # os.getenv returns None when the variable is unset; use a valid default instead.
            strategy=os.getenv("strategy") or "auto",
            # os.getenv returns a string, and any non-empty string (even "false") is truthy,
            # so the value has to be parsed rather than passed through.
            infer_table_structure=_env_flag("infer_table_structure"),
        )
        if not elements:
            raise ValueError(f"No extractable content found in {pdf_path}")

        pagewise_text = {}
        for el in elements:
            page_num = el.metadata.page_number or 0
            # Register the page even when this element contributes no text.
            lines = pagewise_text.setdefault(page_num, [])
            clean_text = (el.text or "").strip()
            if clean_text:  # skip empty lines
                lines.append(clean_text)

        # Convert list of lines per page to single string
        return {
            page: "\n".join(lines)
            for page, lines in sorted(pagewise_text.items())
        }

    except ValueError:
        # Documented contract: let ValueError reach the caller unchanged.
        raise
    except Exception as e:
        raise Exception(f"Failed to extract text from {pdf_path}: {str(e)}") from e
    
    


def create_json(
    file_path: str,
    year: int,
    company: str,
    data: Dict[int, str],
    output_dir: str = "dataset/json",
) -> str:
    """
    Creates a structured JSON file from page-wise extracted PDF text.

    The output is a JSON list with one object per page, each holding the keys
    "page_num", "content", "year" and "company". The file is named after the
    PDF's base name with its final extension replaced by ".json".

    Args:
        file_path (str): Path to the original PDF file.
        year (int): The year associated with the document.
        company (str): The company name associated with the document.
        data (Dict[int, str]): Dictionary mapping page numbers to text content.
        output_dir (str): Directory the JSON file is written to; created if missing.
            Defaults to "dataset/json".

    Returns:
        str: Path to the saved JSON file.

    Raises:
        ValueError: If data is not a dictionary or contains invalid content.
        Exception: For any file writing or JSON serialization errors; the original
            error is chained as ``__cause__``.
    """
    if not isinstance(data, dict):
        raise ValueError("Expected `data` to be a dictionary of page_num -> text")

    # Validate before touching the filesystem so a bad input surfaces as the documented
    # ValueError and never leaves a partially written file behind.
    final_text = []
    for page_num, text in data.items():
        if not isinstance(page_num, int):
            raise ValueError(f"Invalid page number: {page_num}")
        if not isinstance(text, str):
            raise ValueError(f"Invalid text for page {page_num}")
        final_text.append({
            "page_num": page_num,
            "content": text,
            "year": year,
            "company": company
        })

    # splitext strips only the final extension, so "hdfc.2024.pdf" -> "hdfc.2024.json"
    # instead of being truncated to "hdfc.json" (which would collide with other files).
    stem = os.path.splitext(os.path.basename(file_path))[0]
    output_path = os.path.join(output_dir, f"{stem}.json")

    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Write to file
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(final_text, f, indent=4, ensure_ascii=False)
    except Exception as e:
        raise Exception(f"Failed to write JSON for {file_path}: {str(e)}") from e

    print(f"✅ JSON saved to: {output_path}")
    return output_path



def process_pdfs(datasets, year=2024):
    """
    Extract page-wise text from each PDF and save it as a JSON file.

    For every path, the text is extracted with `extract_pdf_text_by_page` and
    written with `create_json`. The company name is the PDF's file name without
    its final extension.

    Args:
        datasets: Iterable of PDF file paths.
        year (int): Year recorded in every page entry. Defaults to 2024.

    Any exception from extraction or writing propagates and stops the loop.
    """
    for path in tqdm(datasets):
        data = extract_pdf_text_by_page(path)
        # splitext removes only the last extension; split(".")[0] would cut a name
        # such as "J.P.Morgan.pdf" down to "J".
        company_name = os.path.splitext(os.path.basename(path))[0]
        create_json(path, year, company_name, data)
        

In [15]:

        
# Convert every PDF listed in `datasets` to page-wise JSON.
# The recorded run below took roughly 3.5 minutes for 3 files.
process_pdfs(datasets)


 33%|███▎      | 1/3 [01:18<02:37, 78.99s/it]

✅ JSON saved to: dataset/json/sbi.json


 67%|██████▋   | 2/3 [02:31<01:15, 75.11s/it]

✅ JSON saved to: dataset/json/icici.json


100%|██████████| 3/3 [03:26<00:00, 68.76s/it]

✅ JSON saved to: dataset/json/HDFC.json





# Cleaning data

In [5]:
import os 
# Point the pipeline at the JSON files produced by the extraction step.
# The resulting order is whatever os.listdir returns.
dataset_path = "dataset/json"
datasets = [
    os.path.join(dataset_path, file_name)
    for file_name in os.listdir(dataset_path)
    if file_name.endswith(".json")
]
datasets

['dataset/json/icici_2024.json',
 'dataset/json/sbi_2024.json',
 'dataset/json/HDFC_2024.json']

In [8]:
import re

import re

def clean_pdf_json_content(data: list) -> list:
    """
    Add a cleaned copy of each record's 'content' under the key 'clean_content'.

    The cleaning is an ordered series of regex substitutions followed by a strip:
      1. rejoin words split by a hyphen at a line break
      2. collapse runs of two or more spaces
      3. drop box-drawing table border characters
      4. blank out lines holding only an optional "Page"/"PAGE" and a number
      5. squeeze consecutive newlines
      6. turn every remaining whitespace run into a single space

    Records without a string 'content' are reported with a [WARN] line and left
    unchanged; a record that raises while being processed is reported with an
    [ERROR] line and skipped.

    Parameters:
        data (list): List of dictionaries, each expected to have a 'content' key with string value.

    Returns:
        list: The same list object, with records updated in place.
    """
    # (pattern, replacement, flags) applied strictly in this order.
    substitutions = (
        (r'-\n(\w+)', r'\1', 0),
        (r'[ ]{2,}', ' ', 0),
        (r'[─═╚╩╝╔╦╗╠╣╬]+', '', 0),
        (r'^\s*(Page|PAGE)?\s*\d+\s*$', '', re.MULTILINE),
        (r'\n{2,}', '\n', 0),
        (r'\s+', ' ', 0),
    )

    def clean_text(text: str) -> str:
        for pattern, replacement, flags in substitutions:
            text = re.sub(pattern, replacement, text, flags=flags)
        return text.strip()

    for position, record in enumerate(data):
        try:
            has_text = 'content' in record and isinstance(record['content'], str)
            if not has_text:
                print(f"[WARN] Skipping index {position}: Missing or non-string 'content'")
                continue
            record['clean_content'] = clean_text(record['content'])
        except Exception as e:
            print(f"[ERROR] Failed to process index {position}: {e}")

    return data



import json
from tqdm import tqdm
# Rewrite each JSON file in place with a 'clean_content' field added to every record.
for json_path in tqdm(datasets):
    # The extraction step writes these files as UTF-8 with non-ASCII characters kept
    # as-is, so read them as UTF-8 instead of relying on the platform default encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    updated_data = clean_pdf_json_content(data)

    # json.dump escapes non-ASCII by default, so the output is plain ASCII; the explicit
    # encoding only makes the write independent of the locale.
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(updated_data, f, indent=2)



100%|██████████| 3/3 [00:00<00:00,  8.97it/s]


In [5]:
import os
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
load_dotenv()

# Schema for the summarizer's reply: a single required string field, `summary`.
# NOTE(review): this class is passed to `with_structured_output` below, so its docstring
# and the Field description are presumably forwarded to the model as part of the output
# schema — confirm before rewording either string.
class SummaryOutput(BaseModel):
    """Structured output format for the summarizer."""
    summary: str = Field(..., description="Concise summary of the input content.")

# Build the chat client from environment settings (model_name, base_url, OPENAI_API_KEY,
# TEMPERATURE with a fallback of 0) ...
chat_model = ChatOpenAI(
    model=os.getenv("model_name"),
    base_url=os.getenv("base_url"),
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=float(os.getenv("TEMPERATURE", 0)),
)
# ... then wrap it so that responses come back as SummaryOutput instances.
llm = chat_model.with_structured_output(SummaryOutput)

# System message: fixed summarization instructions with no input variables of its own.
# The braces in the JSON examples are doubled, presumably so that ChatPromptTemplate
# renders them as literal braces instead of treating them as placeholders.
system_prompt = """You are a precise and concise summarization agent.

Your goal is to summarize **any kind of text** — whether it’s a formal financial report, business update, meeting note, press release, or generic content. Your summaries should always be crisp, context-aware, and free of filler.

Rules:
1. If the input includes numbers (financial data, metrics, dates, percentages), **include them exactly** in the summary.
2. If the input contains financial insights, strategy, risks, or leadership commentary — **highlight those clearly**.
3. If the input is administrative or doesn't contain meaningful content, return:
   {{ "summary": "No substantive content available to summarize." }}
4. Do NOT infer or fabricate numbers, people, or insights that are not clearly present.
5. Always respond ONLY in the following JSON format:
   {{ "summary": "..." }}
"""

# Two-message template: the fixed system instructions, then the text to summarize.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input_text}")  # filled from the "input_text" key passed to invoke()
])

input_text = """HDFC BANK LIMITED Registered Office: HDFC Bank House, Senapati Bapat Marg, Lower Parel (W), Mumbai 400 013. [CIN: L65920MH1994PLC080618] [E-Mail: shareholder.grievances@hdfcbank.com] [Website: www.hdfcbank.com] [Tel. Nos.: 022 6631 6000] NOTICE IS HEREBY GIVEN THAT THE THIRTIETH (30TH) ANNUAL GENERAL MEETING (AGM) OF THE MEMBERS OF HDFC BANK LIMITED (THE “BANK”) WILL BE HELD ON FRIDAY, AUGUST 9, 2024 AT 02:30 P.M. INDIAN STANDARD TIME (“IST”). THE AGM SHALL BE HELD BY MEANS OF VIDEO CONFERENCING (“VC”) / OTHER AUDIO-VISUAL MEANS (“OAVM”) IN ACCORDANCE WITH THE RELEVANT CIRCULARS ISSUED BY THE MINISTRY OF CORPORATE AFFAIRS, TO TRANSACT THE FOLLOWING BUSINESS: ORDINARY BUSINESS: 1. To receive, consider and adopt the audited financial statements (standalone) of the Bank for the financial year ended March 31, 2024 along with the Reports of the Board of Directors and Auditors thereon. 2. To receive, consider and adopt the audited financial statements (consolidated) of the Bank for the financial year ended March 31, 2024 along with the Report of Auditors thereon. 3. To consider declaration of dividend on Equity Shares. 4. To appoint a Director in place of Mr. Bhavesh Zaveri (DIN: 01550468), who retires by rotation and being eligible, offers himself for re-appointment. rules made thereunder and pursuant to Section 30 of the Banking Regulation Act, 1949 and the guidelines for Appointment of Statutory Central Auditors (SCAs)/ Statutory Auditors (SAs) of Commercial Banks (excluding RRBs), UCBs and NBFCs (including HFCs) dated April 27, 2021 (“Guidelines”) issued by the Reserve Bank of India (RBI) including any amendments, modifications, variations or re-enactments thereof (collectively “Applicable Laws”) and pursuant to the approval of the RBI dated May 30, 2024, M/s. Batliboi & Purohit, Chartered Accountants, (ICAI Firm Registration No. 
101048W) (“Batliboi & Purohit”), who have offered themselves for appointment and have confirmed their eligibility to be appointed as one of the Joint Statutory Auditors in terms of Section 141 of the Act and applicable rules made thereunder and the Guidelines, be and are hereby appointed as one of the Joint Statutory Auditors of the Bank, to hold office for a period of 3 (three) years with effect from FY 2024-25 till and including FY 2026-27, subject to the approval of the RBI as and when required during this tenure, for the purpose of audit including reporting on internal financial controls of the Bank’s accounts at its head office, branches and other offices, with power to the Board of Directors (hereinafter referred to as the “Board”, which term shall be deemed to include any Committee(s) of the Board or any other persons to whom powers are delegated by the Board as permitted under the Act and/or rules made thereunder), to alter and vary the terms and conditions of appointment, and such other things including but not limited to reason of necessity on account of conditions as may be stipulated by the RBI and / or any other authority. 5. To appoint a director in place of Mr. Keki Mistry (DIN: 00008886), who retires by rotation and, being eligible, offers himself for re-appointment. 6. To appoint M/s. 
Batliboi & Purohit, Chartered Accountants as Joint Statutory Auditors and to fix the overall remuneration of the Joint Statutory Auditors and in this regard, to consider and if thought fit, to pass, the following resolution, as an Ordinary Resolution: “RESOLVED THAT, pursuant to the provisions of Sections 139, 141 and other applicable provisions, if any, of the Companies Act, 2013 (the “Act”) and the relevant RESOLVED FURTHER THAT subject to applicable laws and regulations including the relevant Guidelines and circulars of the RBI (as may be amended, restated, modified or, replaced from time to time) and pursuant to approval of the RBI in this regard received on May 30, 2024, M/s. Price Waterhouse LLP, Chartered Accountants (ICAI Firm Registration No. 301112E/ E300264) (‘Price Waterhouse LLP’) who were already appointed as one of the Joint Statutory Auditors of the Bank at the 28th 0 1"""

# Pipe the prompt into the structured-output model and run it once on the sample text
chain = prompt | llm
response = chain.invoke({"input_text": input_text})

# Output
print(response)  # prints the SummaryOutput repr (summary='...'); use response.summary for the bare string


summary='HDFC Bank Limited is scheduled to hold its 30th Annual General Meeting on August 9, 2024. The meeting will be conducted via video conferencing and will include several business matters such as adopting audited financial statements, considering dividend declarations, appointing directors, and selecting statutory auditors. Batliboi & Purohit and Price Waterhouse LLP have been appointed as joint statutory auditors for a three-year term starting from FY 2024-25. The meeting will also consider resolutions related to these appointments.'


In [2]:
import os
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
load_dotenv()

# Schema for the summarizer's reply: a single required string field, `summary`.
# NOTE(review): this class is passed to `with_structured_output` below, so its docstring
# and the Field description are presumably forwarded to the model as part of the output
# schema — confirm before rewording either string.
class SummaryOutput(BaseModel):
    """Structured output format for the summarizer."""
    summary: str = Field(..., description="Concise summary of the input content.")

# Build the chat client for the fixed "llama3.2" model; endpoint, key and temperature
# (fallback 0) still come from the environment ...
chat_model = ChatOpenAI(
    model="llama3.2",
    base_url=os.getenv("base_url"),
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=float(os.getenv("TEMPERATURE", 0)),
)
# ... then wrap it so that responses come back as SummaryOutput instances.
llm = chat_model.with_structured_output(SummaryOutput)

# System message: fixed summarization instructions with no input variables of its own.
# The braces in the JSON examples are doubled, presumably so that ChatPromptTemplate
# renders them as literal braces instead of treating them as placeholders.
system_prompt = """You are a precise and concise summarization agent.

Your goal is to summarize **any kind of text** — whether it’s a formal financial report, business update, meeting note, press release, or generic content. Your summaries should always be crisp, context-aware, and free of filler.

Rules:
1. If the input includes numbers (financial data, metrics, dates, percentages), **include them exactly** in the summary.
2. If the input contains financial insights, strategy, risks, or leadership commentary — **highlight those clearly**.
3. If the input is administrative or doesn't contain meaningful content, return:
   {{ "summary": "No substantive content available to summarize." }}
4. Do NOT infer or fabricate numbers, people, or insights that are not clearly present.
5. Always respond ONLY in the following JSON format:
   {{ "summary": "..." }}
"""

# Two-message template: the fixed system instructions, then the text to summarize.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input_text}")  # filled from the "input_text" key passed to invoke()
])

input_text = """HDFC BANK LIMITED Registered Office: HDFC Bank House, Senapati Bapat Marg, Lower Parel (W), Mumbai 400 013. [CIN: L65920MH1994PLC080618] [E-Mail: shareholder.grievances@hdfcbank.com] [Website: www.hdfcbank.com] [Tel. Nos.: 022 6631 6000] NOTICE IS HEREBY GIVEN THAT THE THIRTIETH (30TH) ANNUAL GENERAL MEETING (AGM) OF THE MEMBERS OF HDFC BANK LIMITED (THE “BANK”) WILL BE HELD ON FRIDAY, AUGUST 9, 2024 AT 02:30 P.M. INDIAN STANDARD TIME (“IST”). THE AGM SHALL BE HELD BY MEANS OF VIDEO CONFERENCING (“VC”) / OTHER AUDIO-VISUAL MEANS (“OAVM”) IN ACCORDANCE WITH THE RELEVANT CIRCULARS ISSUED BY THE MINISTRY OF CORPORATE AFFAIRS, TO TRANSACT THE FOLLOWING BUSINESS: ORDINARY BUSINESS: 1. To receive, consider and adopt the audited financial statements (standalone) of the Bank for the financial year ended March 31, 2024 along with the Reports of the Board of Directors and Auditors thereon. 2. To receive, consider and adopt the audited financial statements (consolidated) of the Bank for the financial year ended March 31, 2024 along with the Report of Auditors thereon. 3. To consider declaration of dividend on Equity Shares. 4. To appoint a Director in place of Mr. Bhavesh Zaveri (DIN: 01550468), who retires by rotation and being eligible, offers himself for re-appointment. rules made thereunder and pursuant to Section 30 of the Banking Regulation Act, 1949 and the guidelines for Appointment of Statutory Central Auditors (SCAs)/ Statutory Auditors (SAs) of Commercial Banks (excluding RRBs), UCBs and NBFCs (including HFCs) dated April 27, 2021 (“Guidelines”) issued by the Reserve Bank of India (RBI) including any amendments, modifications, variations or re-enactments thereof (collectively “Applicable Laws”) and pursuant to the approval of the RBI dated May 30, 2024, M/s. Batliboi & Purohit, Chartered Accountants, (ICAI Firm Registration No. 
101048W) (“Batliboi & Purohit”), who have offered themselves for appointment and have confirmed their eligibility to be appointed as one of the Joint Statutory Auditors in terms of Section 141 of the Act and applicable rules made thereunder and the Guidelines, be and are hereby appointed as one of the Joint Statutory Auditors of the Bank, to hold office for a period of 3 (three) years with effect from FY 2024-25 till and including FY 2026-27, subject to the approval of the RBI as and when required during this tenure, for the purpose of audit including reporting on internal financial controls of the Bank’s accounts at its head office, branches and other offices, with power to the Board of Directors (hereinafter referred to as the “Board”, which term shall be deemed to include any Committee(s) of the Board or any other persons to whom powers are delegated by the Board as permitted under the Act and/or rules made thereunder), to alter and vary the terms and conditions of appointment, and such other things including but not limited to reason of necessity on account of conditions as may be stipulated by the RBI and / or any other authority. 5. To appoint a director in place of Mr. Keki Mistry (DIN: 00008886), who retires by rotation and, being eligible, offers himself for re-appointment. 6. To appoint M/s. 
Batliboi & Purohit, Chartered Accountants as Joint Statutory Auditors and to fix the overall remuneration of the Joint Statutory Auditors and in this regard, to consider and if thought fit, to pass, the following resolution, as an Ordinary Resolution: “RESOLVED THAT, pursuant to the provisions of Sections 139, 141 and other applicable provisions, if any, of the Companies Act, 2013 (the “Act”) and the relevant RESOLVED FURTHER THAT subject to applicable laws and regulations including the relevant Guidelines and circulars of the RBI (as may be amended, restated, modified or, replaced from time to time) and pursuant to approval of the RBI in this regard received on May 30, 2024, M/s. Price Waterhouse LLP, Chartered Accountants (ICAI Firm Registration No. 301112E/ E300264) (‘Price Waterhouse LLP’) who were already appointed as one of the Joint Statutory Auditors of the Bank at the 28th 0 1"""

# Pipe the prompt into the structured-output model and run it once on the sample text
chain = prompt | llm
response = chain.invoke({"input_text": input_text})

# Output
print(response)  # prints the SummaryOutput repr (summary='...'); use response.summary for the bare string


summary="HDFC Bank Limited's 30th Annual General Meeting (AGM) will be held on Friday, August 9, 2024, at 02:30 PM IST via video conferencing. Key agenda items include adopting audited financial statements for FY 2024, declaring dividend on equity shares, appointing a director in place of Mr. Bhavesh Zaveri and Mr. Keki Mistry, and appointing M/s. Batliboi & Purohit as Joint Statutory Auditors with an overall remuneration to be determined."


In [3]:
# Show only the summary string of the structured response produced by the previous cell.
response.summary

"HDFC Bank Limited's 30th Annual General Meeting (AGM) will be held on Friday, August 9, 2024, at 02:30 PM IST via video conferencing. Key agenda items include adopting audited financial statements for FY 2024, declaring dividend on equity shares, appointing a director in place of Mr. Bhavesh Zaveri and Mr. Keki Mistry, and appointing M/s. Batliboi & Purohit as Joint Statutory Auditors with an overall remuneration to be determined."

KeyboardInterrupt: 

In [None]:
import json
# Load every per-document JSON file and flatten the page records into one list.
dataset_path = "dataset/json"
datasets = [os.path.join(dataset_path, i) for i in os.listdir(dataset_path) if i.endswith(".json")]

all_chunks = []
for fpath in datasets:
    # Explicit UTF-8: the extraction step writes these files as UTF-8, and open()'s
    # default encoding depends on the platform locale.
    with open(fpath, encoding="utf-8") as f:
        all_chunks.extend(json.load(f))


FileNotFoundError: [Errno 2] No such file or directory: 'data/hdfc_2024.json'