In [8]:
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter

# ---- 1. Extract Text from PDF using PyMuPDF ---- #
def extract_text_blocks(pdf_path):
    """
    Extracts block-wise text from each page of the PDF for structured processing.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list with one dictionary per page, in page order:
        ``{"page_num": <1-based page number>, "text": <page text>}``.
        The page text is the stripped, non-empty text blocks joined with
        newlines; a page without any text yields an empty string.
    """
    full_text = []
    # The context manager closes the document (and its file handle) even if
    # extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            # Each block is a tuple:
            # (x0, y0, x1, y1, text, block_no, block_type).
            blocks = page.get_text("blocks")
            # Keep text blocks only (block_type 0); image blocks carry an
            # image-metadata placeholder instead of document text.
            text_blocks = [b for b in blocks if b[6] == 0]
            # Sort by Y (top-down), then X (left-right)
            text_blocks.sort(key=lambda b: (b[1], b[0]))
            stripped = (b[4].strip() for b in text_blocks)
            page_text = "\n".join(s for s in stripped if s)
            full_text.append({"page_num": page_num, "text": page_text})
    return full_text


# ---- 2. Chunk Text using RecursiveCharacterTextSplitter ---- #
# Shared splitter used by chunk_pdf_text. No length_function is passed, so
# sizes are measured with the splitter's default (character count per the
# LangChain docs) — i.e. these limits are characters, not tokens.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,           # Maximum chunk length (characters, not tokens)
    chunk_overlap=100,        # Overlap for context preservation
    separators=["\n\n", "\n", ".", " ", ""]  # Priority of split points
)

def chunk_pdf_text(pages, bank_name, year):
    """
    Splits every page's text with the module-level ``text_splitter`` and
    wraps each resulting piece in a record carrying its provenance.

    Args:
        pages: Iterable of dicts with ``"page_num"`` and ``"text"`` keys.
        bank_name: Label stored in the metadata and used in the chunk ID.
        year: Label stored in the metadata and used in the chunk ID.

    Returns:
        A flat list of ``{"content": ..., "metadata": {...}}`` dicts in page
        order; the chunk counter in ``chunk_id`` restarts at 0 on every page.
    """
    return [
        {
            "content": piece,
            "metadata": {
                "page": page["page_num"],
                "bank": bank_name,
                "year": year,
                "chunk_id": f"{bank_name}_{year}_p{page['page_num']}_c{idx}",
            },
        }
        for page in pages
        for idx, piece in enumerate(text_splitter.split_text(page["text"]))
    ]


In [9]:
import os
# Directory holding the source PDFs (relative to the notebook's working directory).
dataset_path = "dataset/pdfs"
# os.listdir returns entries in arbitrary order; sort so that datasets[0]
# refers to the same file on every run and every machine.
datasets = sorted(
    os.path.join(dataset_path, i) for i in os.listdir(dataset_path) if i.endswith(".pdf")
)
print(datasets[0])


dataset/pdfs/sbi.pdf


In [10]:
# Derive the bank label from the file that is actually processed, so the chunk
# metadata cannot drift from the chosen PDF (e.g. "dataset/pdfs/sbi.pdf" -> "sbi").
pdf_path = datasets[0]
bank_name = os.path.splitext(os.path.basename(pdf_path))[0]
pages = extract_text_blocks(pdf_path)
d = chunk_pdf_text(pages, bank_name, "2024")  # report year is still set by hand

In [12]:
# Scratch cell: prints a hard-coded, whitespace-normalised copy of what looks like
# the page-1 cover letter, presumably for comparison with the raw chunk text
# shown further down — TODO confirm purpose or delete this cell.
print("The Listing Department, BSE Limited, Phiroje Jeejeebhoy Towers, 25th Floor, Dalal Street, Mumbai – 400001\nThe Listing Department, National Stock Exchange of India Limited, Exchange Plaza, 5th Floor, C / 1, ‘G’ Block, Bandra Kurla Complex, Bandra (East), Mumbai – 400051\nBSE SCRIP Code: 500112\nNSE SCRIP Code: SBIN\nCC/S&B/AND/2024-25/130\n27.05.2024\nMadam/ Sir,\nDisclosure under Regulation 34 (1) of SEBI (LODR) Regulations, 2015: Submission of Annual Report of the Bank for the financial year 2023-24\nPursuant to Regulation 34 (1) and other applicable provisions of the SEBI (Listing Obligations and Disclosure Requirements) Regulations, 2015, we submit the copy of Annual Report of the Bank for the FY 2023-24 including Business Responsibility and Sustainability Report (BRSR), along with Notice of 69th Annual General Meeting (AGM) of the Bank scheduled to be held on Wednesday, the 19th June, 2024 at 3.00 P.M. through Video Conferencing (VC) / Other Audio-Visual Means (OAVM).\nThe Ministry of Corporate Affairs (“MCA”) and the Securities and Exchange Board of India (“SEBI”) have granted exemptions regarding the requirement to send physical copy of the annual report and notice of meeting to shareholders, through their respective Circulars. Thus, the Annual Report for FY 2023-24 including BRSR and the Notice of 69th AGM is being sent through electronic mode today to those Members whose e-mail addresses are registered with the Bank/ Registrar and Transfer Agent / Depositories.\nThe Annual Report for FY 2023-24 is also uploaded on the Bank’s website and can be accessed at https://sbi.co.in/web/investor-relations/annual-report.\nThis is for your information and record.\nYours faithfully,\n(Aruna N Dak) DGM (Compliance & Company Secretary) Encl: A/a",)

The Listing Department, BSE Limited, Phiroje Jeejeebhoy Towers, 25th Floor, Dalal Street, Mumbai – 400001
The Listing Department, National Stock Exchange of India Limited, Exchange Plaza, 5th Floor, C / 1, ‘G’ Block, Bandra Kurla Complex, Bandra (East), Mumbai – 400051
BSE SCRIP Code: 500112
NSE SCRIP Code: SBIN
CC/S&B/AND/2024-25/130
27.05.2024
Madam/ Sir,
Disclosure under Regulation 34 (1) of SEBI (LODR) Regulations, 2015: Submission of Annual Report of the Bank for the financial year 2023-24
Pursuant to Regulation 34 (1) and other applicable provisions of the SEBI (Listing Obligations and Disclosure Requirements) Regulations, 2015, we submit the copy of Annual Report of the Bank for the FY 2023-24 including Business Responsibility and Sustainability Report (BRSR), along with Notice of 69th Annual General Meeting (AGM) of the Bank scheduled to be held on Wednesday, the 19th June, 2024 at 3.00 P.M. through Video Conferencing (VC) / Other Audio-Visual Means (OAVM).
The Ministry of Corp

In [18]:
# Preview only the first few chunks; displaying the whole list floods the
# notebook output and bloats the saved file.
d[:3]

[{'content': 'The Listing Department,   \nBSE Limited,  \nPhiroje Jeejeebhoy Towers,  \n25th Floor, Dalal Street,  \nMumbai – 400001\nThe Listing Department, \nNational Stock Exchange of India Limited,  \nExchange Plaza, 5th Floor, C / 1, ‘G’ Block,  \nBandra Kurla Complex, Bandra (East),  \nMumbai – 400051\nBSE SCRIP Code: 500112 \nNSE SCRIP Code: SBIN\nCC/S&B/AND/2024-25/130 \n                                                                   27.05.2024  \n  \nMadam/ Sir, \n \nDisclosure under Regulation 34 (1) of SEBI (LODR) Regulations, 2015: \nSubmission of Annual Report of the Bank for the financial year 2023-24  \n \nPursuant to Regulation 34 (1) and other applicable provisions of the SEBI (Listing \nObligations and Disclosure Requirements) Regulations, 2015, we submit the copy of',
  'metadata': {'page': 1,
   'bank': 'sbi',
   'year': '2024',
   'chunk_id': 'sbi_2024_p1_c0'}},
 {'content': 'Obligations and Disclosure Requirements) Regulations, 2015, we submit the copy of \nAnn