In [11]:
import os
import pdfplumber
import pandas as pd

def extract_folder_pdf_metadata(pdf_folder: str) -> pd.DataFrame:
    """
    Walk through all .pdf files in `pdf_folder`, open each with pdfplumber,
    extract metadata and number of pages, and return a DataFrame where each row
    corresponds to one PDF file.
    """
    records = []

    # Loop over every file in the directory
    for fname in os.listdir(pdf_folder):
        if not fname.lower().endswith(".pdf"):
            continue  # skip non-PDF files

        full_path = os.path.join(pdf_folder, fname)
        try:
            with pdfplumber.open(full_path) as pdf:
                # pdf.metadata is a dict (keys depend on what’s stored in PDF)
                meta = pdf.metadata or {}
                print(meta)
                # Number of pages in this PDF
                num_pages = len(pdf.pages)

                # Build a flat record for this PDF
                record = {
                    "filename":                 fname,
                    "full_path":                full_path,
                    "num_pages":                num_pages,
                    # pdfplumber’s metadata often includes keys like:
                    #   "Title", "Author", "Subject", "Producer", "CreationDate", etc.
                    # We’ll pull out a few common ones, falling back to None if missing.
                    "title":                    meta.get("Title"),
                    "author":                   meta.get("Author"),
                    "subject":                  meta.get("Subject"),
                    "producer":                 meta.get("Producer"),
                    "creation_date":            meta.get("CreationDate"),
                    "modification_date":        meta.get("ModDate") or meta.get("ModDate"),
                    # If there are any other keys you care about, you can add them here.
                }

                records.append(record)

        except Exception as e:
            # If pdfplumber fails to open or parse this file, record the error instead of metadata
            records.append({
                "filename":          fname,
                "full_path":         full_path,
                "num_pages":         None,
                "title":             None,
                "author":            None,
                "subject":           None,
                "producer":          None,
                "creation_date":     None,
                "modification_date": None,
                "error":             str(e),
            })

    # Convert the list of dicts into one DataFrame
    df = pd.DataFrame(records)
    return df


if __name__ == "__main__":
    # Change this path to the folder where your bank statement PDFs live
    PDF_FOLDER = "/Users/if658228/Downloads/OneDrive_1_5-20-2025/agentic_extraction/Dataset04/BBL"

    # Extract metadata for every PDF in that folder
    metadata_df = extract_folder_pdf_metadata(PDF_FOLDER)

    # Show the first few rows
    #print(metadata_df.head())

    # Optionally: save to CSV so you can inspect it later
    #metadata_df
    #print(f"\nSaved metadata summary to: {output_csv}")


{'Producer': 'iText® 5.4.1 ©2000-2012 1T3XT BVBA (AGPL-version); modified using iText® 5.5.12 ©2000-2017 iText Group NV (AGPL-version)', 'ModDate': "D:20230817101502+07'00'"}
{'Producer': 'iText® 5.4.1 ©2000-2012 1T3XT BVBA (AGPL-version); modified using iText® 5.5.12 ©2000-2017 iText Group NV (AGPL-version)', 'ModDate': "D:20230613030617+07'00'"}
{'Producer': 'iText® 5.4.1 ©2000-2012 1T3XT BVBA (AGPL-version); modified using iText® 5.5.12 ©2000-2017 iText Group NV (AGPL-version)', 'ModDate': "D:20230811094215+07'00'"}
{'Producer': 'iText® 5.4.1 ©2000-2012 1T3XT BVBA (AGPL-version); modified using iText® 5.5.12 ©2000-2017 iText Group NV (AGPL-version)', 'ModDate': "D:20230822144814+07'00'"}
{'Producer': 'iText® 5.4.1 ©2000-2012 1T3XT BVBA (AGPL-version); modified using iText® 5.5.12 ©2000-2017 iText Group NV (AGPL-version)', 'ModDate': "D:20230818154622+07'00'"}
{'Producer': 'iText® 5.4.1 ©2000-2012 1T3XT BVBA (AGPL-version); modified using iText® 5.5.12 ©2000-2017 iText Group NV (AGPL

In [9]:
import os
import pdfplumber
import pandas as pd
from typing import Optional

def extract_pdf_metadata(pdf_path: str, password: Optional[str] = None) -> pd.DataFrame:
    """
    Open a single PDF file at `pdf_path` (using `password` if provided),
    extract metadata and number of pages, and return a DataFrame with one row.
    """
    record = {
        "filename":           os.path.basename(pdf_path),
        "full_path":          pdf_path,
        "num_pages":          None,
        "title":              None,
        "author":             None,
        "subject":            None,
        "producer":           None,
        "creation_date":      None,
        "modification_date":  None,
        "error":              None,
    }

    try:
        with pdfplumber.open(pdf_path, password=password) as pdf:
            meta = pdf.metadata or {}
            num_pages = len(pdf.pages)
            print(meta)

            record.update({
                "num_pages":         num_pages,
                "title":             meta.get("Title"),
                "author":            meta.get("Author"),
                "subject":           meta.get("Subject"),
                "producer":          meta.get("Producer"),
                "creation_date":     meta.get("CreationDate"),
                "modification_date": meta.get("ModDate"),
            })

    except Exception as e:
        record["error"] = str(e)

    return pd.DataFrame([record]),meta


if __name__ == "__main__":
    # Example usage: change this to the path of your single PDF file
    PDF_FILE = "/Users/if658228/Desktop/ktb/pdf-statement-extraction/STM_SA2641_01JUN24_30JUN24.pdf"
    PASSWORD = "08061998"  # or set to the PDF’s password, if it’s protected

    metadata_df,meta = extract_pdf_metadata(PDF_FILE, password=PASSWORD)
    print(metadata_df)



{'ModDate': "D:20250528084502+07'00'\t\t\t\t\t\t\t\t\t", 'Creator': 'JasperReports Library version 6.3.0\r\r\r\r\r\r\r\r\r\r\r\r\r', 'CreationDate': "D:20250528084502+07'00'\t\t\t\t\t\t\t\t\t", 'Producer': 'iText 2.1.7 by 1T3XT; modified using iText® 5.5.7 ©2000-2015 iText Group NV (AGPL-version)\x06\x06\x06\x06\x06\x06'}
                         filename  \
0  STM_SA2641_01JUN24_30JUN24.pdf   

                                           full_path  num_pages title author  \
0  /Users/if658228/Desktop/ktb/pdf-statement-extr...          1  None   None   

  subject                                           producer  \
0    None  iText 2.1.7 by 1T3XT; modified using iText® 5....   

                               creation_date  \
0  D:20250528084502+07'00'\t\t\t\t\t\t\t\t\t   

                           modification_date error  
0  D:20250528084502+07'00'\t\t\t\t\t\t\t\t\t  None  


In [10]:
meta

{'ModDate': "D:20250528084502+07'00'\t\t\t\t\t\t\t\t\t",
 'Creator': 'JasperReports Library version 6.3.0\r\r\r\r\r\r\r\r\r\r\r\r\r',
 'CreationDate': "D:20250528084502+07'00'\t\t\t\t\t\t\t\t\t",
 'Producer': 'iText 2.1.7 by 1T3XT; modified using iText® 5.5.7 ©2000-2015 iText Group NV (AGPL-version)\x06\x06\x06\x06\x06\x06'}

In [11]:
metadata_df

Unnamed: 0,filename,full_path,num_pages,title,author,subject,producer,creation_date,modification_date,error
0,STM_SA2641_01JUN24_30JUN24.pdf,/Users/if658228/Desktop/ktb/pdf-statement-extr...,1,,,,iText 2.1.7 by 1T3XT; modified using iText® 5....,D:20250528084502+07'00'\t\t\t\t\t\t\t\t\t,D:20250528084502+07'00'\t\t\t\t\t\t\t\t\t,


In [13]:
page.chars

[{'matrix': (1, 0, 0, 1, 413.0, 710.27),
  'fontname': 'LMUQZF+TFSrivichai',
  'adv': 4.42,
  'upright': True,
  'x0': 413.0,
  'y0': 705.9399999999999,
  'x1': 417.42,
  'y1': 715.9399999999999,
  'width': 4.420000000000016,
  'height': 10.0,
  'size': 10.0,
  'object_type': 'char',
  'page_number': 1,
  'ncs': 'DeviceRGB',
  'text': 'B',
  'stroking_color': None,
  'stroking_pattern': None,
  'non_stroking_color': (0, 0, 0),
  'non_stroking_pattern': None,
  'top': 147.06000000000006,
  'bottom': 157.06000000000006,
  'doctop': 147.06000000000006},
 {'matrix': (1, 0, 0, 1, 417.42, 710.27),
  'fontname': 'LMUQZF+TFSrivichai',
  'adv': 3.4400000000000004,
  'upright': True,
  'x0': 417.42,
  'y0': 705.9399999999999,
  'x1': 420.86,
  'y1': 715.9399999999999,
  'width': 3.4399999999999977,
  'height': 10.0,
  'size': 10.0,
  'object_type': 'char',
  'page_number': 1,
  'ncs': 'DeviceRGB',
  'text': 'a',
  'stroking_color': None,
  'stroking_pattern': None,
  'non_stroking_color': (0, 0,

In [15]:
import pdfplumber


def get_unique_fonts(pdf_path: str, password: str = None) -> list:
    """
    Open the PDF (using the given password, if any) and return a list
    of all unique font names found in its characters.
    """
    fonts = set()
    with pdfplumber.open(pdf_path, password=password) as pdf:
        for page in pdf.pages:
            for ch in page.chars:
                fontname = ch.get("fontname")
                if fontname:
                    # Strip off any subset-prefix (e.g. "ABCDEE+ArialMT" → "ArialMT")
                    if "+" in fontname:
                        fontname = fontname.split("+", 1)[1]
                    fonts.add(fontname)
    return sorted(fonts)


# Usage:
pdf_file = "/Users/if658228/Desktop/ktb/pdf-statement-extraction/STM_SA2641_01JUN24_30JUN24.pdf"
pwd = '08061998'
unique_fonts = get_unique_fonts(pdf_file, pwd)
print(unique_fonts)


['TFSrivichai', 'TFSrivichai-Bold']


In [None]:
import os
from typing import Optional, Dict, Any

import pdfplumber


class PDFMetadataExtractor:
    """
    A class with a single method to extract metadata and page count from a PDF using pdfplumber.
    """

    @staticmethod
    def extract_metadata(pdf_path: str, password: Optional[str] = None) -> Dict[str, Any]:
        """
        Extract metadata and page count from a single PDF file.

        Parameters
        ----------
        pdf_path : str
            Full path to the PDF file.
        password : Optional[str]
            Password for opening the PDF, if encrypted. Defaults to None.

        Returns
        -------
        Dict[str, Any]
            A dictionary containing:
            - "filename": Name of the PDF file
            - "full_path": Absolute path to the PDF
            - "num_pages": Number of pages in the PDF
            - Any metadata fields present in the PDF (e.g., "Title", "Author", etc.)
            - An "error" key if opening/parsing the PDF fails.
        """
        record: Dict[str, Any] = {
            "filename": os.path.basename(pdf_path),
            "full_path": os.path.abspath(pdf_path),
            "num_pages": None,
        }

        try:
            with pdfplumber.open(pdf_path, password=password) as pdf:
                meta = pdf.metadata or {}
                record["num_pages"] = len(pdf.pages)
                for key, value in meta.items():
                    record[key] = value

        except Exception as e:
            record["error"] = str(e)

        return record
