In [1]:
# config.py

import re  # Required to compile and use regular expressions for pattern matching
from typing import Optional, Tuple, List, Dict
# PDF source settings
PDF_PATH: Optional[str] = None
PASSWORD: Optional[str] = None
# ------------------------------------------------------------------------------
# Path / Password
# ------------------------------------------------------------------------------
PDF_PATH: str = "/Users/if658228/Downloads/OneDrive_1_5-20-2025/agentic_extraction/Dataset04/BAY/108988-02054592-2566_1_BAY.pdf"
PASSWORD: str | None = None

# ------------------------------------------------------------------------------
# Page-ID Cropping Boxes (try multiple to guard against minor layout shifts)
# ------------------------------------------------------------------------------
PAGE_ID_CROPS: list[tuple[float, float, float, float]] = [
    (550.7, 23.6, 600.0, 40.3),
    (550.7, 23.6, 594.0, 40.3),
]

# ------------------------------------------------------------------------------
# Header Fields: bounding boxes for account_name, account_number, period
# ------------------------------------------------------------------------------
HEADER_CROPS: dict[str, tuple[float, float, float, float]] = {
    "account_name":   (370.0, 125.9, 500.2, 132.9),
    "account_number": (384.0, 137.9, 438.5, 144.9),
    "period":         (384.0, 161.9, 500.4, 168.9),
}

# ------------------------------------------------------------------------------
# Footer Keywords (English / Thai) to detect total‐withdrawal / total‐deposit lines
# ------------------------------------------------------------------------------
FOOTER_KEYWORDS_HEADER: list[str] = [
    "Total Withdrawal", "Total Deposit",
    "รายการถอนเงิน", "รายการฝากเงิน"
]

# ------------------------------------------------------------------------------
# pdfplumber Table‐Finding Settings (vertical/horizontal strategy, tolerance)
# ------------------------------------------------------------------------------
TABLE_SETTINGS: dict[str, str | int] = {
    "vertical_strategy":    "lines",
    "horizontal_strategy":  "lines",
    "intersection_tolerance": 1,
}

# ------------------------------------------------------------------------------
# Regex Patterns for Date, Time, Money (strings only; compilation happens in main code)
# ------------------------------------------------------------------------------
DATE_PATTERN: str = r"^\d{2}/\d{2}/\d{4}$"
TIME_PATTERN: str = r"^\d{2}:\d{2}:\d{2}$"
MONEY_PATTERN: str = r"^[\d,]+\.\d{2}$"

# ------------------------------------------------------------------------------
# X-coordinate splits for columns (all values in PDF points)
# ------------------------------------------------------------------------------
DATE_COLUMN_X0: float = 1.0
DATE_COLUMN_X1: float = 30.0

CODE_CHANNEL_SPLIT_X: float = 120.0
CHANNEL_SPLIT_X: float = 450.0
CHANNEL_DC_SPLIT_X: float = 200.0
WITHDRAW_DEPOSIT_SPLIT_X: float = 278.0
DC_BALANCE_SPLIT_X: float = 320.0
BALANCE_DESCRIPTION_SPLIT_X: float = 400.0

# ------------------------------------------------------------------------------
# Tolerances and Margins
# ------------------------------------------------------------------------------
X_TOLERANCE: float = 1.0
Y_MARGIN: float = 3.0

# ------------------------------------------------------------------------------
# Footer Keywords for transaction table (so we can crop off the bottom totals)
# ------------------------------------------------------------------------------
TABLE_FOOTER_KEYWORDS: list[str] = ["รายการถอนเงิน", "Total Withdrawal"]
TABLE_FOOTER_MARGIN: float = 3.0


In [1]:
# extractor.py

import re
import pdfplumber
import pandas as pd

from typing import (
    Optional,
    Dict,
    Tuple,
    List,
)

import config

class BayStatementExtractor:
    """
    Encapsulates extraction of headers and transactions from a BAY bank-statement PDF.
    """

    def __init__(self, pdf_path: str = config.PDF_PATH, password: Optional[str] = config.PASSWORD):
        """
        Remember where the statement PDF lives and prepare the token matchers.

        Args:
            pdf_path: Path of the PDF to read. The default is the value
                ``config.PDF_PATH`` had when this class was defined.
            password: Password passed to ``pdfplumber.open``. The default is the
                value ``config.PASSWORD`` had when this class was defined.
        """
        self.pdf_path, self.password = pdf_path, password

        # Compile each pattern a single time; the per-word loops reuse them.
        self._date_regex, self._time_regex, self._money_regex = map(
            re.compile,
            (config.DATE_PATTERN, config.TIME_PATTERN, config.MONEY_PATTERN),
        )

    @staticmethod
    def clean_float_column(series: pd.Series) -> pd.Series:
        """
        Strip non-numeric characters from a column of strings, and convert to float.
        """
        cleaned = series.astype(str).apply(lambda s: re.sub(r"[^0-9\.]", "", s))
        return pd.to_numeric(cleaned, errors="coerce")

    @staticmethod
    def clean_page_id(raw_page_id: str) -> str:
        """
        Standardize a raw page-id string into "N/M" format.
        """
        numeric_parts = re.findall(r"\d+", raw_page_id)
        if len(numeric_parts) >= 2:
            candidate = f"{numeric_parts[0]}/{numeric_parts[1]}"
            if re.fullmatch(r"\d+/\d+", candidate):
                return candidate
        return ""

    def extract_page_id(self, page: pdfplumber.page.Page) -> str:
        """
        Read the "N/M" page id of ``page``.

        Every box in ``config.PAGE_ID_CROPS`` is tried in order; the first one
        whose text normalises to a non-empty id wins. A box that raises is
        reported with ``print`` and the next box is tried.

        Returns:
            The page id as "N/M", or "" when no box produced one.
        """
        for crop_box in config.PAGE_ID_CROPS:
            try:
                cropped_text = page.crop(crop_box).extract_text()
                normalised_id = self.clean_page_id((cropped_text or "").strip())
            except Exception as exc:
                print(f"⚠️ Error extracting page ID from page {getattr(page, 'page_number', '?')}: {exc}")
            else:
                if normalised_id:
                    return normalised_id

        # Every candidate box was either empty, unparsable, or failed.
        return ""

    def extract_header_dataframe(self) -> pd.DataFrame:
        """
        Build one header record per PDF page and return them as a DataFrame.

        Columns (always present, in this order):
            page_id, one column per key of ``config.HEADER_CROPS``,
            total_withdrawal_transaction, total_withdrawal,
            total_deposit_transaction, total_deposit.
        When a ``period`` column exists and at least one page was read, two
        extra datetime columns ``start_period`` / ``end_period`` are appended.

        Fields that could not be read are None. The total_* columns used to be
        created only when a totals line was found, which left them missing
        altogether for PDFs without such a line; they are now always created.

        Pages that raise are reported with ``print`` and skipped.
        """
        # (line prefixes, column for the first number, column for the second number)
        footer_line_specs = (
            (("Total Withdrawal", "รายการถอนเงิน"), "total_withdrawal_transaction", "total_withdrawal"),
            (("Total Deposit", "รายการฝากเงิน"), "total_deposit_transaction", "total_deposit"),
        )
        expected_columns: List[str] = ["page_id", *config.HEADER_CROPS]
        for _, first_column, second_column in footer_line_specs:
            expected_columns += [first_column, second_column]

        header_records: List[Dict[str, Optional[str]]] = []

        with pdfplumber.open(self.pdf_path, password=self.password) as pdf_document:
            for page_index, page in enumerate(pdf_document.pages, start=1):
                try:
                    page_id = self.extract_page_id(page)
                    full_text = page.extract_text() or ""
                    lowered_text = full_text.lower()  # lower-cased once, reused by both checks

                    footer_present = any(
                        kw.lower() in lowered_text
                        for kw in config.FOOTER_KEYWORDS_HEADER
                    )
                    # Only read account name / number / period when the page shows
                    # one of the Thai or English header labels:
                    has_header_info = any(
                        kw.lower() in lowered_text
                        for kw in ["ชื่อบัญชี", "Account No."]
                    )

                    # Start with every expected key so all records share one schema.
                    header_record: Dict[str, Optional[str]] = dict.fromkeys(expected_columns)
                    header_record["page_id"] = page_id

                    for field_name, bbox in config.HEADER_CROPS.items():
                        try:
                            if has_header_info:
                                raw_field_text = page.crop(bbox).extract_text() or ""
                                header_record[field_name] = raw_field_text.strip().replace("\n", " ")
                            else:
                                header_record[field_name] = None
                        except Exception as exc:
                            print(f"⚠️ Error extracting header '{field_name}' on page {page_index}: {exc}")
                            header_record[field_name] = None

                    # Totals lines: the first number found goes to the *_transaction
                    # column, the second to the amount column.
                    if footer_present:
                        for line in full_text.splitlines():
                            try:
                                for prefixes, first_column, second_column in footer_line_specs:
                                    if line.startswith(prefixes):
                                        numbers = re.findall(r"[\d,]+(?:\.\d{2})?", line)
                                        header_record[first_column] = (
                                            numbers[0].replace(",", "") if len(numbers) > 0 else None
                                        )
                                        header_record[second_column] = (
                                            numbers[1].replace(",", "") if len(numbers) > 1 else None
                                        )
                                        break
                            except Exception as exc:
                                print(f"⚠️ Error extracting footer totals on page {page_index}: {exc}")

                    header_records.append(header_record)

                except Exception as exc:
                    print(f"⚠️ Skipping header extraction on page {page_index} due to: {exc}")
                    continue

        # Passing `columns` keeps the schema stable even when no page was read.
        header_dataframe = pd.DataFrame(header_records, columns=expected_columns)

        # Split "start - end" into two datetime columns.
        if "period" in header_dataframe.columns and not header_dataframe.empty:
            period_series = header_dataframe["period"].fillna("").str.replace(" ", "")
            # n=1 yields at most two parts; reindex guarantees both columns exist
            # even when no separator was found on any page.
            split_period = (
                period_series.str.split(r"[-–]", n=1, expand=True).reindex(columns=[0, 1])
            )
            header_dataframe["start_period"] = pd.to_datetime(
                split_period[0], dayfirst=True, errors="coerce"
            )
            header_dataframe["end_period"] = pd.to_datetime(
                split_period[1], dayfirst=True, errors="coerce"
            )

        return header_dataframe

    def extract_transaction_dataframe(self) -> pd.DataFrame:
        """
        Read every page of the PDF and return one row per detected transaction.

        Per page: find table regions with ``config.TABLE_SETTINGS`` (falling
        back to the whole page when none are found), cut the region off just
        above the first totals-footer line, then treat every date token that
        starts inside the date column as the beginning of a row and parse the
        words of that row into fields.

        Returns:
            DataFrame with columns page_id, date, time, code, channel,
            withdrawal, deposit, balance, description. The amount columns are
            numeric; ``date`` is converted to datetime when at least one row
            was found. Values that cannot be parsed become NaN / NaT.

        Pages, regions or crops that raise are reported with ``print`` and
        skipped.
        """
        expected_columns = [
            "page_id", "date", "time", "code", "channel",
            "withdrawal", "deposit", "balance", "description"
        ]
        transaction_records: List[Dict[str, object]] = []

        with pdfplumber.open(self.pdf_path, password=self.password) as pdf_document:
            for page_index, page in enumerate(pdf_document.pages, start=1):
                try:
                    page_id = self.extract_page_id(page)

                    # Attempt to find tables via pdfplumber
                    try:
                        tables_on_page = page.find_tables(config.TABLE_SETTINGS)
                    except Exception as exc:
                        print(f"⚠️ Error finding tables on page {page_index}: {exc}")
                        tables_on_page = []

                    # If tables exist, crop each region; else use the entire page
                    if tables_on_page:
                        regions = []
                        for table_obj in tables_on_page:
                            try:
                                regions.append(page.crop(table_obj.bbox))
                            except Exception as exc:
                                # Keep going with the remaining tables.
                                print(f"⚠️ Error cropping table region on page {page_index}: {exc}")
                    else:
                        regions = [page]

                    for region in regions:
                        try:
                            words = region.extract_words(use_text_flow=True)
                        except Exception as exc:
                            print(f"⚠️ Error extracting words on page {page_index}: {exc}")
                            continue

                        # Bucket words into coarse lines, only to locate footer lines.
                        row_buckets: Dict[int, List[dict]] = {}
                        for word in words:
                            row_key = int(word["top"] // config.Y_MARGIN)
                            row_buckets.setdefault(row_key, []).append(word)

                        footer_y_positions = [
                            min(w["top"] for w in one_row)
                            for one_row in row_buckets.values()
                            if any(
                                kw in " ".join(w["text"] for w in one_row)
                                for kw in config.TABLE_FOOTER_KEYWORDS
                            )
                        ]

                        if footer_y_positions:
                            cutoff = min(footer_y_positions) - config.TABLE_FOOTER_MARGIN
                            region_x0, region_top, region_x1, region_bottom = region.bbox
                            # Word "top" values and `cutoff` are page coordinates, so
                            # the bounds check and the crop use the region's absolute
                            # box. The earlier relative crop only lined up when the
                            # region started at the page origin.
                            if region_top < cutoff < region_bottom:
                                try:
                                    cropped_region = region.crop(
                                        (region_x0, region_top, region_x1, cutoff)
                                    )
                                    words = cropped_region.extract_words(use_text_flow=True)
                                except Exception as exc:
                                    # Fall back to the un-trimmed word list.
                                    print(f"⚠️ Error cropping footer on page {page_index}: {exc}")

                        # Each date token that starts inside the date column opens a row.
                        sorted_tops = sorted(
                            w["top"]
                            for w in words
                            if self._date_regex.match(w["text"]) and
                            config.DATE_COLUMN_X0 <= w["x0"] <= config.DATE_COLUMN_X1
                        )
                        if not sorted_tops:
                            # No date-like words → nothing to parse in this region
                            continue

                        # A row spans from just above its date to just above the next one.
                        intervals: List[Tuple[float, float]] = []
                        for idx_top, y_val in enumerate(sorted_tops):
                            start_y = y_val - config.Y_MARGIN
                            if idx_top + 1 < len(sorted_tops):
                                end_y = sorted_tops[idx_top + 1] - config.Y_MARGIN
                            else:
                                # Last row: reuse the distance to the previous row as its
                                # height (or 2 * Y_MARGIN when it is the only row).
                                previous_y = (
                                    sorted_tops[idx_top - 1]
                                    if idx_top > 0
                                    else (y_val - 2 * config.Y_MARGIN)
                                )
                                end_y = y_val + (y_val - previous_y) - config.Y_MARGIN

                            intervals.append((start_y, end_y))

                        # Assign every word to the first interval containing its top edge.
                        rows_of_words: List[List[dict]] = [[] for _ in intervals]
                        for word in words:
                            for interval_index, (start_y, end_y) in enumerate(intervals):
                                if start_y <= word["top"] < end_y:
                                    rows_of_words[interval_index].append(word)
                                    break

                        for row_words in rows_of_words:
                            if not row_words:
                                continue

                            # Sort by (top, x0) to get a left→right reading order
                            row_words_sorted = sorted(row_words, key=lambda w: (w["top"], w["x0"]))
                            row_text_combined = " ".join(w["text"] for w in row_words_sorted)

                            # Skip total-amount lines inside the table
                            if any(lbl in row_text_combined for lbl in ("TOTAL AMOUNTS", "TOTAL ITEMS")):
                                continue

                            transaction_records.append({
                                "page_id": page_id,
                                **self._parse_transaction_row(row_words_sorted, page_index),
                            })

                except Exception as exc:
                    print(f"⚠️ Skipping transactions on page {page_index} due to: {exc}")
                    continue

        # `columns` fixes both the column order and the schema of an empty result.
        transaction_dataframe = pd.DataFrame(transaction_records, columns=expected_columns)

        for amount_column in ("withdrawal", "deposit", "balance"):
            transaction_dataframe[amount_column] = pd.to_numeric(
                transaction_dataframe[amount_column], errors="coerce"
            )

        # Parse dates into datetime (if any)
        if not transaction_dataframe.empty:
            transaction_dataframe["date"] = pd.to_datetime(
                transaction_dataframe["date"],
                format="%d/%m/%Y",
                errors="coerce"
            )

        return transaction_dataframe

    def _parse_transaction_row(self, row_words_sorted: List[dict], page_index: int) -> Dict[str, object]:
        """
        Turn the reading-ordered words of one transaction row into field values.

        Args:
            row_words_sorted: Word dicts of the row (keys used: "text", "x0", "x1").
            page_index: 1-based page number, used only in warning messages.

        Returns:
            Dict with keys date, time, code, channel, withdrawal, deposit,
            balance, description. Missing texts are "", missing amounts None.
        """
        date_text = next(
            (w["text"] for w in row_words_sorted if self._date_regex.match(w["text"])),
            ""
        )
        time_text = next(
            (w["text"] for w in row_words_sorted if self._time_regex.match(w["text"])),
            ""
        )

        code_parts: List[str] = []
        channel_parts: List[str] = []
        description_parts: List[str] = []
        dc_word_candidates: List[dict] = []
        balance_word_candidates: List[dict] = []

        for w in row_words_sorted:
            text_token = w["text"]
            x0 = w["x0"]

            if self._date_regex.match(text_token) or self._time_regex.match(text_token):
                continue

            if self._money_regex.match(text_token):
                # Amounts are routed by where they start; amounts outside both
                # bands are ignored.
                if config.CHANNEL_DC_SPLIT_X <= x0 <= config.DC_BALANCE_SPLIT_X:
                    dc_word_candidates.append(w)
                elif config.DC_BALANCE_SPLIT_X <= x0 <= config.BALANCE_DESCRIPTION_SPLIT_X:
                    balance_word_candidates.append(w)
                continue

            # Remaining text is split into code / channel / description by x0.
            if x0 <= config.CODE_CHANNEL_SPLIT_X + config.X_TOLERANCE:
                code_parts.append(text_token)
            elif x0 <= config.CHANNEL_SPLIT_X + config.X_TOLERANCE:
                channel_parts.append(text_token)
            else:
                description_parts.append(text_token)

        withdrawal_value: Optional[float] = None
        deposit_value: Optional[float] = None
        for candidate in dc_word_candidates:
            try:
                numeric_val = float(candidate["text"].replace(",", ""))
            except Exception as exc:
                print(f"⚠️ Error converting withdrawal/deposit to float on page {page_index}: {exc}")
                continue

            # The right edge decides the column: left of the split → withdrawal.
            if candidate["x1"] <= config.WITHDRAW_DEPOSIT_SPLIT_X:
                withdrawal_value = numeric_val
            else:
                deposit_value = numeric_val

        balance_value: Optional[float] = None
        if balance_word_candidates:
            # First balance-band amount in reading order.
            try:
                balance_value = float(balance_word_candidates[0]["text"].replace(",", ""))
            except Exception as exc:
                print(f"⚠️ Error converting balance to float on page {page_index}: {exc}")
                balance_value = None

        return {
            "date":       date_text,
            "time":       time_text,
            "code":       " ".join(code_parts),
            "channel":    " ".join(channel_parts),
            "withdrawal": withdrawal_value,
            "deposit":    deposit_value,
            "balance":    balance_value,
            "description": " ".join(description_parts),
        }

    def clean_extracted_data(
        self,
        header_dataframe: pd.DataFrame,
        transaction_dataframe: pd.DataFrame
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Standardize column names, rename withdrawal→debit, deposit→credit, 
        drop unused columns, fill NaNs, and cast to float where needed.
        """
        # Work on copies to avoid assignment‐on‐slice issues:
        header_copy = header_dataframe.copy()
        transaction_copy = transaction_dataframe.copy()

        # Select and rename in header: total_withdrawal→total_debit, etc.
        header_selected = header_copy[[
            "page_id", "account_name", "account_number", "period",
            "total_withdrawal", "total_deposit",
            "total_withdrawal_transaction", "total_deposit_transaction"
        ]].copy()

        header_selected = header_selected.rename(columns={
            "total_withdrawal":             "total_debit",
            "total_deposit":                "total_credit",
            "total_withdrawal_transaction": "total_debit_transaction",
            "total_deposit_transaction":    "total_credit_transaction"
        })

        # Rename in transactions: withdrawal→debit, deposit→credit
        transaction_selected = transaction_copy.copy()
        transaction_selected = transaction_selected.rename(columns={
            "withdrawal": "debit",
            "deposit":    "credit"
        })

        # Create transaction_type column from “code”, then drop “code”
        transaction_selected["transaction_type"] = transaction_selected["code"]
        transaction_selected["code"] = None

        # Clean page_id strings in both DataFrames
        header_selected["page_id"] = header_selected["page_id"].apply(self.clean_page_id)
        transaction_selected["page_id"] = transaction_selected["page_id"].apply(self.clean_page_id)

        # Fill missing with empty strings
        header_selected.fillna("", inplace=True)
        transaction_selected.fillna("", inplace=True)

        # Add empty “address” column
        header_selected["address"] = ""

        # Cast numeric columns to floats
        for col_name in ["debit", "credit", "balance"]:
            if col_name in transaction_selected.columns:
                transaction_selected[col_name] = self.clean_float_column(transaction_selected[col_name])

        for col_name in [
            "total_debit", "total_credit",
            "total_debit_transaction", "total_credit_transaction"
        ]:
            if col_name in header_selected.columns:
                header_selected[col_name] = self.clean_float_column(header_selected[col_name])

        return header_selected, transaction_selected

    def run(self) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Open the PDF, extract headers and transactions, clean them, and return two DataFrames.
        """
        header_df = self.extract_header_dataframe()
        transaction_df = self.extract_transaction_dataframe()
        cleaned_header_df, cleaned_transaction_df = self.clean_extracted_data(header_df, transaction_df)
        return cleaned_header_df, cleaned_transaction_df


if __name__ == "__main__":
    # Run the extractor with its default path/password and show both results.
    extractor = BayStatementExtractor()
    header_dataframe, transaction_dataframe = extractor.run()

    # One print per section: banner line first, then the frame on the next line.
    print("\n--- ➔ HEADER DATAFRAME ---", header_dataframe, sep="\n")

    print("\n--- ➔ TRANSACTION DATAFRAME ---", transaction_dataframe, sep="\n")



--- ➔ HEADER DATAFRAME ---
   page_id             account_name account_number                   period  \
0     1/14  นาย อนันตชัย วัชรเสถียร  XXX-1-64478-X  28/02/2023 - 28/08/2023   
1     2/14                                                                    
2     3/14                                                                    
3     4/14                                                                    
4     5/14                                                                    
5     6/14                                                                    
6     7/14                                                                    
7     8/14                                                                    
8     9/14                                                                    
9    10/14                                                                    
10   11/14                                                                    
11   12/14              

  transaction_selected.fillna("", inplace=True)


In [1]:
# extractor.py

import re
import pdfplumber
import pandas as pd
from typing import Optional, Dict, Tuple, List

import config


class BayStatementExtractor:
    """
    Encapsulates extraction of headers and transactions from a BAY bank‐statement PDF.
    Now split into two methods that accept `pages` directly.
    """

    def __init__(self):
        # Precompile regexes once:
        self._date_regex = re.compile(config.DATE_PATTERN)
        self._time_regex = re.compile(config.TIME_PATTERN)
        self._money_regex = re.compile(config.MONEY_PATTERN)

    
    def clean_float_column(series: pd.Series) -> pd.Series:
        """
        Strip non‐numeric characters from a column of strings, and convert to float.
        """
        cleaned = series.astype(str).apply(lambda s: re.sub(r"[^0-9\.]", "", s))
        return pd.to_numeric(cleaned, errors="coerce")

    
    def clean_page_id(raw_page_id: str) -> str:
        """
        Standardize a raw page‐id string into "N/M" format.
        """
        numeric_parts = re.findall(r"\d+", raw_page_id)
        if len(numeric_parts) >= 2:
            candidate = f"{numeric_parts[0]}/{numeric_parts[1]}"
            if re.fullmatch(r"\d+/\d+", candidate):
                return candidate
        return ""

    def extract_page_id(self, page: pdfplumber.page.Page) -> str:
        """
        Try multiple crop‐regions until we successfully read a page‐id in "N/M" format.
        """
        for bbox in config.PAGE_ID_CROPS:
            try:
                raw_text = page.crop(bbox).extract_text() or ""
                page_id_candidate = self.clean_page_id(raw_text.strip())
                if page_id_candidate:
                    return page_id_candidate
            except Exception:
                continue

        return ""

    def extract_headers_from_pages(
        self, pages: List[pdfplumber.page.Page]
    ) -> List[Dict[str, Optional[str]]]:
        """
        Loop over a list of `pdfplumber.page.Page` objects and build a list of header‐dicts:
          - page_id
          - account_name
          - account_number
          - period
          - total_withdrawal_transaction, total_withdrawal
          - total_deposit_transaction, total_deposit
        (Exactly the same logic as in extract_header_dataframe, but acting on `pages` directly.)
        """
        header_records: List[Dict[str, Optional[str]]] = []

        for page_index, page in enumerate(pages, start=1):
            try:
                page_id = self.extract_page_id(page)
                full_text = page.extract_text() or ""
                footer_present = any(
                    kw.lower() in full_text.lower()
                    for kw in config.FOOTER_KEYWORDS_HEADER
                )
                has_header_info = any(
                    kw.lower() in full_text.lower()
                    for kw in ["ชื่อบัญชี", "Account No."]
                )

                header_record: Dict[str, Optional[str]] = {"page_id": page_id}

                # Extract each field from its bbox, if we see header keywords
                for field_name, bbox in config.HEADER_CROPS.items():
                    try:
                        if has_header_info:
                            raw_field_text = page.crop(bbox).extract_text() or ""
                            header_record[field_name] = raw_field_text.strip().replace("\n", " ")
                        else:
                            header_record[field_name] = None
                    except Exception:
                        header_record[field_name] = None

                # If footer is present, parse "Total Withdrawal / Deposit" lines
                if footer_present:
                    for line in full_text.splitlines():
                        if line.startswith(("Total Withdrawal", "รายการถอนเงิน")):
                            numbers = re.findall(r"[\d,]+(?:\.\d{2})?", line)
                            header_record["total_withdrawal_transaction"] = (
                                numbers[0].replace(",", "") if len(numbers) > 0 else None
                            )
                            header_record["total_withdrawal"] = (
                                numbers[1].replace(",", "") if len(numbers) > 1 else None
                            )

                        elif line.startswith(("Total Deposit", "รายการฝากเงิน")):
                            numbers = re.findall(r"[\d,]+(?:\.\d{2})?", line)
                            header_record["total_deposit_transaction"] = (
                                numbers[0].replace(",", "") if len(numbers) > 0 else None
                            )
                            header_record["total_deposit"] = (
                                numbers[1].replace(",", "") if len(numbers) > 1 else None
                            )

                header_records.append(header_record)

            except Exception:
                # Skip this page if anything goes wrong
                continue

        # After collecting all page‐headers, post‐process “period” column into two datetimes
        header_df = pd.DataFrame(header_records)
        if "period" in header_df.columns:
            working_copy = header_df.copy()
            period_series = working_copy["period"].fillna("").str.replace(" ", "")
            split_period = period_series.str.split(r"[-–]", n=1, expand=True)

            if split_period.shape[1] < 2:
                split_period[1] = None

            working_copy["start_period"] = pd.to_datetime(
                split_period[0], dayfirst=True, errors="coerce"
            )
            working_copy["end_period"] = pd.to_datetime(
                split_period[1], dayfirst=True, errors="coerce"
            )
            header_df = working_copy

        # Return the list of raw dicts (you can wrap in DataFrame externally)
        return header_df.to_dict(orient="records")

    def extract_transactions_from_pages(
        self, pages: List[pdfplumber.page.Page]
    ) -> List[Dict[str, Optional[str or float]]]:
        """
        Loop over each page, find table regions, group words by row, and build
        a list of transaction‐dicts with keys:
          page_id, date, time, code, channel, withdrawal, deposit, balance, description
        (Same as extract_transaction_dataframe, but on `pages` directly.)
        """
        transaction_records: List[Dict[str, Optional[str or float]]] = []

        for page_index, page in enumerate(pages, start=1):
            try:
                page_id = self.extract_page_id(page)

                # Find tables via pdfplumber (if any)
                try:
                    tables_on_page = page.find_tables(config.TABLE_SETTINGS)
                except Exception:
                    tables_on_page = []

                if tables_on_page:
                    regions = []
                    for table_obj in tables_on_page:
                        try:
                            regions.append(page.crop(table_obj.bbox))
                        except Exception:
                            pass
                else:
                    regions = [page]

                for region in regions:
                    try:
                        words = region.extract_words(use_text_flow=True)
                    except Exception:
                        continue

                    # Bucket words into rows by vertical position
                    row_buckets: Dict[int, List[dict]] = {}
                    for word in words:
                        row_key = int(word["top"] // config.Y_MARGIN)
                        row_buckets.setdefault(row_key, []).append(word)

                    # Check for any footer lines in those rows
                    footer_y_positions = [
                        min(w["top"] for w in one_row)
                        for one_row in row_buckets.values()
                        if any(
                            kw in " ".join(w["text"] for w in one_row)
                            for kw in config.TABLE_FOOTER_KEYWORDS
                        )
                    ]

                    if footer_y_positions:
                        cutoff = min(footer_y_positions) - config.TABLE_FOOTER_MARGIN
                        full_height = region.bbox[3] - region.bbox[1]
                        if 0 < cutoff < full_height:
                            try:
                                cropped_region = region.crop((0, 0, region.width, cutoff), relative=True)
                                words = cropped_region.extract_words(use_text_flow=True)
                            except Exception:
                                pass

                            # Re‐bucket after cropping
                            row_buckets.clear()
                            for word in words:
                                row_key = int(word["top"] // config.Y_MARGIN)
                                row_buckets.setdefault(row_key, []).append(word)

                    # Build “interval” y‐ranges from any date‐like words
                    sorted_tops = sorted(
                        w["top"]
                        for w in words
                        if self._date_regex.match(w["text"])
                        and config.DATE_COLUMN_X0 <= w["x0"] <= config.DATE_COLUMN_X1
                    )
                    if not sorted_tops:
                        continue

                    intervals: List[Tuple[float, float]] = []
                    for idx_top, y_val in enumerate(sorted_tops):
                        start_y = y_val - config.Y_MARGIN
                        if idx_top + 1 < len(sorted_tops):
                            next_y = sorted_tops[idx_top + 1]
                            end_y = next_y - config.Y_MARGIN
                        else:
                            previous_y = (
                                sorted_tops[idx_top - 1]
                                if idx_top > 0
                                else (y_val - 2 * config.Y_MARGIN)
                            )
                            end_y = y_val + (y_val - previous_y) - config.Y_MARGIN
                        intervals.append((start_y, end_y))

                    # Assign words to each interval (row)
                    rows_of_words: List[List[dict]] = [[] for _ in intervals]
                    for word in words:
                        for interval_index, (start_y, end_y) in enumerate(intervals):
                            if start_y <= word["top"] < end_y:
                                rows_of_words[interval_index].append(word)
                                break

                    # Parse each “row” into structured fields
                    for row_words in rows_of_words:
                        if not row_words:
                            continue

                        row_words_sorted = sorted(row_words, key=lambda w: (w["top"], w["x0"]))
                        row_text_combined = " ".join(w["text"] for w in row_words_sorted)

                        # Skip total lines
                        if any(lbl in row_text_combined for lbl in ("TOTAL AMOUNTS", "TOTAL ITEMS")):
                            continue

                        date_text = next(
                            (w["text"] for w in row_words_sorted if self._date_regex.match(w["text"])),
                            ""
                        )
                        time_text = next(
                            (w["text"] for w in row_words_sorted if self._time_regex.match(w["text"])),
                            ""
                        )

                        code_parts: List[str] = []
                        channel_parts: List[str] = []
                        description_parts: List[str] = []
                        dc_word_candidates: List[dict] = []
                        balance_word_candidates: List[dict] = []

                        for w in row_words_sorted:
                            text_token = w["text"]
                            x0 = w["x0"]

                            if self._date_regex.match(text_token) or self._time_regex.match(text_token):
                                continue

                            if self._money_regex.match(text_token):
                                # Is it debit/credit or balance?
                                if config.CHANNEL_DC_SPLIT_X <= x0 <= config.DC_BALANCE_SPLIT_X:
                                    dc_word_candidates.append(w)
                                elif config.DC_BALANCE_SPLIT_X <= x0 <= config.BALANCE_DESCRIPTION_SPLIT_X:
                                    balance_word_candidates.append(w)
                                continue

                            if x0 <= config.CODE_CHANNEL_SPLIT_X + config.X_TOLERANCE:
                                code_parts.append(text_token)
                            elif x0 <= config.CHANNEL_SPLIT_X + config.X_TOLERANCE:
                                channel_parts.append(text_token)
                            else:
                                description_parts.append(text_token)

                        withdrawal_value: Optional[float] = None
                        deposit_value: Optional[float] = None
                        for candidate in dc_word_candidates:
                            try:
                                numeric_val = float(candidate["text"].replace(",", ""))
                            except Exception:
                                numeric_val = None

                            if numeric_val is not None:
                                if candidate["x1"] <= config.WITHDRAW_DEPOSIT_SPLIT_X:
                                    withdrawal_value = numeric_val
                                else:
                                    deposit_value = numeric_val

                        balance_value: Optional[float] = None
                        if balance_word_candidates:
                            chosen = next(
                                (w for w in balance_word_candidates if w["x0"] >= config.DC_BALANCE_SPLIT_X),
                                None
                            )
                            if chosen:
                                try:
                                    balance_value = float(chosen["text"].replace(",", ""))
                                except Exception:
                                    balance_value = None

                        transaction_records.append({
                            "page_id":    page_id,
                            "date":       date_text,
                            "time":       time_text,
                            "code":       " ".join(code_parts),
                            "channel":    " ".join(channel_parts),
                            "withdrawal": withdrawal_value,
                            "deposit":    deposit_value,
                            "balance":    balance_value,
                            "description": " ".join(description_parts),
                        })

            except Exception:
                continue

        return transaction_records

    def clean_extracted_data(
        self,
        header_dataframe: pd.DataFrame,
        transaction_dataframe: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Standardize column names, rename withdrawal→debit, deposit→credit, 
        drop unused columns, fill NaNs, and cast to float where needed.
        """
        # Work on copies to avoid assignment-on-slice issues:
        header_copy = header_dataframe.copy()
        transaction_copy = transaction_dataframe.copy()

        # Select and rename in header: total_withdrawal→total_debit, etc.
        header_selected = header_copy[[
            "page_id", "account_name", "account_number", "period",
            "total_withdrawal", "total_deposit",
            "total_withdrawal_transaction", "total_deposit_transaction"
        ]].copy()

        header_selected = header_selected.rename(columns={
            "total_withdrawal":             "total_debit",
            "total_deposit":                "total_credit",
            "total_withdrawal_transaction": "total_debit_transaction",
            "total_deposit_transaction":    "total_credit_transaction"
        })

        # Rename in transactions: withdrawal→debit, deposit→credit
        transaction_selected = transaction_copy.copy()
        transaction_selected = transaction_selected.rename(columns={
            "withdrawal": "debit",
            "deposit":    "credit"
        })

        # Create transaction_type column from “code”, then drop “code”
        transaction_selected["transaction_type"] = transaction_selected["code"]
        transaction_selected["code"] = None

        # Clean page_id strings in both DataFrames
        header_selected["page_id"] = header_selected["page_id"].apply(self.clean_page_id)
        transaction_selected["page_id"] = transaction_selected["page_id"].apply(self.clean_page_id)

        # Fill missing with empty strings
        header_selected.fillna("", inplace=True)
        transaction_selected.fillna("", inplace=True)

        # Add empty “address” column
        header_selected["address"] = ""

        # Cast numeric columns to floats
        for col_name in ["debit", "credit", "balance"]:
            if col_name in transaction_selected.columns:
                transaction_selected[col_name] = self.clean_float_column(transaction_selected[col_name])

        for col_name in [
            "total_debit", "total_credit",
            "total_debit_transaction", "total_credit_transaction"
        ]:
            if col_name in header_selected.columns:
                header_selected[col_name] = self.clean_float_column(header_selected[col_name])

        return header_selected, transaction_selected

    def run(self, pdf_path: str, password: Optional[str] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Convenience method: open the PDF, extract headers & transactions,
        clean them, and return two DataFrames.
        """
        with pdfplumber.open(pdf_path, password=password) as pdf:
            pages = pdf.pages
            raw_header_records = self.extract_headers_from_pages(pages)
            raw_transaction_records = self.extract_transactions_from_pages(pages)

        # Convert raw lists of dicts into DataFrames
        header_df = pd.DataFrame(raw_header_records)
        transaction_df = pd.DataFrame(raw_transaction_records)

        # Clean them
        cleaned_header_df, cleaned_transaction_df = self.clean_extracted_data(
            header_df, transaction_df
        )
        return cleaned_header_df, cleaned_transaction_df





In [8]:
import re
import pdfplumber
import pandas as pd
from typing import Optional, Dict, Tuple, List

import config


class BayStatementExtractor:
    """
    Encapsulates extraction of headers and transactions from a BAY bank‐statement PDF.
    Now split into two methods that accept `pages` directly.
    """

    def __init__(self):
        # Precompile regexes once:
        self._date_regex = re.compile(config.DATE_PATTERN)
        self._time_regex = re.compile(config.TIME_PATTERN)
        self._money_regex = re.compile(config.MONEY_PATTERN)

    @staticmethod
    def clean_float_column(series: pd.Series) -> pd.Series:
        """
        Strip non‐numeric characters from a column of strings, and convert to float.
        """
        cleaned = series.astype(str).apply(lambda s: re.sub(r"[^0-9\.]", "", s))
        return pd.to_numeric(cleaned, errors="coerce")

    @staticmethod
    def clean_page_id(raw_page_id: str) -> str:
        """
        Standardize a raw page‐id string into "N/M" format.
        """
        numeric_parts = re.findall(r"\d+", raw_page_id)
        if len(numeric_parts) >= 2:
            candidate = f"{numeric_parts[0]}/{numeric_parts[1]}"
            if re.fullmatch(r"\d+/\d+", candidate):
                return candidate
        return ""

    def extract_page_id(self, page: pdfplumber.page.Page) -> str:
        """
        Try multiple crop‐regions until we successfully read a page‐id in "N/M" format.
        """
        for bbox in config.PAGE_ID_CROPS:
            try:
                raw_text = page.crop(bbox).extract_text() or ""
                page_id_candidate = self.clean_page_id(raw_text.strip())
                if page_id_candidate:
                    return page_id_candidate
            except Exception:
                continue

        return ""

    def extract_headers_from_pages(
        self, pages: List[pdfplumber.page.Page]
    ) -> List[Dict[str, Optional[str]]]:
        """
        Loop over a list of `pdfplumber.page.Page` objects and build a list of header‐dicts:
          - page_id
          - account_name
          - account_number
          - period
          - total_withdrawal_transaction, total_withdrawal
          - total_deposit_transaction, total_deposit
        """
        header_records: List[Dict[str, Optional[str]]] = []

        for page_index, page in enumerate(pages, start=1):
            try:
                page_id = self.extract_page_id(page)
                full_text = page.extract_text() or ""
                footer_present = any(
                    kw.lower() in full_text.lower()
                    for kw in config.FOOTER_KEYWORDS_HEADER
                )
                has_header_info = any(
                    kw.lower() in full_text.lower()
                    for kw in ["ชื่อบัญชี", "Account No."]
                )

                header_record: Dict[str, Optional[str]] = {"page_id": page_id}

                # Extract each field from its bbox, if header keywords exist
                for field_name, bbox in config.HEADER_CROPS.items():
                    try:
                        if has_header_info:
                            raw_field_text = page.crop(bbox).extract_text() or ""
                            header_record[field_name] = raw_field_text.strip().replace("\n", " ")
                        else:
                            header_record[field_name] = None
                    except Exception:
                        header_record[field_name] = None

                # If footer is present, parse the totals lines
                if footer_present:
                    for line in full_text.splitlines():
                        if line.startswith(("Total Withdrawal", "รายการถอนเงิน")):
                            numbers = re.findall(r"[\d,]+(?:\.\d{2})?", line)
                            header_record["total_withdrawal_transaction"] = (
                                numbers[0].replace(",", "") if len(numbers) > 0 else None
                            )
                            header_record["total_withdrawal"] = (
                                numbers[1].replace(",", "") if len(numbers) > 1 else None
                            )

                        elif line.startswith(("Total Deposit", "รายการฝากเงิน")):
                            numbers = re.findall(r"[\d,]+(?:\.\d{2})?", line)
                            header_record["total_deposit_transaction"] = (
                                numbers[0].replace(",", "") if len(numbers) > 0 else None
                            )
                            header_record["total_deposit"] = (
                                numbers[1].replace(",", "") if len(numbers) > 1 else None
                            )

                header_records.append(header_record)

            except Exception:
                # Skip this page if anything goes wrong
                continue

        # After collecting all page‐headers, post‐process “period” into datetimes
        header_df = pd.DataFrame(header_records)
        if "period" in header_df.columns:
            working_copy = header_df.copy()
            period_series = working_copy["period"].fillna("").str.replace(" ", "")
            split_period = period_series.str.split(r"[-–]", n=1, expand=True)

            if split_period.shape[1] < 2:
                split_period[1] = None

            working_copy["start_period"] = pd.to_datetime(
                split_period[0], dayfirst=True, errors="coerce"
            )
            working_copy["end_period"] = pd.to_datetime(
                split_period[1], dayfirst=True, errors="coerce"
            )
            header_df = working_copy

        return header_df.to_dict(orient="records")

    def extract_transactions_from_pages(
        self, pages: List[pdfplumber.page.Page]
    ) -> List[Dict[str, Optional[str or float]]]:
        """
        Loop over each page, find table regions, group words by row, and build
        a list of transaction‐dicts with keys:
          page_id, date, time, code, channel, withdrawal, deposit, balance, description
        """
        transaction_records: List[Dict[str, Optional[str or float]]] = []

        for page_index, page in enumerate(pages, start=1):
            try:
                page_id = self.extract_page_id(page)

                # Find tables via pdfplumber (if any)
                try:
                    tables_on_page = page.find_tables(config.TABLE_SETTINGS)
                except Exception:
                    tables_on_page = []

                if tables_on_page:
                    regions = []
                    for table_obj in tables_on_page:
                        try:
                            regions.append(page.crop(table_obj.bbox))
                        except Exception:
                            pass
                else:
                    regions = [page]

                for region in regions:
                    try:
                        words = region.extract_words(use_text_flow=True)
                    except Exception:
                        continue

                    # Bucket words into rows by vertical position
                    row_buckets: Dict[int, List[dict]] = {}
                    for word in words:
                        row_key = int(word["top"] // config.Y_MARGIN)
                        row_buckets.setdefault(row_key, []).append(word)

                    # Check for any footer lines in those rows
                    footer_y_positions = [
                        min(w["top"] for w in one_row)
                        for one_row in row_buckets.values()
                        if any(
                            kw in " ".join(w["text"] for w in one_row)
                            for kw in config.TABLE_FOOTER_KEYWORDS
                        )
                    ]

                    if footer_y_positions:
                        cutoff = min(footer_y_positions) - config.TABLE_FOOTER_MARGIN
                        full_height = region.bbox[3] - region.bbox[1]
                        if 0 < cutoff < full_height:
                            try:
                                cropped_region = region.crop((0, 0, region.width, cutoff), relative=True)
                                words = cropped_region.extract_words(use_text_flow=True)
                            except Exception:
                                pass

                            # Re‐bucket after cropping
                            row_buckets.clear()
                            for word in words:
                                row_key = int(word["top"] // config.Y_MARGIN)
                                row_buckets.setdefault(row_key, []).append(word)

                    # Build “interval” y‐ranges from any date‐like words
                    sorted_tops = sorted(
                        w["top"]
                        for w in words
                        if self._date_regex.match(w["text"])
                        and config.DATE_COLUMN_X0 <= w["x0"] <= config.DATE_COLUMN_X1
                    )
                    if not sorted_tops:
                        continue

                    intervals: List[Tuple[float, float]] = []
                    for idx_top, y_val in enumerate(sorted_tops):
                        start_y = y_val - config.Y_MARGIN
                        if idx_top + 1 < len(sorted_tops):
                            next_y = sorted_tops[idx_top + 1]
                            end_y = next_y - config.Y_MARGIN
                        else:
                            previous_y = (
                                sorted_tops[idx_top - 1]
                                if idx_top > 0
                                else (y_val - 2 * config.Y_MARGIN)
                            )
                            end_y = y_val + (y_val - previous_y) - config.Y_MARGIN
                        intervals.append((start_y, end_y))

                    # Assign words to each interval (row)
                    rows_of_words: List[List[dict]] = [[] for _ in intervals]
                    for word in words:
                        for interval_index, (start_y, end_y) in enumerate(intervals):
                            if start_y <= word["top"] < end_y:
                                rows_of_words[interval_index].append(word)
                                break

                    # Parse each “row” into structured fields
                    for row_words in rows_of_words:
                        if not row_words:
                            continue

                        row_words_sorted = sorted(row_words, key=lambda w: (w["top"], w["x0"]))
                        row_text_combined = " ".join(w["text"] for w in row_words_sorted)

                        # Skip total lines
                        if any(lbl in row_text_combined for lbl in ("TOTAL AMOUNTS", "TOTAL ITEMS")):
                            continue

                        date_text = next(
                            (w["text"] for w in row_words_sorted if self._date_regex.match(w["text"])),
                            ""
                        )
                        time_text = next(
                            (w["text"] for w in row_words_sorted if self._time_regex.match(w["text"])),
                            ""
                        )

                        code_parts: List[str] = []
                        channel_parts: List[str] = []
                        description_parts: List[str] = []
                        dc_word_candidates: List[dict] = []
                        balance_word_candidates: List[dict] = []

                        for w in row_words_sorted:
                            text_token = w["text"]
                            x0 = w["x0"]

                            if self._date_regex.match(text_token) or self._time_regex.match(text_token):
                                continue

                            if self._money_regex.match(text_token):
                                # Is it debit/credit or balance?
                                if config.CHANNEL_DC_SPLIT_X <= x0 <= config.DC_BALANCE_SPLIT_X:
                                    dc_word_candidates.append(w)
                                elif config.DC_BALANCE_SPLIT_X <= x0 <= config.BALANCE_DESCRIPTION_SPLIT_X:
                                    balance_word_candidates.append(w)
                                continue

                            if x0 <= config.CODE_CHANNEL_SPLIT_X + config.X_TOLERANCE:
                                code_parts.append(text_token)
                            elif x0 <= config.CHANNEL_SPLIT_X + config.X_TOLERANCE:
                                channel_parts.append(text_token)
                            else:
                                description_parts.append(text_token)

                        withdrawal_value: Optional[float] = None
                        deposit_value: Optional[float] = None
                        for candidate in dc_word_candidates:
                            try:
                                numeric_val = float(candidate["text"].replace(",", ""))
                            except Exception:
                                numeric_val = None

                            if numeric_val is not None:
                                if candidate["x1"] <= config.WITHDRAW_DEPOSIT_SPLIT_X:
                                    withdrawal_value = numeric_val
                                else:
                                    deposit_value = numeric_val

                        balance_value: Optional[float] = None
                        if balance_word_candidates:
                            chosen = next(
                                (w for w in balance_word_candidates if w["x0"] >= config.DC_BALANCE_SPLIT_X),
                                None
                            )
                            if chosen:
                                try:
                                    balance_value = float(chosen["text"].replace(",", ""))
                                except Exception:
                                    balance_value = None

                        transaction_records.append({
                            "page_id":    page_id,
                            "date":       date_text,
                            "time":       time_text,
                            "code":       " ".join(code_parts),
                            "channel":    " ".join(channel_parts),
                            "withdrawal": withdrawal_value,
                            "deposit":    deposit_value,
                            "balance":    balance_value,
                            "description": " ".join(description_parts),
                        })

            except Exception:
                continue

        return transaction_records

    def clean_extracted_data(
        self,
        header_dataframe: pd.DataFrame,
        transaction_dataframe: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Normalise the raw header and transaction frames.

        Header frame: keeps page_id, account_name, account_number, period and
        the four footer-total columns, renames the totals from
        withdrawal/deposit to debit/credit, and appends an empty "address"
        column.

        Transaction frame: renames withdrawal→debit and deposit→credit, copies
        "code" into a new "transaction_type" column and then blanks "code"
        (the column is kept, not removed).

        In both frames page_id is passed through self.clean_page_id, missing
        values are replaced by "", and the numeric columns are then passed
        through self.clean_float_column. The input frames are left untouched.

        Returns:
            (cleaned_header, cleaned_transactions)
        """
        header_renames = {
            "total_withdrawal":             "total_debit",
            "total_deposit":                "total_credit",
            "total_withdrawal_transaction": "total_debit_transaction",
            "total_deposit_transaction":    "total_credit_transaction",
        }
        header_source_columns = [
            "page_id", "account_name", "account_number", "period",
            "total_withdrawal", "total_deposit",
            "total_withdrawal_transaction", "total_deposit_transaction",
        ]

        # Column selection already yields a new frame; .copy() makes that explicit.
        header_selected = (
            header_dataframe[header_source_columns]
            .copy()
            .rename(columns=header_renames)
        )

        # assign() applies its keywords in order, so transaction_type captures
        # the original "code" values before "code" itself is blanked.
        transaction_selected = (
            transaction_dataframe
            .rename(columns={"withdrawal": "debit", "deposit": "credit"})
            .assign(transaction_type=lambda frame: frame["code"], code=None)
        )

        # Page ids: header first, then transactions.
        for frame in (header_selected, transaction_selected):
            frame["page_id"] = frame["page_id"].apply(self.clean_page_id)

        # Missing values become empty strings in both frames.
        header_selected = header_selected.fillna("")
        transaction_selected = transaction_selected.fillna("")

        header_selected["address"] = ""

        # Numeric casts: transaction amounts first, then the header totals.
        numeric_targets = (
            (transaction_selected, ["debit", "credit", "balance"]),
            (header_selected, [
                "total_debit", "total_credit",
                "total_debit_transaction", "total_credit_transaction",
            ]),
        )
        for frame, numeric_columns in numeric_targets:
            for col_name in numeric_columns:
                if col_name in frame.columns:
                    frame[col_name] = self.clean_float_column(frame[col_name])

        return header_selected, transaction_selected

    def run(self, pdf_path: str, password: Optional[str] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        One-call pipeline for a single statement PDF.

        Opens pdf_path with pdfplumber (forwarding password), runs the header
        and transaction extractors over its pages, builds a DataFrame from
        each list of records and returns the pair produced by
        clean_extracted_data: (cleaned headers, cleaned transactions).
        """
        # Both extraction passes run while the PDF is still open.
        with pdfplumber.open(pdf_path, password=password) as document:
            page_list = document.pages
            header_rows = self.extract_headers_from_pages(page_list)
            transaction_rows = self.extract_transactions_from_pages(page_list)

        # Frame construction and cleaning need only the extracted records,
        # so they happen after the file has been closed.
        return self.clean_extracted_data(
            pd.DataFrame(header_rows),
            pd.DataFrame(transaction_rows),
        )


In [10]:
import pdfplumber
# Must be bound as `pd`: the cell calls pd.DataFrame below, and the name `pdf`
# is reused as the open-PDF handle in the `with` statement.
import pandas as pd

# Single-statement smoke test: extract one PDF and show raw vs. cleaned frames.
PDF_PATH = "/Users/if658228/Downloads/OneDrive_1_5-20-2025/agentic_extraction/Dataset04/BAY/Total Depositเลื่อน.pdf"
PASSWORD = None
with pdfplumber.open(PDF_PATH, password=PASSWORD) as pdf:
    pages = pdf.pages
    # NOTE(review): BayStatementExtractor is not imported in this cell; it is
    # presumably defined by an earlier cell — confirm before Restart & Run All.
    extractor = BayStatementExtractor()

    # 1) Get raw lists of dicts (must happen while the PDF is still open)
    raw_header_records      = extractor.extract_headers_from_pages(pages)
    raw_transaction_records = extractor.extract_transactions_from_pages(pages)

# 2) Convert to DataFrames
header_dataframe      = pd.DataFrame(raw_header_records)
transaction_dataframe = pd.DataFrame(raw_transaction_records)

# 3) Run the cleaning step separately so raw and cleaned frames can be compared:
clean_headers, clean_transactions = extractor.clean_extracted_data(header_dataframe, transaction_dataframe)

print("--- Raw Headers DF ---")
print(header_dataframe.head())

print("\n--- Cleaned Headers DF ---")
print(clean_headers.head())

print("\n--- Raw Transactions DF ---")
print(transaction_dataframe.head())

print("\n--- Cleaned Transactions DF ---")
print(clean_transactions.head())


--- Raw Headers DF ---
  page_id           account_name account_number                   period  \
0     1/3  นาย สุพจน์ วัชรเสถียร  XXX-1-07840-X  28/02/2023 - 28/08/2023   
1     2/3                   None           None                     None   
2     3/3                   None           None                     None   

  total_withdrawal_transaction total_withdrawal total_deposit_transaction  \
0                          NaN              NaN                       NaN   
1                          NaN              NaN                       NaN   
2                           30        414882.00                        34   

  total_deposit start_period end_period  
0           NaN   2023-02-28 2023-08-28  
1           NaN          NaT        NaT  
2     414914.48          NaT        NaT  

--- Cleaned Headers DF ---
  page_id           account_name account_number                   period  \
0     1/3  นาย สุพจน์ วัชรเสถียร  XXX-1-07840-X  28/02/2023 - 28/08/2023   
1     2/3      

  transaction_selected.fillna("", inplace=True)


In [11]:
clean_headers

Unnamed: 0,page_id,account_name,account_number,period,total_debit,total_credit,total_debit_transaction,total_credit_transaction,address
0,1/3,นาย สุพจน์ วัชรเสถียร,XXX-1-07840-X,28/02/2023 - 28/08/2023,,,,,
1,2/3,,,,,,,,
2,3/3,,,,414882.0,414914.48,30.0,34.0,


In [12]:
clean_transactions

Unnamed: 0,page_id,date,time,code,channel,debit,credit,balance,description,transaction_type
0,1/3,07/03/2023,01:43:14,,OTHERS,,13464.0,13472.02,,รับโอนเงินเดือน
1,1/3,07/03/2023,12:15:42,,MOBILE,13400.0,,72.02,KBANK บัญชีปลายทาง : X439318,โอนเงิน
2,1/3,09/03/2023,01:41:11,,OTHERS,,900.0,972.02,,รับโอนเงินเดือน
3,1/3,10/03/2023,13:28:25,,MOBILE,570.0,,402.02,บริษัท เอี่ยวไถ่เอ็กซ์เพรส...,จ่ายบิล
4,1/3,10/03/2023,13:33:30,,MOBILE,174.0,,228.02,บริษัท เซ็นทรัล เรสตอรองส์...,จ่ายบิล
...,...,...,...,...,...,...,...,...,...,...
59,2/3,08/08/2023,16:18:06,,MOBILE,43800.0,,69.50,KBANK บัญชีปลายทาง : X439318,โอนเงิน
60,2/3,17/08/2023,01:42:46,,OTHERS,,2776.0,2845.50,,รับโอนเงินเดือน
61,2/3,18/08/2023,14:32:45,,MOBILE,2800.0,,45.50,KBANK บัญชีปลายทาง : X439318,โอนเงิน
62,2/3,25/08/2023,02:14:29,,OTHERS,,9095.0,9140.50,,รับโอนเงินเดือน


In [1]:
import os, traceback
import numpy as np
import pandas as pd
from typing import Tuple, List, Dict, Any, Optional
import pdfplumber
from bay_extractor import BAYStatementExtractor




# Folder scanned for statement PDFs; the __main__ block passes it to process_folder().
INPUT_FOLDER = "/Users/if658228/Downloads/OneDrive_1_5-20-2025/agentic_extraction/Dataset04/BAY"
# PDF password read by process_folder(); left as None here.
PASSWORD: Optional[str] = None

def process_folder(input_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame, List[dict]]:
    """
    Run the BAY extractor over every PDF in a folder and stack the results.

    Each entry whose name ends in ".pdf" (case-insensitive) is processed with
    BAYStatementExtractor().run(); the header and transaction frames it
    returns get a "source_file" column holding the file name. A file that
    raises is recorded and skipped instead of aborting the batch.

    Args:
        input_folder: Directory to scan (top level only).

    Returns:
        (all_tx, all_hdr, failures): the concatenated transaction frame, the
        concatenated header frame (each an empty DataFrame when no file
        succeeded), and one {"file", "error", "traceback"} dict per failed
        file.
    """
    tx_list, hdr_list, failures = [], [], []
    # os.listdir() order is arbitrary; sorting makes the row order of the
    # combined frames reproducible between runs.
    for fn in sorted(os.listdir(input_folder)):
        if not fn.lower().endswith(".pdf"):
            continue
        path = os.path.join(input_folder, fn)
        try:
            # run() receives the path and does its own opening, so the file is
            # not opened a second time here; the module-level PASSWORD is
            # forwarded rather than a hard-coded None.
            df_hdr, df_tx = BAYStatementExtractor().run(path, PASSWORD)
            df_hdr["source_file"] = fn
            df_tx["source_file"] = fn
            tx_list.append(df_tx)
            hdr_list.append(df_hdr)
        except Exception as e:
            # Keep going: one unreadable statement must not stop the batch.
            failures.append({
                "file": fn,
                "error": str(e),
                "traceback": traceback.format_exc()
            })
    all_tx  = pd.concat(tx_list, ignore_index=True) if tx_list else pd.DataFrame()
    all_hdr = pd.concat(hdr_list, ignore_index=True) if hdr_list else pd.DataFrame()
    return all_tx, all_hdr, failures

def validate_bbl(df_tx_all: pd.DataFrame, df_hdr_all: pd.DataFrame) -> pd.DataFrame:
    """
    Reconcile extracted transactions with the statement footer totals, per file.

    For every source_file the debit/credit amounts are summed and counted and
    compared with the totals found in the header frame.

    Args:
        df_tx_all: Transactions with columns source_file, debit, credit.
            Amounts may be floats or '' (treated as missing).
        df_hdr_all: Header rows with columns source_file, total_debit,
            total_credit, total_debit_transaction, total_credit_transaction.
            Rows lacking either *_transaction value are ignored.

    Returns:
        One row per file with the footer totals, the recomputed sums/counts
        and four boolean match flags. Amounts match within 0.01. A file that
        appears on only one side (transactions without footer totals, or the
        reverse) is still listed, with NaN on the missing side and its flags
        False, so it shows up as a mismatch instead of disappearing.
    """
    amount_cols = ['debit', 'credit']

    # STEP 1: ensure numeric ('' marks "no amount" and becomes NaN)
    tx_clean = df_tx_all.copy()
    tx_clean[amount_cols] = (
        tx_clean[amount_cols]
        .replace('', np.nan)
        .astype(float)
    )
    tx_by_file = tx_clean.groupby('source_file')[amount_cols]

    # STEP 2: sums per file (min_count=1 keeps an all-missing column as NaN, not 0)
    sums = tx_by_file.sum(min_count=1).rename(columns={
        'debit':  'sum_debit',
        'credit': 'sum_credit'
    })

    # STEP 3: counts per file (count() ignores NaN, so only real amounts count)
    counts = tx_by_file.count().rename(columns={
        'debit':  'count_debit_tx',
        'credit': 'count_credit_tx'
    })

    # STEP 4: footer totals per file, taken from the rows that carry both counts
    hdr = (df_hdr_all
           .dropna(subset=['total_debit_transaction', 'total_credit_transaction'])
           .groupby('source_file')
           .agg({
               'total_debit_transaction':  'sum',
               'total_debit':              'sum',
               'total_credit_transaction': 'sum',
               'total_credit':             'sum'
           })
           .rename(columns={
               'total_debit_transaction':  'total_debit_txns',
               'total_credit_transaction': 'total_credit_txns'
           }))

    # STEP 5: merge & compare. Outer joins keep files present on one side only;
    # their NaN values compare unequal below, so they are flagged.
    cmp = (hdr
           .join(sums,   how='outer')
           .join(counts, how='outer')
           .reset_index()
           .rename(columns={'source_file': 'file'}))

    summary = cmp.assign(
        debit_amount_match=lambda d: np.isclose(d['total_debit'], d['sum_debit'], atol=1e-2),
        credit_amount_match=lambda d: np.isclose(d['total_credit'], d['sum_credit'], atol=1e-2),
        transaction_count_debit_match=lambda d: d['total_debit_txns'] == d['count_debit_tx'],
        transaction_count_credit_match=lambda d: d['total_credit_txns'] == d['count_credit_tx'],
    )[
        ['file',
         'total_debit', 'sum_debit', 'debit_amount_match',
         'total_credit', 'sum_credit', 'credit_amount_match',
         'total_credit_txns', 'total_debit_txns',
         'count_debit_tx', 'count_credit_tx',
         'transaction_count_debit_match', 'transaction_count_credit_match']
    ]

    return summary

if __name__ == "__main__":
    # Batch-extract every PDF in INPUT_FOLDER, then reconcile each file's
    # transactions against its own footer totals.
    df_tx_all, df_hdr_all, failures = process_folder(INPUT_FOLDER)
    print(f"Processed transactions: {df_tx_all.shape}, headers: {df_hdr_all.shape}")
    if failures:
        print(f"\n⚠️ {len(failures)} failures; inspect `failures` list.")
    if df_tx_all.empty or df_hdr_all.empty:
        print("No data to validate; exiting.")
        # `exit` is a helper the site module adds for interactive sessions and
        # is not guaranteed to exist; raising SystemExit needs no import.
        raise SystemExit(1)
    print(df_tx_all,df_hdr_all)
    summary = validate_bbl(df_tx_all, df_hdr_all)
    print("\n--- Validation Summary per File ---")
    print(summary.to_string(index=False))

    # A file is "bad" as soon as any one of the four checks fails.
    bad = summary.loc[~(summary.debit_amount_match
                        & summary.credit_amount_match
                        & summary.transaction_count_debit_match
                        & summary.transaction_count_credit_match)]
    if not bad.empty:
        # Signed gap (extracted minus footer) for each mismatching file.
        bad = bad.assign(
            diff_debit = bad['sum_debit'] - bad['total_debit'],
            diff_credit    = bad['sum_credit']    - bad['total_credit']
        )
        print("\n❌ Files with mismatches:")
        print(bad.to_string(index=False))
    else:
        print("\n✅ All files validated successfully!")


  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplace=True)
  transaction_selected.fillna("", inplac

Processed transactions: (81797, 11), headers: (2500, 10)
      page_id        date      time code channel   debit  credit  balance  \
0         1/3  19/05/2023  08:29:16          ATM  1000.0     NaN   989.94   
1         1/3  20/05/2023  22:06:36       MOBILE   500.0     NaN   489.94   
2         1/3  21/05/2023  16:55:43       MOBILE   310.0     NaN   179.94   
3         1/3  21/05/2023  17:01:07       MOBILE   150.0     NaN    29.94   
4         1/3  26/05/2023  08:32:09       MOBILE    29.0     NaN     0.94   
...       ...         ...       ...  ...     ...     ...     ...      ...   
81792   10/11  09/08/2023  19:50:40       MOBILE     NaN  1000.0  3652.87   
81793   11/11  09/08/2023  19:52:21       MOBILE  2730.0     NaN   922.87   
81794   11/11  10/08/2023  06:27:41       MOBILE     NaN   400.0  1322.87   
81795   11/11  10/08/2023  06:27:59       MOBILE  1300.0     NaN    22.87   
81796   11/11  11/08/2023  23:04:31       MOBILE    22.0     NaN     0.87   

                  

  transaction_selected.fillna("", inplace=True)


In [2]:
summary

Unnamed: 0,file,total_debit,sum_debit,debit_amount_match,total_credit,sum_credit,credit_amount_match,total_credit_txns,total_debit_txns,count_debit_tx,count_credit_tx,transaction_count_debit_match,transaction_count_credit_match
0,108988-02009226-2566_1_BAY.pdf,671372.56,671372.56,True,670826.63,670826.63,True,49.0,335.0,335,49,True,True
1,108988-02009227-2566_1_BAY.pdf,2983798.73,2983798.73,True,2984845.09,2984845.09,True,544.0,1480.0,1480,544,True,True
2,108988-02009465-2566_1_BAY.pdf,176964.54,176964.54,True,177472.22,177472.22,True,70.0,217.0,217,70,True,True
3,108988-02010141-2566_1_BAY.pdf,146897.59,146897.59,True,139449.87,139449.87,True,26.0,117.0,117,26,True,True
4,108988-02010379-2566_1_BAY.pdf,41610.00,41610.00,True,41610.08,41610.08,True,4.0,5.0,5,4,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,มีสแกน3.pdf,167765.50,167765.50,True,161066.59,161066.59,True,21.0,118.0,118,21,True,True
292,มีสแกน990.pdf,234281.40,234281.40,True,234281.05,234281.05,True,25.0,48.0,48,25,True,True
293,มีหนังสือรับรองเงินเดือน.pdf,113546.18,113546.18,True,133308.54,133308.54,True,13.0,91.0,91,13,True,True
294,หน้าหายไป1หน้า.pdf,505328.17,441105.69,False,504327.17,449897.17,False,66.0,259.0,231,57,False,False


In [3]:
bad

Unnamed: 0,file,total_debit,sum_debit,debit_amount_match,total_credit,sum_credit,credit_amount_match,total_credit_txns,total_debit_txns,count_debit_tx,count_credit_tx,transaction_count_debit_match,transaction_count_credit_match,diff_debit,diff_credit
294,หน้าหายไป1หน้า.pdf,505328.17,441105.69,False,504327.17,449897.17,False,66.0,259.0,231,57,False,False,-64222.48,-54430.0


In [16]:
failures

[]