In [None]:
# File: extractor.py

import pdfplumber
import pandas as pd
from typing import List, Dict, Tuple, Optional

import config  # Contains all constants, regexes, and bounding boxes


class SCBStatementExtractor:
    """
    SCBStatementExtractor encapsulates the pipeline to extract:
      1. Header data from each page
      2. Transaction rows from table regions on each page

    extract_scb_data() expects a list of pdfplumber.page.Page objects (pages),
    rather than opening the PDF internally, so the caller must keep the PDF
    open for the duration of the call.

    All thresholds, regexes, and bounding boxes are read from the `config` module.
    """

    def __init__(self):
        # No pdf_path or password is stored here, since pages are provided externally
        pass

    # ─── HELPER METHODS ─────────────────────────────────────────────────

    @staticmethod
    def _compute_date_top_coordinates(word_list: List[dict]) -> List[float]:
        """
        Finds all Y-coordinates of words matching the date pattern
        that fall within the configured X-range—used to detect row starts.

        Returns the matching 'top' values sorted ascending.
        """
        return sorted(
            word["top"]
            for word in word_list
            if config.DATE_PATTERN.match(word["text"])
            and config.DATE_X0 <= word["x0"] <= config.DATE_X1
        )

    @staticmethod
    def _compute_row_intervals(date_tops: List[float]) -> List[Tuple[float, float]]:
        """
        Converts each date Y-coordinate into a (start, end) interval on the Y-axis,
        with margins applied, so words can be grouped into logical rows.

        Every row ends where the next one starts. The last row has no successor,
        so its height is taken from the gap to the previous date (or twice the
        margin when there is only a single date).
        """
        intervals: List[Tuple[float, float]] = []
        for index, y_coord in enumerate(date_tops):
            start_y = y_coord - config.Y_MARGIN
            if index + 1 < len(date_tops):
                end_y = date_tops[index + 1] - config.Y_MARGIN
            else:
                previous_gap = (y_coord - date_tops[index - 1]) if index > 0 else config.Y_MARGIN * 2
                end_y = y_coord + previous_gap - config.Y_MARGIN
            intervals.append((start_y, end_y))
        return intervals

    @staticmethod
    def _assign_words_to_rows(word_list: List[dict], row_intervals: List[Tuple[float, float]]) -> List[List[dict]]:
        """
        Given a list of words (each has 'top') and a list of (start, end) Y-intervals,
        returns one list per interval holding the words whose 'top' satisfies
        start <= top < end. A word goes to the first matching interval only;
        words matching no interval are dropped.
        """
        grouped_rows: List[List[dict]] = [[] for _ in row_intervals]
        for word in word_list:
            top_y = word["top"]
            for idx, (start_y, end_y) in enumerate(row_intervals):
                if start_y <= top_y < end_y:
                    grouped_rows[idx].append(word)
                    break
        return grouped_rows

    @staticmethod
    def _contains_any_keyword(text: str, keyword_list: List[str]) -> bool:
        """
        Returns True if any of the keywords is found in the text.
        Keywords are matched literally (regex-escaped) and case-insensitively.
        """
        import re
        return any(re.search(re.escape(keyword), text, re.IGNORECASE) for keyword in keyword_list)

    @staticmethod
    def _group_words_by_row(word_list: List[dict], margin: float) -> Dict[int, List[dict]]:
        """
        Groups words by their integer row index (word['top'] // margin).
        Useful for footer detection.
        """
        rows_by_key: Dict[int, List[dict]] = {}
        for word in word_list:
            key = int(word["top"] // margin)
            rows_by_key.setdefault(key, []).append(word)
        return rows_by_key

    @classmethod
    def _find_footer_top(cls, word_list: List[dict], margin: float) -> Optional[float]:
        """
        Returns the smallest 'top' of any row whose text contains "TOTAL AMOUNTS",
        or None when no such row exists.

        The phrase is searched in the joined text of each row, not in single words:
        word extraction splits on whitespace, so "TOTAL" and "AMOUNTS" arrive as
        two separate words and a per-word test can never see the full phrase.
        """
        footer_tops: List[float] = []
        for words_in_row in cls._group_words_by_row(word_list, margin).values():
            # Join left→right so the phrase reads in visual order
            row_text = " ".join(
                w["text"] for w in sorted(words_in_row, key=lambda w: w["x0"])
            )
            if "TOTAL AMOUNTS" in row_text:
                footer_tops.append(min(w["top"] for w in words_in_row))
        return min(footer_tops) if footer_tops else None

    @staticmethod
    def _clean_dataframes(
        transaction_dataframe: pd.DataFrame,
        header_dataframe: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Standardizes column names, drops unused columns, and cleans strings.

        Header frame: keeps the known columns, renames the *_summary columns to
        debit/credit names, strips newlines from 'address', and replaces missing
        values with "".
        Transaction frame: renames withdrawal/deposit → debit/credit, coerces
        debit/credit/balance to numeric (invalid values become NaN and are left
        as NaN), and adds an empty 'transaction_type' column.

        Empty frames are returned unchanged. The inputs are not modified.
        """
        header_renames = {
            "total_withdrawal_summary": "total_debit",
            "total_deposit_summary": "total_credit",
            "total_withdrawal_transaction_summary": "total_debit_transaction",
            "total_deposit_transaction_summary": "total_credit_transaction",
        }
        header_columns = [
            "page_id",
            "account_name",
            "account_number",
            "period",
            "total_debit",
            "total_credit",
            "total_debit_transaction",
            "total_credit_transaction",
            "address",
        ]

        # Clean header DataFrame
        if not header_dataframe.empty:
            # .copy() on the slice avoids SettingWithCopyWarning on the edits below
            header_dataframe = header_dataframe.rename(columns=header_renames)[header_columns].copy()
            header_dataframe["address"] = (
                header_dataframe["address"]
                .str.replace("\n", "", regex=False)
                .str.strip()
            )
            header_dataframe = header_dataframe.fillna("")

        # Clean transaction DataFrame
        if not transaction_dataframe.empty:
            # Rename withdrawal/deposit → debit/credit (rename returns a new frame)
            transaction_dataframe = transaction_dataframe.rename(
                columns={"withdrawal": "debit", "deposit": "credit"}
            )
            # Convert the amount columns to numeric (float), coercing invalids to NaN
            for amount_column in ("debit", "credit", "balance"):
                transaction_dataframe[amount_column] = pd.to_numeric(
                    transaction_dataframe[amount_column], errors="coerce"
                )
            transaction_dataframe["transaction_type"] = ""  # Placeholder for future logic

        return transaction_dataframe, header_dataframe

    # ─── PER-PAGE / PER-ROW STEPS ───────────────────────────────────────

    def _extract_page_header(self, pdf_page, full_page_text: str, page_identifier: Optional[str]) -> Dict[str, Optional[object]]:
        """
        Builds the header record of one page by cropping every region listed in
        config.HEADER_CROP_REGIONS and reading its text.

        Fields whose name ends in "_summary" are parsed as a float and are only
        filled when the page text contains "TOTAL AMOUNTS (Credit)"; otherwise
        (or when no number is found) they are None. Other fields keep the
        stripped text.
        """
        import re

        # Requires a leading digit so a stray "," cannot match and crash float()
        money_regex = re.compile(r"\d[\d,]*(?:\.\d{2})?")

        header_dict: Dict[str, Optional[object]] = {"page_id": page_identifier}
        has_credit_total = self._contains_any_keyword(
            full_page_text, ["TOTAL AMOUNTS (Credit)"]
        )

        for field_name, bounding_box in config.HEADER_CROP_REGIONS.items():
            cropped_region = pdf_page.crop(bounding_box)
            extracted_text = (cropped_region.extract_text() or "").strip()

            if not field_name.endswith("_summary"):
                header_dict[field_name] = extracted_text
                continue

            # Only capture summary if page actually has credit totals
            money_match = money_regex.search(extracted_text) if has_credit_total else None
            header_dict[field_name] = (
                float(money_match.group().replace(",", "")) if money_match else None
            )

        return header_dict

    @staticmethod
    def _parse_transaction_row(row_words: List[dict], page_identifier: Optional[str]) -> Dict:
        """
        Turns the words of one logical row into a transaction record with keys
        page_id, date, time, code, channel, withdrawal, deposit, balance, description.

        Words are classified by pattern (date / time / money) and by their x
        position against the column splits configured in `config`.
        """
        # Sort words top→bottom, then left→right
        sorted_row = sorted(row_words, key=lambda w: (w["top"], w["x0"]))

        code_limit = config.X_SPLIT_CODE_CHANNEL + config.X_TOLERANCE
        debit_credit_limit = config.X_SPLIT_CHANNEL_DEBIT_CREDIT + config.X_TOLERANCE
        balance_limit = config.X_SPLIT_BALANCE_DESCRIPTION + config.X_TOLERANCE
        withdrawal_limit = config.X_SPLIT_WITHDRAWAL_DEPOSIT + config.X_TOLERANCE

        date_text = ""
        time_text = ""
        code_tokens: List[str] = []
        channel_tokens: List[str] = []
        debit_credit_word_objects: List[dict] = []
        balance_word_objects: List[dict] = []
        description_tokens: List[str] = []

        for word in sorted_row:
            text_token = word["text"]
            x0_coordinate = word["x0"]

            is_date = config.DATE_PATTERN.match(text_token)
            is_time = config.TIME_PATTERN.match(text_token)
            if is_date or is_time:
                # Date/time-looking tokens never reach the text columns; they are
                # only captured when they sit in the expected x position.
                if is_date and config.DATE_X0 <= x0_coordinate <= config.DATE_X1:
                    date_text = text_token
                elif is_time and x0_coordinate > config.DATE_X1:
                    time_text = text_token
                continue

            if config.MONEY_PATTERN.match(text_token):
                # Monetary field: either debit/credit or balance
                if x0_coordinate <= debit_credit_limit:
                    debit_credit_word_objects.append(word)
                elif x0_coordinate <= balance_limit:
                    balance_word_objects.append(word)
                # NOTE(review): money-like tokens right of the balance column are
                # discarded (not added to the description) — confirm this is intended.
                continue

            if x0_coordinate <= code_limit:
                code_tokens.append(text_token)
            elif x0_coordinate <= debit_credit_limit:
                channel_tokens.append(text_token)
            else:
                description_tokens.append(text_token)

        # Convert debit/credit monetary words into numeric values; the right
        # edge (x1) decides between the withdrawal and the deposit column.
        withdrawal_amount = None
        deposit_amount = None
        for money_word in debit_credit_word_objects:
            numeric_value = float(money_word["text"].replace(",", ""))
            if money_word["x1"] <= withdrawal_limit:
                withdrawal_amount = numeric_value
            else:
                deposit_amount = numeric_value

        # The first monetary word in the balance column is the balance
        balance_amount = None
        if balance_word_objects:
            balance_amount = float(balance_word_objects[0]["text"].replace(",", ""))

        return {
            "page_id": page_identifier,
            "date": pd.to_datetime(
                date_text, format="%d/%m/%y", dayfirst=True, errors="coerce"
            ),
            "time": time_text,
            "code": " ".join(code_tokens),
            "channel": " ".join(channel_tokens),
            "withdrawal": withdrawal_amount,
            "deposit": deposit_amount,
            "balance": balance_amount,
            "description": " ".join(description_tokens)
        }

    def _iter_region_transactions(self, region, page_identifier: Optional[str]):
        """
        Yields one transaction record per logical row found in `region`
        (a page or a cropped table area).

        Implemented as a generator so rows parsed before a failure are still
        delivered to the caller.
        """
        word_list = region.extract_words(use_text_flow=True)

        # Footer removal (skip everything under "TOTAL AMOUNTS")
        footer_top = self._find_footer_top(word_list, config.Y_MARGIN)
        if footer_top is not None:
            cutoff_y = footer_top - config.Y_MARGIN
            region_x0, region_top, region_x1, _ = region.bbox
            if cutoff_y <= region_top:
                return  # The footer starts at the top of the region: no rows above it
            # Word 'top' values and region.bbox are page coordinates in pdfplumber,
            # so the crop box is given in absolute terms (not relative=True).
            region = region.crop((region_x0, region_top, region_x1, cutoff_y))
            word_list = region.extract_words(use_text_flow=True)

        date_top_coordinates = self._compute_date_top_coordinates(word_list)
        if not date_top_coordinates:
            return  # No rows found here

        row_intervals = self._compute_row_intervals(date_top_coordinates)
        for single_row in self._assign_words_to_rows(word_list, row_intervals):
            if not single_row:
                continue  # skip empty row

            combined_row_text = " ".join(w["text"] for w in single_row)
            if any(keyword in combined_row_text for keyword in ("TOTAL AMOUNTS", "TOTAL ITEMS")):
                continue  # skip summary/footer rows

            yield self._parse_transaction_row(single_row, page_identifier)

    # ─── MAIN EXTRACTION METHOD ────────────────────────────────────────

    def extract_scb_data(
        self,
        pages: "List[pdfplumber.page.Page]"
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Expects a list of pdfplumber.page.Page objects (e.g., pages = pdf.pages).
        Iterates through each page:
          1. Extracts header fields via predefined bounding boxes.
          2. Finds table regions, splits into words, groups into rows, and parses each row.
        Returns two DataFrames: (transactions_df, headers_df).

        Extraction is best-effort per page: when a page raises, the error is
        printed, whatever was collected from that page so far is kept, and
        processing continues with the next page.
        """
        transaction_records_list: List[Dict] = []
        header_records_list: List[Dict] = []

        for page_index, pdf_page in enumerate(pages):
            try:
                full_page_text = pdf_page.extract_text() or ""
                page_id_match = config.PAGE_ID_PATTERN.search(full_page_text)
                page_identifier = (
                    f"{page_id_match.group(1)}/{page_id_match.group(2)}"
                    if page_id_match
                    else None
                )

                # ─── HEADER EXTRACTION ──────────────────────────────────
                header_records_list.append(
                    self._extract_page_header(pdf_page, full_page_text, page_identifier)
                )

                # ─── TRANSACTION EXTRACTION ─────────────────────────────
                tables_on_page = pdf_page.find_tables(config.TABLE_SETTINGS)
                # Fall back to the whole page when no table is detected
                regions_to_parse = [
                    pdf_page.crop(table.bbox) for table in tables_on_page
                ] if tables_on_page else [pdf_page]

                for region in regions_to_parse:
                    for record in self._iter_region_transactions(region, page_identifier):
                        transaction_records_list.append(record)

            except Exception as extraction_error:
                print(f"[Page {page_index + 1}] Extraction failed: {extraction_error}")

        # After iterating all pages, build DataFrames:
        df_transactions = pd.DataFrame(transaction_records_list)
        df_headers = pd.DataFrame(header_records_list)

        # Clean and standardize before returning
        df_transactions_cleaned, df_headers_cleaned = self._clean_dataframes(df_transactions, df_headers)
        return df_transactions_cleaned, df_headers_cleaned


# ─── USAGE EXAMPLE ─────────────────────────────────────────────────
if __name__ == "__main__":
    # The class defined in this file is SCBStatementExtractor
    # (the previous name, SCBDataExtractor, does not exist and raised NameError).
    extractor = SCBStatementExtractor()
    PDF_PATH = '/Users/if658228/Downloads/OneDrive_1_5-20-2025/agentic_extraction/Dataset04/SCB/no_note/108988-02031584-2566_1_SCB.pdf'
    # ─── KEEP THE PDF OPEN WHILE PROCESSING ──────────────────────────
    # Page objects read from the open file, so extraction must finish
    # inside the `with` block.
    with pdfplumber.open(PDF_PATH, password=config.PASSWORD) as pdf:
        pages = pdf.pages
        df_transactions, df_headers = extractor.extract_scb_data(pages)

    # Now pdf is closed, but the DataFrames already hold everything extracted.
    print(df_headers.head())
    print(df_transactions.head())



  page_id               account_name account_number                   period  \
0          นางสาว มลพิทักษ์ สมรฤทธิ์                 01/07/2023 - 31/07/2023   
1          นางสาว มลพิทักษ์ สมรฤทธิ์                 01/07/2023 - 31/07/2023   
2          นางสาว มลพิทักษ์ สมรฤทธิ์                 01/07/2023 - 31/07/2023   
3          นางสาว มลพิทักษ์ สมรฤทธิ์                 01/07/2023 - 31/07/2023   
4          นางสาว มลพิทักษ์ สมรฤทธิ์                 01/07/2023 - 31/07/2023   

  total_debit total_credit total_debit_transaction total_credit_transaction  \
0                                                                             
1                                                                             
2                                                                             
3                                                                             
4   120930.07    119125.98                    77.0                     28.0   

                                            

  header_dataframe.fillna("", inplace=True)


In [None]:
# File: extractor.py

import pdfplumber
import pandas as pd
from typing import List, Dict, Tuple, Optional

import config  # Contains all constants, regexes, and bounding boxes


class SCBStatementExtractor:
    """
    SCBStatementExtractor encapsulates the pipeline to extract:
      1. Header data from each page
      2. Transaction rows from table regions on each page

    All helper methods and the main extraction logic are static.
    Only `run()` is an instance method that opens the PDF for you.
    """

    def __init__(self):
        # No instance‐level state is needed beyond run()
        pass

    # ─── HELPER METHODS ─────────────────────────────────────────────────

    @staticmethod
    def _compute_date_top_coordinates(word_list: List[dict]) -> List[float]:
        """
        Collects the 'top' value of every word that looks like a date and whose
        left edge lies inside [config.DATE_X0, config.DATE_X1].

        The result is sorted ascending and marks where each table row starts.
        """
        matching_tops: List[float] = []
        for candidate in word_list:
            # Pattern first, position second
            if not config.DATE_PATTERN.match(candidate["text"]):
                continue
            if config.DATE_X0 <= candidate["x0"] <= config.DATE_X1:
                matching_tops.append(candidate["top"])
        matching_tops.sort()
        return matching_tops

    @staticmethod
    def _compute_row_intervals(date_tops: List[float]) -> List[Tuple[float, float]]:
        """
        Builds one (start, end) Y-interval per date coordinate.

        Each interval starts config.Y_MARGIN above its date. It ends where the
        next interval starts; the final interval reuses the gap to the previous
        date as its height, or twice the margin when it is the only row.
        """
        row_spans: List[Tuple[float, float]] = []
        last_position = len(date_tops) - 1
        for position, row_top in enumerate(date_tops):
            if position < last_position:
                # A following row exists: stop right where it begins
                row_bottom = date_tops[position + 1] - config.Y_MARGIN
            elif position > 0:
                # Last of several rows: assume the same height as the gap above
                row_bottom = row_top + (row_top - date_tops[position - 1]) - config.Y_MARGIN
            else:
                # Single row: fall back to a height of two margins
                row_bottom = row_top + config.Y_MARGIN * 2 - config.Y_MARGIN
            row_spans.append((row_top - config.Y_MARGIN, row_bottom))
        return row_spans

    @staticmethod
    def _assign_words_to_rows(
        word_list: List[dict], row_intervals: List[Tuple[float, float]]
    ) -> List[List[dict]]:
        """
        Given a list of words (each has 'top' and 'x0'), and a list of (start, end) Y-intervals,
        returns a list of lists, where each sublist contains all words whose 'top' falls in that interval.
        """
        grouped_rows: List[List[dict]] = [[] for _ in row_intervals]
        for word in word_list:
            top_y = word["top"]
            for idx, (start_y, end_y) in enumerate(row_intervals):
                if start_y <= top_y < end_y:
                    grouped_rows[idx].append(word)
                    break
        return grouped_rows

    @staticmethod
    def _contains_any_keyword(text: str, keyword_list: List[str]) -> bool:
        """
        Returns True if any of the case-insensitive keywords is found in the text.
        """
        import re

        return any(
            re.search(re.escape(keyword), text, re.IGNORECASE)
            for keyword in keyword_list
        )

    @staticmethod
    def _group_words_by_row(word_list: List[dict], margin: float) -> Dict[int, List[dict]]:
        """
        Groups words by their integer row index (word['top'] // margin).
        Useful for footer detection.
        """
        rows_by_key: Dict[int, List[dict]] = {}
        for word in word_list:
            key = int(word["top"] // margin)
            rows_by_key.setdefault(key, []).append(word)
        return rows_by_key

    @staticmethod
    def _clean_dataframes(
        transaction_dataframe: pd.DataFrame, header_dataframe: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Standardizes column names, drops unused columns, cleans strings, and ensures no NaNs.
        ALWAYS uses .copy() when slicing to avoid SettingWithCopyWarning.
        """
        # ─── Clean header DataFrame ─────────────────────────────────
        if not header_dataframe.empty:
            header_dataframe = header_dataframe[
                [
                    "page_id",
                    "account_name",
                    "address",
                    "account_number",
                    "period",
                    "total_withdrawal_summary",
                    "total_deposit_summary",
                    "total_withdrawal_transaction_summary",
                    "total_deposit_transaction_summary",
                ]
            ].copy()

            header_dataframe["address"] = (
                header_dataframe["address"]
                .str.replace("\n", "", regex=False)
                .str.strip()
            )

            header_dataframe = header_dataframe.rename(
                columns={
                    "total_withdrawal_summary": "total_debit",
                    "total_deposit_summary": "total_credit",
                    "total_withdrawal_transaction_summary": "total_debit_transaction",
                    "total_deposit_transaction_summary": "total_credit_transaction",
                }
            )

            header_dataframe.fillna("", inplace=True)

            header_dataframe = header_dataframe[
                [
                    "page_id",
                    "account_name",
                    "account_number",
                    "period",
                    "total_debit",
                    "total_credit",
                    "total_debit_transaction",
                    "total_credit_transaction",
                    "address",
                ]
            ].copy()

        # ─── Clean transaction DataFrame ─────────────────────────────
        if not transaction_dataframe.empty:
            # Rename withdrawal/deposit → debit/credit
            transaction_dataframe = transaction_dataframe.rename(
                columns={"withdrawal": "debit", "deposit": "credit"}
            )

            # Convert debit/credit to numeric (float), coercing invalids to NaN
            transaction_dataframe["debit"] = pd.to_numeric(
                transaction_dataframe["debit"], errors="coerce"
            )
            transaction_dataframe["credit"] = pd.to_numeric(
                transaction_dataframe["credit"], errors="coerce"
            )
            # Convert balance to numeric (float), coercing invalids to NaN
            transaction_dataframe["balance"] = pd.to_numeric(
                transaction_dataframe["balance"], errors="coerce"
            )

            # Placeholder column (can be filled later if needed)
            transaction_dataframe["transaction_type"] = ""

        return transaction_dataframe, header_dataframe

    # ─── MAIN EXTRACTION LOGIC (STATIC) ─────────────────────────────────

    @staticmethod
    def extract_scb_data(
        pages: List[pdfplumber.page.Page]
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Expects a list of pdfplumber.page.Page objects (e.g., pages = pdf.pages).
        Iterates through each page:
          1. Extracts header fields via predefined bounding boxes.
          2. Finds table regions, splits into words, groups into rows, and parses each row.
        Returns two DataFrames: (transactions_df, headers_df).
        """
        transaction_records_list: List[Dict] = []
        header_records_list: List[Dict] = []

        for page_index, pdf_page in enumerate(pages):
            try:
                full_page_text = pdf_page.extract_text() or ""
                page_id_match = config.PAGE_ID_PATTERN.search(full_page_text)
                page_identifier = (
                    f"{page_id_match.group(1)}/{page_id_match.group(2)}"
                    if page_id_match
                    else None
                )

                # ─── HEADER EXTRACTION ──────────────────────────────────
                header_dict: Dict[str, Optional[object]] = {"page_id": page_identifier}
                has_credit_total = SCBStatementExtractor._contains_any_keyword(
                    full_page_text, ["TOTAL AMOUNTS (Credit)"]
                )

                for field_name, bounding_box in config.HEADER_CROP_REGIONS.items():
                    cropped_region = pdf_page.crop(bounding_box)
                    extracted_text = (cropped_region.extract_text() or "").strip()

                    if field_name.endswith("_summary"):
                        # Only capture summary if page actually has credit totals
                        if has_credit_total:
                            import re

                            money_match = re.search(
                                r"[\d,]+(?:\.\d{2})?", extracted_text
                            )
                            if money_match:
                                header_dict[field_name] = float(
                                    money_match.group().replace(",", "")
                                )
                            else:
                                header_dict[field_name] = None
                        else:
                            header_dict[field_name] = None
                    else:
                        header_dict[field_name] = extracted_text

                header_records_list.append(header_dict)

                # ─── TRANSACTION EXTRACTION ─────────────────────────────
                tables_on_page = pdf_page.find_tables(config.TABLE_SETTINGS)
                if tables_on_page:
                    regions_to_parse = [pdf_page.crop(t.bbox) for t in tables_on_page]
                else:
                    regions_to_parse = [pdf_page]

                for region in regions_to_parse:
                    word_list = region.extract_words(use_text_flow=True)

                    # Attempt footer removal (skip everything under "TOTAL AMOUNTS")
                    footer_y_coordinates = []
                    grouped_rows_for_footer = SCBStatementExtractor._group_words_by_row(
                        word_list, config.Y_MARGIN
                    )
                    for _, words_in_row in grouped_rows_for_footer.items():
                        if any("TOTAL AMOUNTS" in w["text"] for w in words_in_row):
                            minimal_y = min(w["top"] for w in words_in_row)
                            footer_y_coordinates.append(minimal_y)

                    if footer_y_coordinates:
                        cutoff_y = min(footer_y_coordinates) - config.Y_MARGIN
                        region_width = region.bbox[2]  # x1 coordinate = width
                        region = region.crop((0, 0, region_width, cutoff_y), relative=True)
                        word_list = region.extract_words(use_text_flow=True)

                    date_top_coordinates = SCBStatementExtractor._compute_date_top_coordinates(
                        word_list
                    )
                    if not date_top_coordinates:
                        continue  # No rows found here

                    row_intervals = SCBStatementExtractor._compute_row_intervals(
                        date_top_coordinates
                    )
                    rows_of_words = SCBStatementExtractor._assign_words_to_rows(
                        word_list, row_intervals
                    )

                    for single_row in rows_of_words:
                        if not single_row:
                            continue  # skip empty row

                        combined_row_text = " ".join(w["text"] for w in single_row)
                        if any(keyword in combined_row_text for keyword in ("TOTAL AMOUNTS", "TOTAL ITEMS")):
                            continue  # skip summary/footer rows

                        # Sort words top→bottom, then left→right
                        sorted_row = sorted(single_row, key=lambda w: (w["top"], w["x0"]))

                        # Extract date and time tokens
                        date_text = ""
                        time_text = ""
                        for word in sorted_row:
                            text_token = word["text"]
                            x0_coordinate = word["x0"]
                            if config.DATE_PATTERN.match(text_token) and config.DATE_X0 <= x0_coordinate <= config.DATE_X1:
                                date_text = text_token
                            elif config.TIME_PATTERN.match(text_token) and x0_coordinate > config.DATE_X1:
                                time_text = text_token

                        # Containers for code, channel, money words, balance, description
                        code_tokens: List[str] = []
                        channel_tokens: List[str] = []
                        debit_credit_word_objects: List[dict] = []
                        balance_word_objects: List[dict] = []
                        description_tokens: List[str] = []

                        # Populate containers by inspecting each word
                        for word in sorted_row:
                            text_token = word["text"]
                            x0_coordinate = word["x0"]

                            # Skip date/time tokens once captured
                            if config.DATE_PATTERN.match(text_token) or config.TIME_PATTERN.match(text_token):
                                continue

                            if config.MONEY_PATTERN.match(text_token):
                                # Monetary field: either debit/credit or balance
                                if x0_coordinate <= config.X_SPLIT_CHANNEL_DEBIT_CREDIT + config.X_TOLERANCE:
                                    debit_credit_word_objects.append(word)
                                elif x0_coordinate <= config.X_SPLIT_BALANCE_DESCRIPTION + config.X_TOLERANCE:
                                    balance_word_objects.append(word)
                                continue

                            if x0_coordinate <= config.X_SPLIT_CODE_CHANNEL + config.X_TOLERANCE:
                                code_tokens.append(text_token)
                            elif x0_coordinate <= config.X_SPLIT_CHANNEL_DEBIT_CREDIT + config.X_TOLERANCE:
                                channel_tokens.append(text_token)
                            else:
                                description_tokens.append(text_token)

                        # Convert debit/credit monetary words into numeric values
                        withdrawal_amount = None
                        deposit_amount = None
                        for money_word in debit_credit_word_objects:
                            numeric_value = float(money_word["text"].replace(",", ""))
                            if money_word["x1"] <= config.X_SPLIT_WITHDRAWAL_DEPOSIT + config.X_TOLERANCE:
                                withdrawal_amount = numeric_value
                            else:
                                deposit_amount = numeric_value

                        # Convert balance monetary words into a single float
                        balance_amount = None
                        for money_word in balance_word_objects:
                            if config.MONEY_PATTERN.match(money_word["text"]):
                                balance_amount = float(money_word["text"].replace(",", ""))
                                break

                        transaction_records_list.append(
                            {
                                "page_id": page_identifier,
                                "date": pd.to_datetime(
                                    date_text,
                                    format="%d/%m/%y",
                                    dayfirst=True,
                                    errors="coerce",
                                ),
                                "time": time_text,
                                "code": " ".join(code_tokens),
                                "channel": " ".join(channel_tokens),
                                "withdrawal": withdrawal_amount,
                                "deposit": deposit_amount,
                                "balance": balance_amount,
                                "description": " ".join(description_tokens),
                            }
                        )

            except Exception as extraction_error:
                # Print a warning but continue with the next page
                print(f"[Page {page_index + 1}] Extraction failed: {extraction_error}")

        # After iterating through all pages, build DataFrames:
        df_transactions = pd.DataFrame(transaction_records_list)
        df_headers = pd.DataFrame(header_records_list)

        # Clean and standardize before returning
        df_transactions_cleaned, df_headers_cleaned = SCBStatementExtractor._clean_dataframes(
            df_transactions, df_headers
        )
        return df_transactions_cleaned, df_headers_cleaned

    # ─── RUN METHOD (INSTANCE) ────────────────────────────────────────

    def run(
        self, pdf_path: str, password: Optional[str] = None
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Convenience entry point: open a statement PDF and extract its data.

        Args:
          pdf_path: path handed to pdfplumber.open().
          password: forwarded to pdfplumber.open(); defaults to None.

        Returns:
          Whatever extract_scb_data() returns for the PDF's pages, i.e.
          (transactions_df, headers_df).
        """
        with pdfplumber.open(pdf_path, password=password) as document:
            # Extraction runs inside the `with` block, i.e. while the
            # document is still open; the context manager closes it after.
            extracted = SCBStatementExtractor.extract_scb_data(document.pages)
        return extracted


# ─── USAGE EXAMPLE ─────────────────────────────────────────────────

if __name__ == "__main__":
    # Demo run against a single statement. `df_transactions` and `df_headers`
    # are deliberately left as module-level names for interactive inspection.
    PDF_PATH = "/Users/if658228/Downloads/OneDrive_1_5-20-2025/agentic_extraction/Dataset04/SCB/no_note/108988-02031584-2566_1_SCB.pdf"
    extractor = SCBStatementExtractor()
    df_transactions, df_headers = extractor.run(PDF_PATH, password=config.PASSWORD)

    # Show a label line followed by the first rows of each frame
    print("Headers:", df_headers.head(), sep="\n")
    print("\nTransactions:", df_transactions.head(), sep="\n")


In [2]:
df_transactions.head(50)

Unnamed: 0,page_id,date,time,code,channel,debit,credit,balance,description,transaction_type
0,1/7,2023-05-01,07:53,X1,ENET,,23500.0,23500.54,PromptPay x0990 นางสาว ปราณี ดานขุนทด,
1,1/7,2023-05-01,08:00,X2,ENET,3000.0,,20500.54,โอนไป BAY x7259 นาย วีระพงษ์ สมสี,
2,1/7,2023-05-01,11:40,X2,ENET,20000.0,,500.54,โอนไป BAY x7259 นาย วีระพงษ์ สมสี,
3,1/7,2023-05-01,18:32,X1,ENET,,2200.0,2700.54,PromptPay x2581 นาย สุทัศน์ กลิ่นสวัสดิ์,
4,1/7,2023-05-01,18:34,X2,ENET,2500.0,,200.54,PromptPay x7511 นางสาว ปราณี ดานขุนทด,
5,1/7,2023-05-02,06:40,X1,ENET,,200.0,400.54,กสิกรไทย (KBANK) /X149339,
6,1/7,2023-05-02,09:54,X2,ENET,400.0,,0.54,PromptPay x7511 นางสาว ปราณี ดานขุนทด,
7,1/7,2023-05-02,11:20,C1,CDM,,15000.0,15000.54,3 IN 1 EASYBANK THANON MU,
8,1/7,2023-05-02,11:21,X2,ENET,15000.0,,0.54,PromptPay x7511 นางสาว ปราณี ดานขุนทด,
9,1/7,2023-05-03,18:30,X1,ENET,,700.0,700.54,PromptPay x0990 นางสาว ปราณี ดานขุนทด,


In [8]:
df_headers

Unnamed: 0,page_id,account_name,account_number,period,total_debit,total_credit,total_debit_transaction,total_credit_transaction,address
0,,นางสาว มลพิทักษ์ สมรฤทธิ์,,01/07/2023 - 31/07/2023,,,,,92/1 มบ.บ้านไทยสุรินทร์ ม.3 ต.เชิงทะเล อ.ถลาจ....
1,,นางสาว มลพิทักษ์ สมรฤทธิ์,,01/07/2023 - 31/07/2023,,,,,92/1 มบ.บ้านไทยสุรินทร์ ม.3 ต.เชิงทะเล อ.ถลาจ....
2,,นางสาว มลพิทักษ์ สมรฤทธิ์,,01/07/2023 - 31/07/2023,,,,,92/1 มบ.บ้านไทยสุรินทร์ ม.3 ต.เชิงทะเล อ.ถลาจ....
3,,นางสาว มลพิทักษ์ สมรฤทธิ์,,01/07/2023 - 31/07/2023,,,,,92/1 มบ.บ้านไทยสุรินทร์ ม.3 ต.เชิงทะเล อ.ถลาจ....
4,,นางสาว มลพิทักษ์ สมรฤทธิ์,,01/07/2023 - 31/07/2023,120930.07,119125.98,77.0,28.0,92/1 มบ.บ้านไทยสุรินทร์ ม.3 ต.เชิงทะเล อ.ถลาจ....
5,,นางสาว มลพิทักษ์ สมรฤทธิ์,,01/05/2023 - 31/05/2023,,,,,92/1 มบ.บ้านไทยสุรินทร์ ม.3 ต.เชิงทะเล อ.ถลาจ....
6,,นางสาว มลพิทักษ์ สมรฤทธิ์,,01/05/2023 - 31/05/2023,,,,,92/1 มบ.บ้านไทยสุรินทร์ ม.3 ต.เชิงทะเล อ.ถลาจ....
7,,นางสาว มลพิทักษ์ สมรฤทธิ์,,01/05/2023 - 31/05/2023,,,,,92/1 มบ.บ้านไทยสุรินทร์ ม.3 ต.เชิงทะเล อ.ถลาจ....
8,,นางสาว มลพิทักษ์ สมรฤทธิ์,,01/05/2023 - 31/05/2023,,,,,92/1 มบ.บ้านไทยสุรินทร์ ม.3 ต.เชิงทะเล อ.ถลาจ....
9,,นางสาว มลพิทักษ์ สมรฤทธิ์,,01/05/2023 - 31/05/2023,,,,,92/1 มบ.บ้านไทยสุรินทร์ ม.3 ต.เชิงทะเล อ.ถลาจ....


In [1]:
#!/usr/bin/env python3
import os, traceback
import numpy as np
import pandas as pd
from typing import Tuple, List, Dict, Any, Optional
import pdfplumber
from scb_without_note_extractor import SCBStatementExtractor
# — your existing imports & extraction functions here —
# from your_module import extract_header, extract_transactions

# Folder scanned for *.pdf statements when this cell is run as a script.
INPUT_FOLDER = "/Users/if658228/Downloads/OneDrive_1_5-20-2025/agentic_extraction/Dataset04/SCB/no_note"
# Password used when opening each PDF; None is passed through as-is.
PASSWORD: Optional[str] = None

def process_folder(input_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame, List[dict]]:
    """
    Run the SCB extractor over every PDF found directly in `input_folder`.

    Files are matched by a case-insensitive ".pdf" suffix and visited in
    sorted name order so the concatenated output is reproducible. Each file
    is extracted inside its own try/except: a failing file is recorded in
    `failures` and the remaining files are still processed.

    Args:
      input_folder: directory to scan (not recursive).

    Returns:
      (all_transactions_df, all_headers_df, failures)
        - both DataFrames get a "source_file" column holding the PDF file name
        - either DataFrame is empty when no file produced output
        - failures is a list of {"file", "error", "traceback"} dicts
    """
    tx_list, hdr_list, failures = [], [], []
    for fn in sorted(os.listdir(input_folder)):
        if not fn.lower().endswith(".pdf"):
            continue
        path = os.path.join(input_folder, fn)
        try:
            # run() opens and closes the PDF itself, so the file is not opened
            # a second time here. It is called on an instance so that
            # `pdf_path` and `password` bind to the intended parameters
            # (this also works if run() is a staticmethod).
            df_tx, df_hdr = SCBStatementExtractor().run(path, PASSWORD)
            df_hdr["source_file"] = fn
            df_tx["source_file"] = fn
            tx_list.append(df_tx)
            hdr_list.append(df_hdr)
        except Exception as e:
            # Best-effort batch: keep going, but retain enough detail to debug
            failures.append({
                "file": fn,
                "error": str(e),
                "traceback": traceback.format_exc()
            })
    all_tx = pd.concat(tx_list, ignore_index=True) if tx_list else pd.DataFrame()
    all_hdr = pd.concat(hdr_list, ignore_index=True) if hdr_list else pd.DataFrame()
    return all_tx, all_hdr, failures

def validate_bbl(df_tx_all: pd.DataFrame, df_hdr_all: pd.DataFrame) -> pd.DataFrame:
    """
    Cross-check extracted transactions against the totals found in the headers.

    Despite the "bbl" in its name, the function only relies on the column
    names listed below.

    Args:
      df_tx_all: transactions with columns "source_file", "debit", "credit".
      df_hdr_all: header rows with columns "source_file", "total_debit",
        "total_credit", "total_debit_transaction", "total_credit_transaction".
        Values may be numbers, blank strings, or comma-formatted strings.

    Neither input frame is modified; both are copied before coercion.

    Returns:
      One row per file that has at least one header row with both transaction
      counts present. For each such file the header totals are summed and
      compared with the per-file sum/count of the debit and credit columns:
        - *_amount_match: np.isclose(..., atol=1e-2); a NaN on either side
          yields False
        - transaction_count_*_match: exact equality of counts
    """
    # STEP 1: ensure tx columns are numeric (blank strings become NaN)
    tx_clean = df_tx_all.copy()
    tx_clean[['debit', 'credit']] = (
        tx_clean[['debit', 'credit']]
        .replace('', np.nan)
        .astype(float)
    )

    # STEP 1.5: ensure header columns are numeric, on a copy so the caller's
    # frame keeps its original values. Blank strings and anything else that
    # is not a number are turned into NaN by errors='coerce', so no separate
    # replace('' -> NaN) pass is needed.
    hdr_clean = df_hdr_all.copy()
    for col in ('total_debit_transaction',
                'total_debit',
                'total_credit_transaction',
                'total_credit'):
        hdr_clean[col] = pd.to_numeric(
            hdr_clean[col].astype(str).str.replace(',', '', regex=False),
            errors='coerce'
        )

    # Per-file amount totals; min_count=1 keeps an all-NaN column as NaN
    # instead of reporting 0
    sums = (
        tx_clean
        .groupby('source_file')[['debit', 'credit']]
        .sum(min_count=1)
        .rename(columns={'debit': 'sum_debit', 'credit': 'sum_credit'})
    )

    # Per-file number of non-NaN debit / credit entries
    counts = (
        tx_clean
        .groupby('source_file')[['debit', 'credit']]
        .count()
        .rename(columns={'debit': 'count_debit_tx', 'credit': 'count_credit_tx'})
    )

    # Only header rows carrying both transaction counts contribute
    hdr = (
        hdr_clean
        .dropna(subset=['total_debit_transaction', 'total_credit_transaction'])
        .groupby('source_file')
        .agg({
            'total_debit_transaction': 'sum',
            'total_debit': 'sum',
            'total_credit_transaction': 'sum',
            'total_credit': 'sum'
        })
    ).rename(columns={
        'total_debit_transaction': 'total_debit_txns',
        'total_debit': 'total_debit',
        'total_credit_transaction': 'total_credit_txns',
        'total_credit': 'total_credit'
    })

    # Left joins: the header side decides which files appear in the report
    cmp = (
        hdr
        .join(sums, how='left')
        .join(counts, how='left')
        .reset_index()
        .rename(columns={'source_file': 'file'})
    )

    summary = cmp.assign(
        debit_amount_match=lambda d: np.isclose(d['total_debit'], d['sum_debit'], atol=1e-2),
        credit_amount_match=lambda d: np.isclose(d['total_credit'], d['sum_credit'], atol=1e-2),
        transaction_count_debit_match=lambda d: d['total_debit_txns'] == d['count_debit_tx'],
        transaction_count_credit_match=lambda d: d['total_credit_txns'] == d['count_credit_tx']
    )[
        ['file',
         'total_debit', 'sum_debit', 'debit_amount_match',
         'total_credit', 'sum_credit', 'credit_amount_match',
         'total_credit_txns', 'total_debit_txns', 'count_debit_tx', 'count_credit_tx',
         'transaction_count_debit_match', 'transaction_count_credit_match']
    ]

    return summary


if __name__ == "__main__":
    # Batch-validate every PDF in INPUT_FOLDER. The names assigned here
    # (df_tx_all, df_hdr_all, failures, summary, bad) are intentionally left
    # at module level so they can be inspected after the run.
    df_tx_all, df_hdr_all, failures = process_folder(INPUT_FOLDER)
    print(f"Processed transactions: {df_tx_all.shape}, headers: {df_hdr_all.shape}")
    if failures:
        print(f"\n⚠️ {len(failures)} failures; inspect `failures` list.")
    if df_tx_all.empty or df_hdr_all.empty:
        print("No data to validate; exiting.")
        # `exit` is a helper injected by the `site` module for interactive
        # sessions; raising SystemExit works in every run mode.
        raise SystemExit(1)
    print(df_tx_all,df_hdr_all)
    summary = validate_bbl(df_tx_all, df_hdr_all)
    print("\n--- Validation Summary per File ---")
    print(summary.to_string(index=False))

    # A file is "bad" as soon as any one of the four checks fails
    bad = summary.loc[~(summary.debit_amount_match
                        & summary.credit_amount_match
                        & summary.transaction_count_debit_match
                        & summary.transaction_count_credit_match)]
    if not bad.empty:
        # Positive diff: extracted rows add up to more than the header total
        bad = bad.assign(
            diff_debit = bad['sum_debit'] - bad['total_debit'],
            diff_credit    = bad['sum_credit']    - bad['total_credit']
        )
        print("\n❌ Files with mismatches:")
        print(bad.to_string(index=False))
    else:
        print("\n✅ All files validated successfully!")


  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", in

[Page 9] Extraction failed: Bounding box (160, 780.0, 280.1, 795.6) is not fully within parent page bounding box (0.0, 0.0, 612.0, 792.0)
[Page 10] Extraction failed: Bounding box (160, 780.0, 280.1, 795.6) is not fully within parent page bounding box (0.0, 0.0, 612.0, 792.0)
[Page 11] Extraction failed: Bounding box (160, 780.0, 280.1, 795.6) is not fully within parent page bounding box (0.0, 0.0, 612.0, 792.0)


  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", in

[Page 29] Extraction failed: Bounding box (415, 95.9, 550.3, 100.9) is not fully within parent page bounding box (0.0, 0.0, 539.0, 720.0)
[Page 30] Extraction failed: Bounding box (415, 95.9, 550.3, 100.9) is not fully within parent page bounding box (0.0, 0.0, 539.0, 720.0)
[Page 31] Extraction failed: Bounding box (415, 95.9, 550.3, 100.9) is not fully within parent page bounding box (0.0, 0.0, 539.0, 720.0)
[Page 32] Extraction failed: Bounding box (415, 95.9, 550.3, 100.9) is not fully within parent page bounding box (0.0, 0.0, 539.0, 720.0)
[Page 33] Extraction failed: Bounding box (415, 95.9, 550.3, 100.9) is not fully within parent page bounding box (0.0, 0.0, 539.0, 720.0)
[Page 34] Extraction failed: Bounding box (415, 95.9, 550.3, 100.9) is not fully within parent page bounding box (0.0, 0.0, 539.0, 720.0)
[Page 35] Extraction failed: Bounding box (415, 95.9, 550.3, 100.9) is not fully within parent page bounding box (0.0, 0.0, 539.0, 720.0)
[Page 36] Extraction failed: Bound

  header_dataframe.fillna("", inplace=True)
  header_dataframe.fillna("", inplace=True)


Processed transactions: (61011, 11), headers: (2853, 10)
      page_id       date   time code channel    debit   credit   balance  \
0         1/2 2023-07-01  19:42   X2    ENET   1000.0      NaN      0.06   
1         1/2 2023-07-07  23:15   X1    BCMS      NaN    500.0    500.06   
2         1/2 2023-07-07  23:22   X2    ENET    370.0      NaN    130.06   
3         1/2 2023-07-08  19:59   X2    ENET    110.0      NaN     20.06   
4         1/2 2023-07-14  03:32   X2    ENET     20.0      NaN      0.06   
...       ...        ...    ...  ...     ...      ...      ...       ...   
61006     7/7 2023-07-31  16:09   X2    ENET   8990.8      NaN  16625.40   
61007     7/7 2023-07-31  17:23   X2    ENET  13677.0      NaN   2948.40   
61008     7/7 2023-07-31  18:06   X1    ENET      NaN   1000.0   3948.40   
61009     7/7 2023-07-31  19:43   C1    TELL      NaN  27625.0  31573.40   
61010     7/7 2023-07-31  20:11   X2    ENET  15803.0      NaN  15770.40   

                              

  header_dataframe.fillna("", inplace=True)
  df_hdr_all[col].replace('', np.nan).astype(str).str.replace(',', ''),
  df_hdr_all[col].replace('', np.nan).astype(str).str.replace(',', ''),
  df_hdr_all[col].replace('', np.nan).astype(str).str.replace(',', ''),
  df_hdr_all[col].replace('', np.nan).astype(str).str.replace(',', ''),


In [2]:
summary

Unnamed: 0,file,total_debit,sum_debit,debit_amount_match,total_credit,sum_credit,credit_amount_match,total_credit_txns,total_debit_txns,count_debit_tx,count_credit_tx,transaction_count_debit_match,transaction_count_credit_match
0,108988-02008969-2566_1_SCB.pdf,131211.53,131211.53,True,138262.69,138262.69,True,19.0,136.0,136,19,True,True
1,108988-02009288-2566_1_SCB.pdf,129724.23,129724.23,True,121901.01,121901.01,True,18.0,124.0,124,18,True,True
2,108988-02009356-2566_1_SCB.pdf,72264.57,72264.57,True,71525.03,71525.03,True,29.0,121.0,121,29,True,True
3,108988-02009511-2566_1_SCB.pdf,155161.69,155161.69,True,140908.77,140908.77,True,76.0,368.0,368,76,True,True
4,108988-02009606-2566_1_SCB.pdf,345698.60,345698.60,True,340892.61,340892.61,True,27.0,327.0,327,27,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
172,108988-02031627-2566_2_SCB.pdf,223350.92,223350.92,True,222782.92,222782.92,True,6.0,22.0,22,6,True,True
173,108988-02031646-2566_1_SCB.pdf,97955.00,97955.00,True,103948.54,103948.54,True,34.0,206.0,206,34,True,True
174,108988-02031653-2566_1_SCB.pdf,296613.07,296613.07,True,286218.70,286218.70,True,64.0,232.0,232,64,True,True
175,มีรูปถ่าย.pdf,1125098.34,1125098.34,True,1125114.05,1125114.05,True,165.0,456.0,456,165,True,True


In [3]:
bad

Unnamed: 0,file,total_debit,sum_debit,debit_amount_match,total_credit,sum_credit,credit_amount_match,total_credit_txns,total_debit_txns,count_debit_tx,count_credit_tx,transaction_count_debit_match,transaction_count_credit_match,diff_debit,diff_credit
176,หน้าเบิ้ล.pdf,333380.69,345450.79,False,322472.44,322472.44,True,80.0,290.0,315,80,False,True,12070.1,0.0
