In [1]:
# extractor.py

import re  # Enables regex operations used throughout extraction and cleaning
from typing import Optional, List, Dict, Any  # Provides type hints, helpful for readability and debugging
import pandas as pd  # Data manipulation library—used to build and clean DataFrames
import pdfplumber  # PDF parsing library—extracts text and table‐like structures from PDF pages
import dateutil.parser  # Flexible date parser—fallback when pandas date parsing fails

from config import (
    PDF_PATH,  # Path to the PDF file, imported from config.py
    PASSWORD,  # PDF password (or None) from config.py
    TABLE_CROP_BOX,  # Coordinates for cropping transaction table region
    PAGE_ID_CROP,  # Coordinates for cropping page ID region
    CROPS,  # Dict of header field crop coordinates
    DATE_X_LEFT,  # Left boundary for date tokens
    DATE_X_RIGHT,  # Right boundary for date tokens
    SPLIT_X_CODE_CHANNEL,  # X boundary to split “code/channel” from date
    SPLIT_X_CHANNEL_DEBIT_CREDIT,  # X boundary to split channel from debit/credit
    SPLIT_X_WITHDRAWAL_DEPOSIT,  # X boundary to split withdrawal from deposit
    X_TOLERANCE,  # Fuzzy margin for X‐based splits
    Y_MARGIN,  # Vertical margin for grouping words into rows
    FOOTER_KEYWORDS,  # Keywords to identify footer summary lines
    MONEY_PATTERN,  # Regex to match money values
    DATE_PATTERN,  # Regex to match date formats
    TIME_PATTERN,  # Regex to match time formats
    ACCOUNT_NUMBER_PATTERN,  # Regex to match 9–12 digit account numbers
    PERIOD_PATTERN,  # Regex to match statement period range
)

PDF_PATH: str = "/Users/if658228/Downloads/GSB/108988-02364974-2566_1_GSB.pdf"


class PDFStatementExtractor:
    """
    Encapsulates all logic to extract header and transaction data from a PDF bank statement.
    """

    def __init__(self, pdf_path: str = PDF_PATH, password: Optional[str] = PASSWORD):
        self.pdf_path = pdf_path  # Stores the path to the PDF file
        self.password = password  # Stores the password (if any), otherwise None

    def _clean_page_id(self, raw_text: str) -> str:
        """
        Normalize a raw page-ID string (e.g., ' 1 / 10 ') to '1/10'; return empty if pattern fails.
        """
        numeric_parts = re.findall(r"\d+", raw_text)  # Find all groups of digits
        if len(numeric_parts) >= 2:
            candidate = f"{numeric_parts[0]}/{numeric_parts[1]}"  # Format “current/total”
            if re.fullmatch(r"\d+/\d+", candidate):  # Validate correct pattern
                return candidate  # Return normalized page ID
        return ""  # Return empty string if extraction fails

    def _extract_account_number_and_period(self, full_text: str) -> tuple[str, str]:
        """
        Extract account number (9–12 digits) and statement period (“dd/mm/yyyy - dd/mm/yyyy”) 
        from the entire page text using regex patterns.
        """
        account_match = ACCOUNT_NUMBER_PATTERN.search(full_text)  # Search for account number
        period_match = PERIOD_PATTERN.search(full_text)  # Search for period range
        account_number = account_match.group() if account_match else ""  # Get number or blank
        period_as_string = period_match.group() if period_match else ""  # Get period or blank
        return account_number, period_as_string  # Return tuple of strings

    def _convert_be_to_ad(self, date_string: str) -> str:
        """
        Convert a date from Buddhist Era (BE) (year > 2400) to Gregorian (AD), 
        e.g., “01/01/2567” → “01/01/2024”. If no BE year, return unchanged.
        """
        match = re.match(r"^(\d{2})/(\d{2})/(\d{4})$", date_string)  # Match dd/mm/yyyy
        if match:
            day, month, year_str = match.groups()
            year = int(year_str)
            if year > 2400:  # If it’s likely BE, subtract 543 to get AD
                year -= 543
            return f"{day}/{month}/{year}"  # Return converted date string
        return date_string  # Return original if not matching pattern

    def _find_date_in_text(self, text: str) -> str:
        """
        If the text begins with a valid date (dd/mm/yy or dd/mm/yyyy), return that date; 
        otherwise return an empty string.
        """
        match = re.match(r"^(\d{2}/\d{2}/(\d{4}|\d{4}))", text)  # Look for date at start
        return match.group(1) if match else ""  # Return matched date or blank

    def _find_time_after_date_word(
        self, date_word: dict[str, Any], all_words: List[dict[str, Any]]
    ) -> str:
        """
        Given the date word dictionary and all words on the page, 
        find a time (hh:mm) token that appears within 20 points below the date.
        """
        date_top = date_word.get("top", 0) if date_word else 0  # Vertical position of date
        for word in all_words:
            if TIME_PATTERN.match(word["text"]) and 0 < (word["top"] - date_top) <= 20:
                return word["text"]  # Return first matching time token
        return ""  # Return blank if no time found

    def _extract_page_id(self, page: pdfplumber.page.Page) -> str:
        """
        Crop the page’s designated page‐ID area and normalize it using _clean_page_id().
        """
        raw_crop_text = page.crop(PAGE_ID_CROP).extract_text() or ""  # Extract raw text from that region
        return self._clean_page_id(raw_crop_text.strip())  # Clean and normalize

    def extract_transactions_from_pages(self, pages: List[pdfplumber.page.Page]) -> List[Dict[str, Any]]:
        """
        For each page, crop to the transaction‐table region, group words into rows based on date positions,
        and assemble structured transaction records.
        """
        transaction_records: List[Dict[str, Any]] = []  # Initialize list to hold all parsed rows

        for page_index, page in enumerate(pages, start=1):
            try:
                page_id = self._extract_page_id(page)  # Extract page ID for this page
                table_region = page.crop(TABLE_CROP_BOX)  # Crop the region where transactions sit
                all_words = table_region.extract_words(use_text_flow=False)  
                # Get individual word dictionaries (x0, top, text, etc.)

                # Identify vertical positions (“top”) for each word that starts with a date
                date_tops = sorted(
                    w["top"] for w in all_words if self._find_date_in_text(w["text"])
                )
                if not date_tops:
                    continue  # No dates → likely no transactions on this page

                # Build intervals to group words into rows
                row_intervals: List[tuple[float, float]] = []
                for idx, top_y in enumerate(date_tops):
                    start_y = top_y - Y_MARGIN  # Slightly above the date top
                    end_y = (
                        date_tops[idx + 1] - Y_MARGIN if idx + 1 < len(date_tops) else top_y + 15
                    )
                    row_intervals.append((start_y, end_y))  # Append the interval

                # Create an empty list for each interval to hold words for that row
                rows: List[List[dict[str, Any]]] = [[] for _ in row_intervals]
                for word in all_words:
                    for row_idx, (start_y, end_y) in enumerate(row_intervals):
                        if start_y <= word["top"] < end_y:
                            rows[row_idx].append(word)  # Assign word to its row group
                            break

                for row_words in rows:
                    if not row_words:
                        continue  # Skip empty row groups

                    # Sort by vertical position then horizontal (left-to-right)
                    sorted_row = sorted(row_words, key=lambda w: (w["top"], w["x0"]))
                    first_word = next((w for w in sorted_row if w["text"].strip()), None)
                    if not first_word or not self._find_date_in_text(first_word["text"]):
                        continue  # Skip rows not starting with a valid date

                    # Extract date text, convert from BE to AD if needed
                    date_word = first_word
                    date_str_raw = self._find_date_in_text(date_word["text"])
                    date_str_converted = self._convert_be_to_ad(date_str_raw)

                    # Parse to pandas.Timestamp or fallback to dateutil
                    try:
                        date_value = pd.to_datetime(date_str_converted, dayfirst=True, errors="raise")
                    except Exception:
                        try:
                            date_value = dateutil.parser.parse(date_str_converted, dayfirst=True, fuzzy=True)
                        except Exception:
                            date_value = pd.NaT  # If parsing fails, set as Not‐a‐Time

                    # If the date token had trailing characters, collect the remainder
                    date_remainder = ""
                    if date_word and date_str_raw:
                        date_length = len(date_str_raw)
                        date_remainder = date_word["text"][date_length:]

                    # Build a “cleaned” list of words for this row: remove main date word, append remainder if any
                    cleaned_row_words: List[dict[str, Any]] = []
                    for word in sorted_row:
                        if word is date_word and date_remainder:
                            # Insert a new word dict representing the leftover text after the date
                            cleaned_row_words.append({
                                "text": date_remainder,
                                "x0": word["x0"],
                                "x1": word["x1"],
                                "top": word["top"]
                            })
                            continue
                        if word is date_word:
                            continue  # Skip the original date word entirely
                        cleaned_row_words.append(word)  # Keep all other tokens

                    # Initialize containers for different types of tokens
                    code_tokens: List[str] = []
                    channel_tokens: List[str] = []
                    description_tokens: List[str] = []

                    # First pass: collect non‐money tokens into code/channel/description
                    for word in cleaned_row_words:
                        text_value = word["text"]
                        x0 = word["x0"]
                        if not text_value.strip():
                            continue  # Skip empty strings
                        if TIME_PATTERN.match(text_value):
                            continue  # Skip time tokens explicitly
                        if MONEY_PATTERN.match(text_value):
                            continue  # Skip money tokens for now
                        # Otherwise, decide if it’s part of code/channel or description
                        if x0 <= SPLIT_X_CODE_CHANNEL + X_TOLERANCE:
                            code_tokens.append(text_value)
                        elif x0 <= SPLIT_X_CHANNEL_DEBIT_CREDIT + X_TOLERANCE:
                            channel_tokens.append(text_value)
                        else:
                            description_tokens.append(text_value)

                    # Combine code and channel tokens into a single string, split once at “/”
                    full_code_channel = "/".join(code_tokens + channel_tokens)
                    parts = full_code_channel.split("/", 1)
                    code_value = parts[0]
                    channel_value = parts[1] if len(parts) > 1 else ""

                    # === NEW: collect all money tokens (withdrawal, deposit, balance) in one list ===
                    money_words: List[Dict[str, Any]] = []
                    for word in cleaned_row_words:
                        text_value = word["text"]
                        if MONEY_PATTERN.match(text_value):
                            raw = text_value.replace(",", "").strip()   # e.g. "(995,358.30)" → "(995358.30)"
                            is_negative = False

                            # 1) If it’s in parentheses, treat it as negative
                            if raw.startswith("(") and raw.endswith(")"):
                                is_negative = True
                                raw = raw[1:-1]  # strip the "(" and ")"

                            # 2) If it already has a leading “–”, let float(...) handle it
                            #    (so raw might be "-995358.30" by now).

                            try:
                                val = float(raw)
                                if is_negative:
                                    val = -val
                            except ValueError:
                                # couldn’t parse it as a float
                                val = None

                            if val is not None:
                                money_words.append({
                                    "value": val,
                                    "x1": word["x1"]
                                })


                    # Sort all money tokens by x1 ascending (i.e., left to right)
                    money_words_sorted = sorted(money_words, key=lambda w: w["x1"])

                    # Initialize amounts
                    withdrawal_amount: float | None = None
                    deposit_amount: float | None = None
                    balance_value: float | None = None

                    # Assign withdrawal / deposit / balance based on position rules
                    if len(money_words_sorted) == 1:
                        # Only one token: decide withdrawal vs deposit by x1 threshold
                        only = money_words_sorted[0]
                        if only["x1"] <= SPLIT_X_WITHDRAWAL_DEPOSIT + X_TOLERANCE:
                            withdrawal_amount = only["value"]
                        else:
                            deposit_amount = only["value"]
                    elif len(money_words_sorted) >= 2:
                        # Check if leftmost is withdrawal
                        first = money_words_sorted[0]
                        if first["x1"] <= SPLIT_X_WITHDRAWAL_DEPOSIT + X_TOLERANCE:
                            withdrawal_amount = first["value"]
                            # If exactly two tokens remain, that second is the balance
                            if len(money_words_sorted) == 2:
                                balance_value = money_words_sorted[1]["value"]
                            else:
                                # Three or more: second is deposit, third is balance
                                deposit_amount = money_words_sorted[1]["value"]
                                if len(money_words_sorted) >= 3:
                                    balance_value = money_words_sorted[2]["value"]
                        else:
                            # Leftmost is not a withdrawal → treat as deposit
                            deposit_amount = first["value"]
                            if len(money_words_sorted) == 2:
                                balance_value = money_words_sorted[1]["value"]
                            else:
                                # If 3 or more and leftmost was not withdrawal (rare but safe):
                                # second is treated as balance
                                balance_value = money_words_sorted[1]["value"]

                    # === End of new money‐assignment logic ===

                    record = {
                        "page_id": page_id,  # Which page this record came from
                        "date": date_value,  # Parsed pandas.Timestamp or NaT
                        "time": self._find_time_after_date_word(date_word, all_words) if date_word else "",
                        "code": "",
                        "channel": "",
                        "withdrawal": withdrawal_amount,
                        "deposit": deposit_amount,
                        "balance": balance_value,
                        "description": code_value+" "+channel_value  # Full description text
                    }

                    if page_id == "":
                        # If page_id extraction failed, blank out all fields to avoid confusion
                        record = {key: "" for key in record}

                    transaction_records.append(record)  # Append the completed record to the list

            except Exception as extraction_error:
                # If any error occurs in parsing this page’s transactions, skip it with a warning
                print(f"⚠️ Skipping page {page_index} in transaction extraction due to error: {extraction_error}")
                continue

        return transaction_records  # Return list of all parsed transaction dicts

    def extract_headers_from_pages(self, pages: List[pdfplumber.page.Page]) -> List[Dict[str, Any]]:
        """
        Iterate through each page and extract header information (account number, period, 
        account_name crop, plus footer totals if present).
        """
        header_rows: List[Dict[str, Any]] = []  # Initialize list to hold header dicts

        for page_index, page in enumerate(pages, start=1):
            try:
                page_id = self._extract_page_id(page)  # Extract page ID
                full_text = page.extract_text() or ""  # Get full-page text as a fallback
                has_footer_summary = any(
                    keyword.lower() in full_text.lower() for keyword in FOOTER_KEYWORDS
                )  # Detect if footer lines exist

                account_number, period_as_string = self._extract_account_number_and_period(full_text)
                # Regex-extracted account number and statement period

                header_data: Dict[str, Any] = {
                    "page_id": page_id,  # Add normalized page ID
                    "account_number": account_number,  # Add extracted account number
                    "period": period_as_string,  # Add extracted date range
                }

                # Crop each configured header field (e.g., “account_name”)
                for field_name, bounding_box in CROPS.items():
                    raw_text = page.crop(bounding_box).extract_text() or ""
                    header_data[field_name] = raw_text.strip().replace("\n", " ")
                    # Store cleaned text (no newlines) under that field’s key

                if has_footer_summary:
                    for line in full_text.splitlines():
                        if line.startswith("ยอดรวมรายการถอน") or line.startswith("Total Withdrawal"):
                            numbers = re.findall(r"[\d,]+(?:\.\d{2})?", line)
                            header_data.update({
                                "total_items_debit": numbers[0].replace(",", "") if len(numbers) > 0 else None,
                                "total_amount_debit": numbers[1].replace(",", "") if len(numbers) > 1 else None
                            })
                        elif line.startswith("ยอดรวมรายการฝาก") or line.startswith("Total Deposit"):
                            numbers = re.findall(r"[\d,]+(?:\.\d{2})?", line)
                            header_data.update({
                                "total_items_credit": numbers[0].replace(",", "") if len(numbers) > 0 else None,
                                "total_amount_credit": numbers[1].replace(",", "") if len(numbers) > 1 else None
                            })
                        # Adds debit/credit totals if those lines are found in footer

                if page_id == "":
                    # If page_id extraction failed, blank out all header fields
                    header_data = {key: "" for key in header_data}

                header_rows.append(header_data)  # Append the header dict for this page

            except Exception as header_error:
                # Skip this page if any error in header extraction
                print(f"⚠️ Skipping page {page_index} in header extraction due to error: {header_error}")
                continue

        return header_rows  # Return list of header records

    def _clean_float_column(self, series: pd.Series) -> pd.Series:
        """
        Given a pandas Series of strings like '-1,234.56', '(1,234.56)' or '1234.56',
        strip out commas and parentheses, but keep a leading minus, then convert to float.
        """
        def parse_money(s: str) -> float:
            s = str(s).strip()
            if not s:
                return float("nan")
            is_neg = False

            # Handle parentheses‐style negative: "(1,234.56)" → "-1234.56"
            if s.startswith("(") and s.endswith(")"):
                is_neg = True
                s = s[1:-1]

            # Remove everything except digits, decimal point, and minus‐sign
            cleaned = re.sub(r"[^0-9\.\-]", "", s)
            try:
                num = float(cleaned)
            except ValueError:
                return float("nan")
            return -num if is_neg else num

        return series.astype(str).apply(parse_money)



    def clean_dataframes(
        self,
        transactions_df: pd.DataFrame,
        headers_df: pd.DataFrame
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Rename, filter, and convert columns for both header and transaction DataFrames.
        Uses .copy() after any slicing to prevent SettingWithCopyWarning.
        """
        # Rename header columns for consistency
        headers_df = headers_df.rename(columns={
            "total_amount_debit": "total_debit",
            "total_amount_credit": "total_credit",
            "total_items_debit": "total_debit_transaction",
            "total_items_credit": "total_credit_transaction"
        })

        # Fill NaN with empty strings and ensure we’re working on a copy
        headers_df = headers_df.fillna("").copy()
        headers_df["address"] = ""  # Add a blank “address” column for consistency
        # Keep only rows where page_id begins with a digit (valid page)
        headers_df = headers_df[headers_df["page_id"].str.match(r"^\d", na=False)].copy().reset_index(drop=True)

        # Prepare transactions DataFrame: rename and fill NaN
        transactions_df = transactions_df.rename(columns={"withdrawal": "debit", "deposit": "credit"})
        transactions_df = transactions_df[~(transactions_df['balance'].isnull())&(~(transactions_df['debit'].isnull())|~(transactions_df['credit'].isnull()))]
        transactions_df = transactions_df.fillna("").copy()
        transactions_df["transaction_type"] = ""  # Placeholder column for “transaction_type”
        
        # Convert “debit”, “credit”, and “balance” columns to floats
        
        for col_name in ["debit", "credit", "balance"]:
            if col_name in transactions_df.columns:
                transactions_df[col_name] = self._clean_float_column(transactions_df[col_name])

        #transactions_df.fillna({'debit': 0, 'credit': 0}, inplace=True)


        # Convert numeric columns in headers to float
        for col_name in [
            "total_debit", "total_credit",
            "total_debit_transaction", "total_credit_transaction"
        ]:
            if col_name in headers_df.columns:
                headers_df[col_name] = self._clean_float_column(headers_df[col_name])
        return headers_df, transactions_df  # Return cleaned DataFrames

    def run(self) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Main entry point: open the PDF, extract raw header and transaction records,
        convert to DataFrames, clean them, print summaries, and return both DataFrames.
        """
        with pdfplumber.open(self.pdf_path, password=self.password) as pdf_handle:
            pages = pdf_handle.pages  # List of all Page objects
            raw_transaction_records = self.extract_transactions_from_pages(pages)
            raw_header_records = self.extract_headers_from_pages(pages)

        # Convert lists of dicts into pandas DataFrames
        transaction_dataframe = pd.DataFrame(raw_transaction_records)
        header_dataframe = pd.DataFrame(raw_header_records)

        # Clean and normalize the DataFrames
        cleaned_header_df, cleaned_transaction_df = self.clean_dataframes(
            transaction_dataframe, header_dataframe
        )

        # Print header DataFrame (all rows) and last 10 transactions for a quick check
        print("=== Header DataFrame ===")
        print(cleaned_header_df.to_string(index=False))
        print("\n=== Last 10 Transactions ===")
        print(cleaned_transaction_df.tail(10).to_string(index=False))

        return cleaned_header_df, cleaned_transaction_df  # Return cleaned results


if __name__ == "__main__":
    extractor = PDFStatementExtractor()  # Instantiate the extractor with default config
    header_df, transactions_df = extractor.run()  # Run extraction and cleaning



ImportError: cannot import name 'PDF_PATH' from 'config' (/Users/if658228/Desktop/ktb/pdf-statement-extraction/gsb/config.py)

In [None]:
import re  # Enables regex operations used throughout extraction and cleaning
from typing import Optional, List, Dict, Any  # Provides type hints
import pandas as pd  # Data manipulation library—used to build and clean DataFrames
import pdfplumber  # PDF parsing library—extracts text and table‐like structures from PDF pages
import dateutil.parser  # Flexible date parser—fallback when pandas date parsing fails

import config  # Import the entire config module

class GSBStatementExtractor:
    """
    Encapsulates all logic to extract header and transaction data from a PDF bank statement.
    """

    @staticmethod
    def _clean_page_id(raw_text: str) -> str:
        """
        Normalize a raw page-ID string (e.g., ' 1 / 10 ') to '1/10'; return empty if pattern fails.
        """
        numeric_parts = re.findall(r"\d+", raw_text)
        if len(numeric_parts) >= 2:
            candidate = f"{numeric_parts[0]}/{numeric_parts[1]}"
            if re.fullmatch(r"\d+/\d+", candidate):
                return candidate
        return ""

    @staticmethod
    def _extract_account_number_and_period(full_text: str) -> tuple[str, str]:
        """
        Extract account number (9–12 digits) and statement period (“dd/mm/yyyy - dd/mm/yyyy”)
        from the entire page text using regex patterns.
        """
        account_match = config.ACCOUNT_NUMBER_PATTERN.search(full_text)
        period_match = config.PERIOD_PATTERN.search(full_text)
        account_number = account_match.group() if account_match else ""
        period_as_string = period_match.group() if period_match else ""
        return account_number, period_as_string

    @staticmethod
    def _convert_be_to_ad(date_string: str) -> str:
        """
        Convert a date from Buddhist Era (BE) (year > 2400) to Gregorian (AD),
        e.g., “01/01/2567” → “01/01/2024”. If no BE year, return unchanged.
        """
        match = re.match(r"^(\d{2})/(\d{2})/(\d{4})$", date_string)
        if match:
            day, month, year_str = match.groups()
            year = int(year_str)
            if year > 2400:
                year -= 543
            return f"{day}/{month}/{year}"
        return date_string

    @staticmethod
    def _find_date_in_text(text: str) -> str:
        """
        If the text begins with a valid date (dd/mm/yy or dd/mm/yyyy), return that date;
        otherwise return an empty string.
        """
        match = re.match(r"^(\d{2}/\d{2}/(\d{4}|\d{4}))", text)
        return match.group(1) if match else ""

    @staticmethod
    def _find_time_after_date_word(
        date_word: dict[str, Any], all_words: List[dict[str, Any]]
    ) -> str:
        """
        Given the date word dictionary and all words on the page,
        find a time (hh:mm) token that appears within 20 points below the date.
        """
        date_top = date_word.get("top", 0) if date_word else 0
        for word in all_words:
            if config.TIME_PATTERN.match(word["text"]) and 0 < (word["top"] - date_top) <= 20:
                return word["text"]
        return ""

    @staticmethod
    def _extract_page_id(page: pdfplumber.page.Page) -> str:
        """
        Crop the page’s designated page‐ID area and normalize it using _clean_page_id().
        """
        raw_crop_text = page.crop(config.PAGE_ID_CROP).extract_text() or ""
        return PDFStatementExtractor._clean_page_id(raw_crop_text.strip())

    @staticmethod
    def extract_transactions(pages: List[pdfplumber.page.Page]) -> List[Dict[str, Any]]:
        """
        For each page, crop to the transaction‐table region, group words into rows based on date positions,
        and assemble structured transaction records.
        """
        transaction_records: List[Dict[str, Any]] = []

        for page_index, page in enumerate(pages, start=1):
            try:
                page_id = PDFStatementExtractor._extract_page_id(page)
                table_region = page.crop(config.TABLE_CROP_BOX)
                all_words = table_region.extract_words(use_text_flow=False)

                date_tops = sorted(
                    w["top"] for w in all_words if PDFStatementExtractor._find_date_in_text(w["text"])
                )
                if not date_tops:
                    continue

                row_intervals: List[tuple[float, float]] = []
                for idx, top_y in enumerate(date_tops):
                    start_y = top_y - config.Y_MARGIN
                    end_y = (
                        date_tops[idx + 1] - config.Y_MARGIN if idx + 1 < len(date_tops) else top_y + 15
                    )
                    row_intervals.append((start_y, end_y))

                rows: List[List[dict[str, Any]]] = [[] for _ in row_intervals]
                for word in all_words:
                    for row_idx, (start_y, end_y) in enumerate(row_intervals):
                        if start_y <= word["top"] < end_y:
                            rows[row_idx].append(word)
                            break

                for row_words in rows:
                    if not row_words:
                        continue

                    sorted_row = sorted(row_words, key=lambda w: (w["top"], w["x0"]))
                    first_word = next((w for w in sorted_row if w["text"].strip()), None)
                    if not first_word or not PDFStatementExtractor._find_date_in_text(first_word["text"]):
                        continue

                    date_word = first_word
                    date_str_raw = PDFStatementExtractor._find_date_in_text(date_word["text"])
                    date_str_converted = PDFStatementExtractor._convert_be_to_ad(date_str_raw)

                    try:
                        date_value = pd.to_datetime(date_str_converted, dayfirst=True, errors="raise")
                    except Exception:
                        try:
                            date_value = dateutil.parser.parse(date_str_converted, dayfirst=True, fuzzy=True)
                        except Exception:
                            date_value = pd.NaT

                    date_remainder = ""
                    if date_word and date_str_raw:
                        date_length = len(date_str_raw)
                        date_remainder = date_word["text"][date_length:]

                    cleaned_row_words: List[dict[str, Any]] = []
                    for word in sorted_row:
                        if word is date_word and date_remainder:
                            cleaned_row_words.append({
                                "text": date_remainder,
                                "x0": word["x0"],
                                "x1": word["x1"],
                                "top": word["top"]
                            })
                            continue
                        if word is date_word:
                            continue
                        cleaned_row_words.append(word)

                    code_tokens: List[str] = []
                    channel_tokens: List[str] = []
                    description_tokens: List[str] = []

                    for word in cleaned_row_words:
                        text_value = word["text"]
                        x0 = word["x0"]
                        if not text_value.strip():
                            continue
                        if config.TIME_PATTERN.match(text_value):
                            continue
                        if config.MONEY_PATTERN.match(text_value):
                            continue
                        if x0 <= config.SPLIT_X_CODE_CHANNEL + config.X_TOLERANCE:
                            code_tokens.append(text_value)
                        elif x0 <= config.SPLIT_X_CHANNEL_DEBIT_CREDIT + config.X_TOLERANCE:
                            channel_tokens.append(text_value)
                        else:
                            description_tokens.append(text_value)

                    full_code_channel = "/".join(code_tokens + channel_tokens)
                    parts = full_code_channel.split("/", 1)
                    code_value = parts[0]
                    channel_value = parts[1] if len(parts) > 1 else ""

                    money_words: List[Dict[str, Any]] = []
                    for word in cleaned_row_words:
                        text_value = word["text"]
                        if config.MONEY_PATTERN.match(text_value):
                            raw = text_value.replace(",", "").strip()
                            is_negative = False

                            if raw.startswith("(") and raw.endswith(")"):
                                is_negative = True
                                raw = raw[1:-1]

                            try:
                                val = float(raw)
                                if is_negative:
                                    val = -val
                            except ValueError:
                                val = None

                            if val is not None:
                                money_words.append({
                                    "value": val,
                                    "x1": word["x1"]
                                })

                    money_words_sorted = sorted(money_words, key=lambda w: w["x1"])

                    withdrawal_amount: float | None = None
                    deposit_amount: float | None = None
                    balance_value: float | None = None

                    if len(money_words_sorted) == 1:
                        only = money_words_sorted[0]
                        if only["x1"] <= config.SPLIT_X_WITHDRAWAL_DEPOSIT + config.X_TOLERANCE:
                            withdrawal_amount = only["value"]
                        else:
                            deposit_amount = only["value"]
                    elif len(money_words_sorted) >= 2:
                        first = money_words_sorted[0]
                        if first["x1"] <= config.SPLIT_X_WITHDRAWAL_DEPOSIT + config.X_TOLERANCE:
                            withdrawal_amount = first["value"]
                            if len(money_words_sorted) == 2:
                                balance_value = money_words_sorted[1]["value"]
                            else:
                                deposit_amount = money_words_sorted[1]["value"]
                                if len(money_words_sorted) >= 3:
                                    balance_value = money_words_sorted[2]["value"]
                        else:
                            deposit_amount = first["value"]
                            if len(money_words_sorted) == 2:
                                balance_value = money_words_sorted[1]["value"]
                            else:
                                balance_value = money_words_sorted[1]["value"]

                    record = {
                        "page_id": page_id,
                        "date": date_value,
                        "time": PDFStatementExtractor._find_time_after_date_word(date_word, all_words) if date_word else "",
                        "code": code_value,
                        "channel": channel_value,
                        "withdrawal": withdrawal_amount,
                        "deposit": deposit_amount,
                        "balance": balance_value,
                        "description": code_value + " " + channel_value
                    }

                    if page_id == "":
                        record = {key: "" for key in record}

                    transaction_records.append(record)

            except Exception as extraction_error:
                print(f"⚠️ Skipping page {page_index} in transaction extraction due to error: {extraction_error}")
                continue

        return transaction_records

    @staticmethod
    def extract_headers(pages: List[pdfplumber.page.Page]) -> List[Dict[str, Any]]:
        """
        Iterate through each page and extract header information (account number, period,
        account_name crop, plus footer totals if present).
        """
        header_rows: List[Dict[str, Any]] = []

        for page_index, page in enumerate(pages, start=1):
            try:
                page_id = PDFStatementExtractor._extract_page_id(page)
                full_text = page.extract_text() or ""
                has_footer_summary = any(
                    keyword.lower() in full_text.lower() for keyword in config.FOOTER_KEYWORDS
                )

                account_number, period_as_string = PDFStatementExtractor._extract_account_number_and_period(full_text)

                header_data: Dict[str, Any] = {
                    "page_id": page_id,
                    "account_number": account_number,
                    "period": period_as_string,
                }

                for field_name, bounding_box in config.CROPS.items():
                    raw_text = page.crop(bounding_box).extract_text() or ""
                    header_data[field_name] = raw_text.strip().replace("\n", " ")

                if has_footer_summary:
                    for line in full_text.splitlines():
                        if line.startswith("ยอดรวมรายการถอน") or line.startswith("Total Withdrawal"):
                            numbers = re.findall(r"[\d,]+(?:\.\d{2})?", line)
                            header_data.update({
                                "total_items_debit": numbers[0].replace(",", "") if len(numbers) > 0 else None,
                                "total_amount_debit": numbers[1].replace(",", "") if len(numbers) > 1 else None
                            })
                        elif line.startswith("ยอดรวมรายการฝาก") or line.startswith("Total Deposit"):
                            numbers = re.findall(r"[\d,]+(?:\.\d{2})?", line)
                            header_data.update({
                                "total_items_credit": numbers[0].replace(",", "") if len(numbers) > 0 else None,
                                "total_amount_credit": numbers[1].replace(",", "") if len(numbers) > 1 else None
                            })

                if page_id == "":
                    header_data = {key: "" for key in header_data}

                header_rows.append(header_data)

            except Exception as header_error:
                print(f"⚠️ Skipping page {page_index} in header extraction due to error: {header_error}")
                continue

        return header_rows

    @staticmethod
    def _clean_float_column(series: pd.Series) -> pd.Series:
        """
        Given a pandas Series of strings like '-1,234.56', '(1,234.56)' or '1234.56',
        strip out commas and parentheses, but keep a leading minus, then convert to float.
        """
        def parse_money(s: str) -> float:
            s = str(s).strip()
            if not s:
                return float("nan")
            is_neg = False

            if s.startswith("(") and s.endswith(")"):
                is_neg = True
                s = s[1:-1]

            cleaned = re.sub(r"[^0-9\.\-]", "", s)
            try:
                num = float(cleaned)
            except ValueError:
                return float("nan")
            return -num if is_neg else num

        return series.astype(str).apply(parse_money)

    @staticmethod
    def clean_dataframes(
        transactions_df: pd.DataFrame,
        headers_df: pd.DataFrame
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Rename, filter, and convert columns for both header and transaction DataFrames.
        Uses .copy() after any slicing to prevent SettingWithCopyWarning.
        """
        headers_df = headers_df.rename(columns={
            "total_amount_debit": "total_debit",
            "total_amount_credit": "total_credit",
            "total_items_debit": "total_debit_transaction",
            "total_items_credit": "total_credit_transaction"
        })

        headers_df = headers_df.fillna("").copy()
        headers_df["address"] = ""
        headers_df = headers_df[headers_df["page_id"].str.match(r"^\d", na=False)].copy().reset_index(drop=True)

        transactions_df = transactions_df.rename(columns={"withdrawal": "debit", "deposit": "credit"})
        transactions_df = transactions_df[~(transactions_df['balance'].isnull()) & (~(transactions_df['debit'].isnull()) | ~(transactions_df['credit'].isnull()))]
        transactions_df = transactions_df.fillna("").copy()
        transactions_df["transaction_type"] = ""
        
        for col_name in ["debit", "credit", "balance"]:
            if col_name in transactions_df.columns:
                transactions_df[col_name] = PDFStatementExtractor._clean_float_column(transactions_df[col_name])

        for col_name in [
            "total_debit", "total_credit",
            "total_debit_transaction", "total_credit_transaction"
        ]:
            if col_name in headers_df.columns:
                headers_df[col_name] = PDFStatementExtractor._clean_float_column(headers_df[col_name])

        return headers_df, transactions_df

    def run(
        self,
        pdf_path: str,
        password: Optional[str]
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
        """
        Main entry point: open the PDF at the given path with the given password,
        extract raw header and transaction records, convert to DataFrames, clean them,
        print summaries, and return both DataFrames.
        """
        with pdfplumber.open(pdf_path, password=password) as pdf_handle:
            pages = pdf_handle.pages
            raw_transaction_records = PDFStatementExtractor.extract_transactions(pages)
            raw_header_records = PDFStatementExtractor.extract_headers(pages)

        transaction_dataframe = pd.DataFrame(raw_transaction_records)
        header_dataframe = pd.DataFrame(raw_header_records)

        cleaned_header_df, cleaned_transaction_df = PDFStatementExtractor.clean_dataframes(
            transaction_dataframe, header_dataframe
        )

        print("=== Header DataFrame ===")
        print(cleaned_header_df.to_string(index=False))
        print("\n=== Last 10 Transactions ===")
        print(cleaned_transaction_df.tail(10).to_string(index=False))

        return cleaned_header_df, cleaned_transaction_df





In [None]:
header_df

Unnamed: 0,page_id,account_number,period,account_name,total_debit_transaction,total_debit,total_credit_transaction,total_credit,address
0,1/1,20313149807,01/12/2566 - 01/12/2566,นาย สันติ์ สาคร,0.0,0.0,0.0,0.0,
1,1/6,20313149807,01/11/2566 - 30/11/2566,นาย สันติ์ สาคร,,,,,
2,2/6,20313149807,01/11/2566 - 30/11/2566,นาย สันติ์ สาคร,,,,,
3,3/6,20313149807,01/11/2566 - 30/11/2566,นาย สันติ์ สาคร,,,,,
4,4/6,20313149807,01/11/2566 - 30/11/2566,นาย สันติ์ สาคร,,,,,
5,5/6,20313149807,01/11/2566 - 30/11/2566,นาย สันติ์ สาคร,,,,,
6,6/6,20313149807,01/11/2566 - 30/11/2566,นาย สันติ์ สาคร,89.0,256206.0,39.0,244977.0,
7,1/4,20313149807,01/10/2566 - 31/10/2566,นาย สันติ์ สาคร,,,,,
8,2/4,20313149807,01/10/2566 - 31/10/2566,นาย สันติ์ สาคร,,,,,
9,3/4,20313149807,01/10/2566 - 31/10/2566,นาย สันติ์ สาคร,,,,,


In [None]:
transactions_df

Unnamed: 0,page_id,date,time,code,channel,debit,credit,balance,description,transaction_type
1,1/6,2023-11-01,,,,6250.0,,17091.95,MyMo Transfer/from/SAV,
2,1/6,2023-11-01,,,,7000.0,,10091.95,MyMo ATM/CASH/WITHDRAWAL,
3,1/6,2023-11-01,,,,,15000.0,25091.95,Transfer SAV/Deposit,
4,1/6,2023-11-02,,,,,3000.0,28091.95,Transfer SAV/Deposit,
5,1/6,2023-11-02,,,,3000.0,,25091.95,MyMo Transfer/from/SAV,
...,...,...,...,...,...,...,...,...,...,...
220,4/4,2023-10-31,,,,4200.0,,253.95,MyMo Transfer/from/SAV,
221,4/4,2023-10-31,,,,,46788.0,47041.95,Transfer SAV/Deposit,
222,4/4,2023-10-31,,,,6200.0,,40841.95,MyMo Transfer/from/SAV,
223,4/4,2023-10-31,,,,7500.0,,33341.95,MyMo Transfer/from/SAV,


In [1]:
#!/usr/bin/env python3
import os, traceback
import numpy as np
import pandas as pd
from typing import Tuple, List, Dict, Any, Optional
import pdfplumber
from gsb_extractor import GSBStatementExtractor

# — your existing imports & extraction functions here —
# from your_module import extract_header, extract_transactions

INPUT_FOLDER = "/Users/if658228/Downloads/GSB"
PASSWORD: Optional[str] = None

def process_folder(input_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame, List[dict]]:
    tx_list, hdr_list, failures = [], [], []
    for fn in os.listdir(input_folder):
        if not fn.lower().endswith(".pdf"):
            continue
        path = os.path.join(input_folder, fn)
        try:
            with pdfplumber.open(path, password=PASSWORD) as pdf:
                pages = pdf.pages

        # Clean and normalize the DataFrames
                df_hdr, df_tx = GSBStatementExtractor().run(path,None)
                df_hdr["source_file"] = fn
                df_tx ["source_file"] = fn
                tx_list.append(df_tx)
                hdr_list.append(df_hdr)
        except Exception as e:
            failures.append({
                "file": fn,
                "error": str(e),
                "traceback": traceback.format_exc()
            })
    all_tx  = pd.concat(tx_list, ignore_index=True) if tx_list else pd.DataFrame()
    all_hdr = pd.concat(hdr_list, ignore_index=True) if hdr_list else pd.DataFrame()
    return all_tx, all_hdr, failures

def validate_bbl(df_tx_all: pd.DataFrame, df_hdr_all: pd.DataFrame) -> pd.DataFrame:
    # STEP 1: ensure numeric
    tx_clean = df_tx_all.copy()
    tx_clean[['debit','credit']] = (
        tx_clean[['debit','credit']]
        .replace('', np.nan)
        .astype(float)
    )

    # STEP 2: sums per file
    sums = (tx_clean
            .groupby('source_file')[['debit','credit']]
            .sum(min_count=1)
            .rename(columns={
                'debit':'sum_debit',
                'credit'   :'sum_credit'
            }))

    # STEP 3: counts per file
    counts = (tx_clean
              .groupby('source_file')[['debit','credit']]
              .count()
              .rename(columns={
                  'debit':'count_debit_tx',
                  'credit'   :'count_credit_tx'
              }))

    # STEP 4: pick header summary columns (drop pages without both)
    # note: we sum the two *_transaction fields into total_txns
    hdr = (df_hdr_all
           .dropna(subset=['total_debit_transaction','total_credit_transaction'])
           .groupby('source_file')
           .agg({
               'total_debit_transaction':'sum',
               'total_debit'            :'sum',
               'total_credit_transaction'   :'sum',
               'total_credit'               :'sum'
           })
          )
    hdr = hdr.rename(columns={
        'total_debit_transaction':'total_debit_txns',
        'total_debit'            :'total_debit',
        'total_credit_transaction'   :'total_credit_txns',
        'total_credit'               :'total_credit'
    })


    # STEP 5: merge & compare
    cmp = (hdr
           .join(sums,   how='left')
           .join(counts, how='left')
           .reset_index()
           .rename(columns={'source_file':'file'}))

    summary = cmp.assign(
        debit_amount_match = lambda d: np.isclose(d['total_debit'], d['sum_debit'], atol=1e-2),
        credit_amount_match    = lambda d: np.isclose(d['total_credit'],    d['sum_credit'],    atol=1e-2),
        transaction_count_debit_match = lambda d: d['total_debit_txns'] == (d['count_debit_tx']),
        transaction_count_credit_match = lambda d: d['total_credit_txns'] == (d['count_credit_tx'])
        
    )[
        ['file',
         'total_debit','sum_debit','debit_amount_match',
         'total_credit',   'sum_credit',   'credit_amount_match',
         'total_credit_txns','total_debit_txns',      'count_debit_tx','count_credit_tx','transaction_count_debit_match','transaction_count_credit_match']
    ]

    return summary

if __name__ == "__main__":
    df_tx_all, df_hdr_all, failures = process_folder(INPUT_FOLDER)
    print(f"Processed transactions: {df_tx_all.shape}, headers: {df_hdr_all.shape}")
    if failures:
        print(f"\n⚠️ {len(failures)} failures; inspect `failures` list.")
    if df_tx_all.empty or df_hdr_all.empty:
        print("No data to validate; exiting.")
        exit(1)
    summary = validate_bbl(df_tx_all, df_hdr_all)


    bad = summary.loc[~(summary.debit_amount_match
                        & summary.credit_amount_match
                        & summary.transaction_count_debit_match
                        & summary.transaction_count_credit_match)]
    if not bad.empty:
        bad = bad.assign(
            diff_debit = bad['sum_debit'] - bad['total_debit'],
            diff_credit    = bad['sum_credit']    - bad['total_credit']
        )
        print("\n❌ Files with mismatches:")
        print(bad.to_string(index=False))
    else:
        print("\n✅ All files validated successfully!")


=== Header DataFrame ===
page_id account_number                  period         account_name  total_debit_transaction  total_debit  total_credit_transaction  total_credit address
    1/2   020228559280 01/09/2566 - 30/09/2566 นาย วัฒนชัย ทัพเจริญ                      NaN          NaN                       NaN           NaN        
    2/2   020228559280 01/09/2566 - 30/09/2566 นาย วัฒนชัย ทัพเจริญ                     17.0     65942.00                      20.0      60416.20        
    1/2   020228559280 01/10/2566 - 31/10/2566 นาย วัฒนชัย ทัพเจริญ                      NaN          NaN                       NaN           NaN        
    2/2   020228559280 01/10/2566 - 31/10/2566 นาย วัฒนชัย ทัพเจริญ                     23.0    153192.24                      21.0     149745.27        
    1/3   020228559280 01/11/2566 - 30/11/2566 นาย วัฒนชัย ทัพเจริญ                      NaN          NaN                       NaN           NaN        
    2/3   020228559280 01/11/2566 - 30/11/2566 นาย 

In [7]:
summary

Unnamed: 0,file,total_debit,sum_debit,debit_amount_match,total_credit,sum_credit,credit_amount_match,total_credit_txns,total_debit_txns,count_debit_tx,count_credit_tx,transaction_count_debit_match,transaction_count_credit_match
0,108988-02011732-2566_1_GSB.pdf,160234.10,160234.10,True,148171.69,148171.69,True,10.0,266.0,266,10,True,True
1,108988-02011732-2566_2_GSB.pdf,74366.54,74366.54,True,59284.03,59284.03,True,4.0,159.0,159,4,True,True
2,108988-02017082-2566_1_GSB.pdf,207777.43,207777.43,True,213050.64,213050.64,True,18.0,74.0,74,18,True,True
3,108988-02017946-2566_1_GSB.pdf,559175.56,559175.56,True,559505.21,559505.21,True,1022.0,646.0,646,1022,True,True
4,108988-02021300-2566_1_GSB.pdf,324445.91,324445.91,True,324431.82,324431.82,True,81.0,258.0,258,81,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
663,108988-02558641-2567_2_GSB.pdf,603441.92,603441.92,True,603524.95,603524.95,True,153.0,284.0,284,153,True,True
664,108988-02701366-2567_1_GSB.pdf,93697.49,93697.49,True,87697.35,87697.35,True,104.0,153.0,153,104,True,True
665,มีbankอื่นปน4.pdf,225001.61,225001.61,True,224081.00,224081.00,True,34.0,148.0,148,34,True,True
666,มีbankอื่นปน8.pdf,103707.52,103707.52,True,89629.52,89629.52,True,80.0,144.0,144,80,True,True


In [8]:
bad

Unnamed: 0,file,total_debit,sum_debit,debit_amount_match,total_credit,sum_credit,credit_amount_match,total_credit_txns,total_debit_txns,count_debit_tx,count_credit_tx,transaction_count_debit_match,transaction_count_credit_match,diff_debit,diff_credit
182,108988-02403717-2566_1_GSB.pdf,1000.0,1000.0,True,0.0,,False,0.0,1.0,1,0,True,True,0.0,
183,108988-02403717-2566_2_GSB.pdf,2000.0,2000.0,True,0.0,,False,0.0,2.0,2,0,True,True,0.0,
425,108988-02489501-2567_1_GSB.pdf,0.0,544410.24,False,0.0,640747.9,False,266.0,636.0,420,218,False,False,544410.24,640747.9


In [4]:
failures

[{'file': 'formatแปลกๆ.pdf',
  'error': "'balance'",
  'traceback': 'Traceback (most recent call last):\n  File "/opt/anaconda3/lib/python3.11/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc\n    return self._engine.get_loc(casted_key)\n           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc\n  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc\n  File "pandas/_libs/hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item\n  File "pandas/_libs/hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item\nKeyError: \'balance\'\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n  File "/var/folders/y2/6hk1r60n0jn13314j0zgsw880000gq/T/ipykernel_76156/2483886235.py", line 26, in process_folder\n    df_hdr, df_tx = GSBStatementExtractor().run(path,None)\n              