In [None]:
import re
import pdfplumber
import pandas as pd
from typing import Optional, List, Dict, Tuple

from config import (
    CROP_BOUNDS,
    NUMERIC_REGEX,
    DATE_REGEX,
    TIME_REGEX,
    MONEY_REGEX,
    PAGE_ID_REGEX,
    NUMERIC_FIELDS,
    K_DATE_X0,
    K_DATE_X1,
    K_X_SPLIT_DESC_AMOUNT,
    K_X_SPLIT_AMOUNT_BALANCE,
    K_X_SPLIT_BALANCE_CHANNEL,
    K_X_SPLIT_CHANNEL_DETAILS,
    K_X_TOLERANCE,
    K_Y_MARGIN,
    THAI_MONTHS,
    ENG_MONTHS,
)


class TTBExtractor:
    """
    - extract_headers_from_pages(pages) → DataFrame of raw header fields (one row per page)
    - extract_transactions_from_pages(pages) → DataFrame of raw transaction dicts
    - clean_dataframes(raw_headers_df, raw_transactions_df) → (df_header_clean, df_transactions_clean)
    """

    @staticmethod
    def clean_float_column(series: pd.Series) -> pd.Series:
        """
        Remove non-numeric characters, fix minus signs, coerce to float.
        """
        def _clean_value(val):
            if pd.isnull(val):
                return None
            text = re.sub(r"[^\d\.-]", "", str(val))
            if "-" in text:
                text = "-" + text.replace("-", "")
            if "." in text:
                main, *rest = text.split(".")
                text = main + "." + "".join(rest)
            return text

        try:
            return pd.to_numeric(series.astype(str).apply(_clean_value), errors="coerce")
        except Exception as e:
            print(f"⚠️ Float cleaning error: {e}")
            return series

    @staticmethod
    def extract_header_from_page(
        page,
        crop_bounds: Dict[str, Tuple[float, float, float, float]]
    ) -> Dict[str, Optional[str]]:
        """
        Crop out each field in crop_bounds; if numeric, apply NUMERIC_REGEX.
        Returns a dict mapping field_name → raw string (or numeric string if matched).
        """
        header_dict: Dict[str, Optional[str]] = {}
        for field_name, bbox in crop_bounds.items():
            try:
                raw_text = page.crop(bbox).extract_text() or ""
                stripped = raw_text.strip()
                if field_name in NUMERIC_FIELDS:
                    m = NUMERIC_REGEX.search(stripped)
                    header_dict[field_name] = m.group().replace(",", "") if m else None
                else:
                    header_dict[field_name] = stripped
            except Exception as e:
                print(f"⚠️ Error extracting header '{field_name}': {e}")
                header_dict[field_name] = None

        return header_dict

    @staticmethod
    def compute_date_tops(words: List[dict]) -> List[float]:
        """
        Find y-coordinates ("top") of triplets [day, month, year] in the word list.
        """
        tops: List[float] = []
        for i in range(len(words) - 2):
            try:
                d, m, y = words[i], words[i + 1], words[i + 2]
                if (
                    re.match(r"^\d{1,2}$", d["text"])
                    and re.match(r"^[^\s]+$", m["text"])
                    and re.match(r"^\d{2}$", y["text"])
                    and d["x0"] < 100.0
                ):
                    tops.append(d["top"])
            except Exception:
                continue
        return tops

    @staticmethod
    def compute_intervals(date_tops: List[float]) -> List[Tuple[float, float]]:
        """
        Build vertical intervals out of the date_tops list. Each interval covers one row.
        """
        intervals: List[Tuple[float, float]] = []
        for idx, current_top in enumerate(date_tops):
            start = current_top - K_Y_MARGIN
            if idx + 1 < len(date_tops):
                end = date_tops[idx + 1] - K_Y_MARGIN
            else:
                if idx > 0:
                    prev_gap = current_top - date_tops[idx - 1]
                else:
                    prev_gap = K_Y_MARGIN * 2
                end = current_top + prev_gap - K_Y_MARGIN
            intervals.append((start, end))
        return intervals

    @staticmethod
    def assign_words_to_rows(
        words: List[dict],
        intervals: List[Tuple[float, float]]
    ) -> List[List[dict]]:
        """
        Given a list of word dictionaries and row‐intervals, place each word into its row.
        Returns a list of rows, where each row is a list of word dicts.
        """
        rows: List[List[dict]] = [[] for _ in intervals]
        for w in words:
            top_val = w["top"]
            for row_idx, (start, end) in enumerate(intervals):
                if start <= top_val < end:
                    rows[row_idx].append(w)
                    break
        return rows

    @staticmethod
    def split_details_into_date_and_details(details_text: str) -> Tuple[str, str]:
        """
        For a string like "10 ม.ค. 65 Purchase ABC", split off first three tokens as date.
        Returns (date_part, rest_of_details).
        """
        tokens = (details_text or "").split()
        if len(tokens) >= 3:
            date_part = " ".join(tokens[:3])
            rest_part = " ".join(tokens[3:])
        else:
            date_part = details_text
            rest_part = ""
        return date_part, rest_part

    @staticmethod
    def normalize_thai_or_eng_date(date_string: str) -> str:
        """
        Convert "D M Y" (Thai or English) into "YYYY-MM-DD". If parsing fails, return original.
        """
        if not isinstance(date_string, str) or not date_string.strip():
            return date_string

        tokens = date_string.strip().replace("  ", " ").split()
        if len(tokens) == 3:
            day_txt, mon_txt, year_txt = tokens
            is_thai = mon_txt in THAI_MONTHS

            if is_thai:
                mon_num = THAI_MONTHS[mon_txt]
            else:
                mon_num = ENG_MONTHS.get(mon_txt[:3].capitalize(), "01")

            try:
                year_int = int(year_txt)
                if is_thai:
                    if year_int < 100:
                        # "65" → 2565 BE → subtract 543 → 2022 CE
                        year_int = year_int + 2500 - 543
                else:
                    if year_int < 100:
                        year_int = year_int + 2000
                return f"{year_int:04d}-{mon_num}-{int(day_txt):02d}"
            except Exception:
                pass

        return date_string

    def extract_headers_from_pages(self, pages) -> pd.DataFrame:
        """
        Loop over all pages, crop out each header field, build a list of dicts,
        then return pd.DataFrame(raw_header_list).
        """
        header_list: List[Dict[str, Optional[str]]] = []

        for page_index, page in enumerate(pages, start=1):
            try:
                # Extract raw header fields via cropping
                header_dict = self.extract_header_from_page(page, CROP_BOUNDS)

                # Derive page_id (e.g. "1/10") from header_dict["page"]
                raw_page_field = header_dict.get("page", "")
                m = PAGE_ID_REGEX.search(raw_page_field)
                page_id = m.group(1) if m else ""
                header_dict["page_id"] = page_id

                # If not page "1/…", blank out all other fields (keep only page_id)
                if not page_id.startswith("1/"):
                    for key in list(header_dict.keys()):
                        if key != "page_id":
                            header_dict[key] = None

                header_list.append(header_dict)

            except Exception as e:
                print(f"⚠️ Skipping header on page {page_index} due to error: {e}")
                # Build an “empty” header dict with all crop keys set to None + page_id=None
                empty_header = {k: None for k in CROP_BOUNDS.keys()}
                empty_header["page_id"] = None
                header_list.append(empty_header)

        df_raw_headers = pd.DataFrame(header_list)
        return df_raw_headers

    def extract_transactions_from_pages(self, pages) -> pd.DataFrame:
        """
        Loop over all pages, find transaction rows via word-level analysis,
        build a list of raw transaction dicts (un-cleaned), then return DataFrame.
        """
        transaction_records: List[Dict[str, Optional[str]]] = []

        for page_index, page in enumerate(pages, start=1):
            try:
                words = page.extract_words(use_text_flow=True)

                # Re-extract header to get page_id
                header_dict = self.extract_header_from_page(page, CROP_BOUNDS)
                raw_page_field = header_dict.get("page", "")
                m = PAGE_ID_REGEX.search(raw_page_field)
                page_id = m.group(1) if m else ""

                # Compute row intervals
                date_tops = self.compute_date_tops(words)
                if not date_tops:
                    continue  # no transactions on this page

                intervals = self.compute_intervals(date_tops)
                rows_of_words = self.assign_words_to_rows(words, intervals)

                # Parse each row into a raw dict
                for row_words in rows_of_words:
                    if not row_words:
                        continue

                    try:
                        sorted_row = sorted(row_words, key=lambda w: (w["top"], w["x0"]))

                        date_text = ""
                        time_text = ""
                        debit_amount = None
                        credit_amount = None
                        balance_amount = None
                        description_tokens: List[str] = []
                        channel_tokens: List[str] = []
                        details_tokens: List[str] = []

                        for w in sorted_row:
                            x0 = w["x0"]
                            txt = w["text"]

                            if DATE_REGEX.match(txt) and K_DATE_X0 <= x0 <= K_DATE_X1:
                                date_text = txt
                            elif TIME_REGEX.match(txt):
                                time_text = txt
                            elif MONEY_REGEX.match(txt):
                                val = float(txt.replace(",", ""))
                                if x0 <= K_X_SPLIT_AMOUNT_BALANCE + K_X_TOLERANCE:
                                    if val < 0:
                                        debit_amount = -val
                                    else:
                                        credit_amount = val
                                elif x0 <= K_X_SPLIT_BALANCE_CHANNEL + K_X_TOLERANCE:
                                    balance_amount = val
                            elif K_DATE_X1 + K_X_TOLERANCE < x0 <= K_X_SPLIT_DESC_AMOUNT:
                                description_tokens.append(txt)
                            elif K_X_SPLIT_DESC_AMOUNT + K_X_TOLERANCE < x0 <= K_X_SPLIT_CHANNEL_DETAILS:
                                channel_tokens.append(txt)
                            else:
                                details_tokens.append(txt)

                        transaction_records.append({
                            "page_id": page_id,
                            "date": date_text,
                            "time": time_text,
                            "description": "",  # to be filled in cleaning step
                            "withdrawal": debit_amount,
                            "deposit": credit_amount,
                            "balance": balance_amount,
                            "channel": " ".join(channel_tokens).strip(),
                            "details": " ".join(details_tokens).strip(),
                            "transaction_type": " ".join(description_tokens).strip()
                        })

                    except Exception as row_err:
                        print(f"⚠️ Skipping row on page {page_index} due to error: {row_err}")
                        continue

            except Exception as page_err:
                print(f"⚠️ Skipping page {page_index} due to error: {page_err}")
                continue

        df_raw_transactions = pd.DataFrame(transaction_records)
        return df_raw_transactions

    
    def clean_dataframes(
        self,
        raw_headers_df: pd.DataFrame,
        raw_transactions_df: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        1) Rename header columns to canonical names
        2) Add placeholder 'address'
        3) Reorder header columns
        4) In transactions:
           - Split 'details' into [date, details]
           - Normalize date strings
           - Rename withdrawal→debit, deposit→credit
           - Add placeholder 'code'
           - Reorder transaction columns
           - Clean float columns
        Returns (df_header_clean, df_transactions_clean).
        """
        df_header = raw_headers_df.copy()
        df_transactions = raw_transactions_df.copy()

        # ── HEADER CLEANUP ────────────────────────────────────────────────────────
        df_header = df_header.rename(columns={
            "total_withdrawal": "total_debit",
            "total_deposit": "total_credit",
            "total_withdrawal_transaction": "total_debit_transaction",
            "total_deposit_transaction": "total_credit_transaction",
        })

        # Add placeholder 'address'
        df_header["address"] = ""

        desired_header_cols = [
            "page_id",
            "account_name",
            "account_number",
            "period",
            "total_debit",
            "total_credit",
            "total_debit_transaction",
            "total_credit_transaction",
            "address"
        ]
        # Only keep columns that exist, in the specified order
        df_header = df_header.reindex(columns=desired_header_cols).copy()

        # Clean float columns in header
        float_cols_hdr = [
            "total_debit",
            "total_credit",
            "total_debit_transaction",
            "total_credit_transaction"
        ]
        for col in float_cols_hdr:
            if col in df_header:
                df_header[col] = self.clean_float_column(df_header[col])

        # ── TRANSACTION CLEANUP ───────────────────────────────────────────────────
        if not df_transactions.empty:
            # 1) Split 'details' column into actual date + remainder
            split_series = df_transactions["details"].apply(
                lambda dt: pd.Series(self.split_details_into_date_and_details(dt))
            )
            df_transactions = df_transactions.copy()
            df_transactions[["date", "details"]] = split_series

            # 2) Normalize the 'date' strings (Thai/ENG → YYYY-MM-DD)
            df_transactions["date"] = df_transactions["date"].apply(
                self.normalize_thai_or_eng_date
            )

        # 3) Rename 'withdrawal'→'debit', 'deposit'→'credit'
        df_transactions = df_transactions.rename(columns={
            "withdrawal": "debit",
            "deposit":    "credit"
        })

        # 4) Add placeholder 'code'
        df_transactions["code"] = ""

        desired_txn_cols = [
            "page_id",
            "date",
            "time",
            "code",
            "channel",
            "debit",
            "credit",
            "balance",
            "description",
            "transaction_type"
        ]
        df_transactions = df_transactions.reindex(columns=desired_txn_cols).copy()

        # 5) Clean float columns in transactions
        float_cols_txn = ["debit", "credit", "balance"]
        for col in float_cols_txn:
            if col in df_transactions:
                df_transactions[col] = self.clean_float_column(df_transactions[col])

        return df_header, df_transactions


# ────────────────────────────────────────────────────────────────────────────────
# Example "main" usage:
# (If you run `python ttb_extractor.py` directly, it will print out tables.)
# ────────────────────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    import config

    with pdfplumber.open(config.PDF_PATH, password=config.PASSWORD) as pdf:
        pages = pdf.pages

        extractor = TTBExtractor()

        raw_transactions_df = extractor.extract_transactions_from_pages(pages)
        raw_headers_df      = extractor.extract_headers_from_pages(pages)

        df_header_clean, df_transactions_clean = extractor.clean_dataframes(
            raw_headers_df,
            raw_transactions_df
        )

    # Print header DataFrame
    try:
        print("Header Table:")
        print(df_header_clean.to_string(index=False))
    except Exception as e:
        print(f"⚠️ Error printing header table: {e}")

    # Print last few transactions
    try:
        print("\nSample Transactions:")
        print(df_transactions_clean.tail().to_string(index=False))
    except Exception as e:
        print(f"⚠️ Error printing transactions: {e}")


Header Table:
page_id                    account_name account_number              period  total_debit  total_credit  total_debit_transaction  total_credit_transaction address
    1/4 นางสาว กัญญ์ชิสา บัวทองสุขวิทย์  235-2-61470-1 1 Dec 24 - 8 Mar 25     96561.77      87472.52                    121.0                       5.0        
    2/4                            None           None                None          NaN           NaN                      NaN                       NaN        
    3/4                            None           None                None          NaN           NaN                      NaN                       NaN        
    4/4                            None           None                None          NaN           NaN                      NaN                       NaN        

Sample Transactions:
page_id       date  time code channel  debit  credit  balance description      transaction_type
    4/4 2024-12-02 12:29       Mobile  400.0     NaN  4839.67  

In [4]:
import re
import pdfplumber
import pandas as pd
from typing import Optional, List, Dict, Tuple

import config  # import everything; constants are accessed as config.<CONSTANT>


class TTBStatementExtractor:
    """
    - extract_headers_from_pages(pages) → DataFrame of raw header fields (one row per page)
    - extract_transactions_from_pages(pages) → DataFrame of raw transaction dicts
    - clean_dataframes(raw_headers_df, raw_transactions_df) → (df_header_clean, df_transactions_clean)
    """

    @staticmethod
    def clean_float_column(series: pd.Series) -> pd.Series:
        """
        Remove non-numeric characters, fix minus signs, coerce to float.
        """
        def _clean_value(val):
            if pd.isnull(val):
                return None
            text = re.sub(r"[^\d\.-]", "", str(val))
            if "-" in text:
                text = "-" + text.replace("-", "")
            if "." in text:
                main, *rest = text.split(".")
                text = main + "." + "".join(rest)
            return text

        try:
            return pd.to_numeric(series.astype(str).apply(_clean_value), errors="coerce")
        except Exception as e:
            print(f"⚠️ Float cleaning error: {e}")
            return series

    @staticmethod
    def extract_header_from_page(
        page,
        crop_bounds: Dict[str, Tuple[float, float, float, float]]
    ) -> Dict[str, Optional[str]]:
        """
        Crop out each field in crop_bounds; if numeric, apply NUMERIC_REGEX.
        Returns a dict mapping field_name → raw string (or numeric string if matched).
        """
        header_dict: Dict[str, Optional[str]] = {}
        for field_name, bbox in crop_bounds.items():
            try:
                raw_text = page.crop(bbox).extract_text() or ""
                stripped = raw_text.strip()
                if field_name in config.NUMERIC_FIELDS:
                    m = config.NUMERIC_REGEX.search(stripped)
                    header_dict[field_name] = m.group().replace(",", "") if m else None
                else:
                    header_dict[field_name] = stripped
            except Exception as e:
                print(f"⚠️ Error extracting header '{field_name}': {e}")
                header_dict[field_name] = None

        return header_dict

    @staticmethod
    def compute_date_tops(words: List[dict]) -> List[float]:
        """
        Find y-coordinates ("top") of triplets [day, month, year] in the word list.
        """
        tops: List[float] = []
        for i in range(len(words) - 2):
            try:
                d, m, y = words[i], words[i + 1], words[i + 2]
                if (
                    re.match(r"^\d{1,2}$", d["text"])
                    and re.match(r"^[^\s]+$", m["text"])
                    and re.match(r"^\d{2}$", y["text"])
                    and d["x0"] < 100.0
                ):
                    tops.append(d["top"])
            except Exception:
                continue
        return tops

    @staticmethod
    def compute_intervals(date_tops: List[float]) -> List[Tuple[float, float]]:
        """
        Build vertical intervals out of the date_tops list. Each interval covers one row.
        """
        intervals: List[Tuple[float, float]] = []
        for idx, current_top in enumerate(date_tops):
            start = current_top - config.K_Y_MARGIN
            if idx + 1 < len(date_tops):
                end = date_tops[idx + 1] - config.K_Y_MARGIN
            else:
                if idx > 0:
                    prev_gap = current_top - date_tops[idx - 1]
                else:
                    prev_gap = config.K_Y_MARGIN * 2
                end = current_top + prev_gap - config.K_Y_MARGIN
            intervals.append((start, end))
        return intervals

    @staticmethod
    def assign_words_to_rows(
        words: List[dict],
        intervals: List[Tuple[float, float]]
    ) -> List[List[dict]]:
        """
        Given a list of word dictionaries and row‐intervals, place each word into its row.
        Returns a list of rows, where each row is a list of word dicts.
        """
        rows: List[List[dict]] = [[] for _ in intervals]
        for w in words:
            top_val = w["top"]
            for row_idx, (start, end) in enumerate(intervals):
                if start <= top_val < end:
                    rows[row_idx].append(w)
                    break
        return rows

    @staticmethod
    def split_details_into_date_and_details(details_text: str) -> Tuple[str, str]:
        """
        For a string like "10 ม.ค. 65 Purchase ABC", split off first three tokens as date.
        Returns (date_part, rest_of_details).
        """
        tokens = (details_text or "").split()
        if len(tokens) >= 3:
            date_part = " ".join(tokens[:3])
            rest_part = " ".join(tokens[3:])
        else:
            date_part = details_text
            rest_part = ""
        return date_part, rest_part

    @staticmethod
    def normalize_thai_or_eng_date(date_string: str) -> str:
        """
        Convert "D M Y" (Thai or English) into "YYYY-MM-DD". If parsing fails, return original.
        """
        if not isinstance(date_string, str) or not date_string.strip():
            return date_string

        tokens = date_string.strip().replace("  ", " ").split()
        if len(tokens) == 3:
            day_txt, mon_txt, year_txt = tokens
            is_thai = mon_txt in config.THAI_MONTHS

            if is_thai:
                mon_num = config.THAI_MONTHS[mon_txt]
            else:
                mon_num = config.ENG_MONTHS.get(mon_txt[:3].capitalize(), "01")

            try:
                year_int = int(year_txt)
                if is_thai:
                    if year_int < 100:
                        # "65" → 2565 BE → subtract 543 → 2022 CE
                        year_int = year_int + 2500 - 543
                else:
                    if year_int < 100:
                        year_int = year_int + 2000
                return f"{year_int:04d}-{mon_num}-{int(day_txt):02d}"
            except Exception:
                pass

        return date_string

    @staticmethod
    def extract_headers_from_pages(pages) -> pd.DataFrame:
        """
        Loop over all pages, crop out each header field, build a list of dicts,
        then return pd.DataFrame(raw_header_list).
        """
        header_list: List[Dict[str, Optional[str]]] = []

        for page_index, page in enumerate(pages, start=1):
            try:
                # Extract raw header fields via cropping
                header_dict = TTBStatementExtractor.extract_header_from_page(page, config.CROP_BOUNDS)

                # Derive page_id (e.g. "1/10") from header_dict["page"]
                raw_page_field = header_dict.get("page", "")
                m = config.PAGE_ID_REGEX.search(raw_page_field)
                page_id = m.group(1) if m else ""
                header_dict["page_id"] = page_id

                # If not page "1/…", blank out all other fields (keep only page_id)
                if not page_id.startswith("1/"):
                    for key in list(header_dict.keys()):
                        if key != "page_id":
                            header_dict[key] = None

                header_list.append(header_dict)

            except Exception as e:
                print(f"⚠️ Skipping header on page {page_index} due to error: {e}")
                # Build an “empty” header dict with all crop keys set to None + page_id=None
                empty_header = {k: None for k in config.CROP_BOUNDS.keys()}
                empty_header["page_id"] = None
                header_list.append(empty_header)

        df_raw_headers = pd.DataFrame(header_list)
        return df_raw_headers

    @staticmethod
    def extract_transactions_from_pages(pages) -> pd.DataFrame:
        """
        Loop over all pages, find transaction rows via word-level analysis,
        build a list of raw transaction dicts (un-cleaned), then return DataFrame.
        """
        transaction_records: List[Dict[str, Optional[str]]] = []

        for page_index, page in enumerate(pages, start=1):
            try:
                words = page.extract_words(use_text_flow=True)

                # Re-extract header to get page_id
                header_dict = TTBStatementExtractor.extract_header_from_page(page, config.CROP_BOUNDS)
                raw_page_field = header_dict.get("page", "")
                m = config.PAGE_ID_REGEX.search(raw_page_field)
                page_id = m.group(1) if m else ""

                # Compute row intervals
                date_tops = TTBStatementExtractor.compute_date_tops(words)
                if not date_tops:
                    continue  # no transactions on this page

                intervals = TTBStatementExtractor.compute_intervals(date_tops)
                rows_of_words = TTBStatementExtractor.assign_words_to_rows(words, intervals)

                # Parse each row into a raw dict
                for row_words in rows_of_words:
                    if not row_words:
                        continue

                    try:
                        sorted_row = sorted(row_words, key=lambda w: (w["top"], w["x0"]))

                        date_text = ""
                        time_text = ""
                        debit_amount = None
                        credit_amount = None
                        balance_amount = None
                        description_tokens: List[str] = []
                        channel_tokens: List[str] = []
                        details_tokens: List[str] = []

                        for w in sorted_row:
                            x0 = w["x0"]
                            txt = w["text"]

                            if config.DATE_REGEX.match(txt) and config.K_DATE_X0 <= x0 <= config.K_DATE_X1:
                                date_text = txt
                            elif config.TIME_REGEX.match(txt):
                                time_text = txt
                            elif config.MONEY_REGEX.match(txt):
                                val = float(txt.replace(",", ""))
                                if x0 <= config.K_X_SPLIT_AMOUNT_BALANCE + config.K_X_TOLERANCE:
                                    if val < 0:
                                        debit_amount = -val
                                    else:
                                        credit_amount = val
                                elif x0 <= config.K_X_SPLIT_BALANCE_CHANNEL + config.K_X_TOLERANCE:
                                    balance_amount = val
                            elif config.K_DATE_X1 + config.K_X_TOLERANCE < x0 <= config.K_X_SPLIT_DESC_AMOUNT:
                                description_tokens.append(txt)
                            elif config.K_X_SPLIT_DESC_AMOUNT + config.K_X_TOLERANCE < x0 <= config.K_X_SPLIT_CHANNEL_DETAILS:
                                channel_tokens.append(txt)
                            else:
                                details_tokens.append(txt)

                        transaction_records.append({
                            "page_id": page_id,
                            "date": date_text,
                            "time": time_text,
                            "description": "",  # to be filled in cleaning step
                            "withdrawal": debit_amount,
                            "deposit": credit_amount,
                            "balance": balance_amount,
                            "channel": " ".join(channel_tokens).strip(),
                            "details": " ".join(details_tokens).strip(),
                            "transaction_type": " ".join(description_tokens).strip()
                        })

                    except Exception as row_err:
                        print(f"⚠️ Skipping row on page {page_index} due to error: {row_err}")
                        continue

            except Exception as page_err:
                print(f"⚠️ Skipping page {page_index} due to error: {page_err}")
                continue

        df_raw_transactions = pd.DataFrame(transaction_records)
        return df_raw_transactions

    @staticmethod
    def clean_dataframes(
        raw_headers_df: pd.DataFrame,
        raw_transactions_df: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        1) Rename header columns to canonical names
        2) Add placeholder 'address'
        3) Reorder header columns
        4) In transactions:
           - Split 'details' into [date, details]
           - Normalize date strings
           - Rename withdrawal→debit, deposit→credit
           - Add placeholder 'code'
           - Reorder transaction columns
           - Clean float columns
        Returns (df_header_clean, df_transactions_clean).
        """
        df_header = raw_headers_df.copy()
        df_transactions = raw_transactions_df.copy()

        # ── HEADER CLEANUP ────────────────────────────────────────────────────────
        df_header = df_header.rename(columns={
            "total_withdrawal": "total_debit",
            "total_deposit": "total_credit",
            "total_withdrawal_transaction": "total_debit_transaction",
            "total_deposit_transaction": "total_credit_transaction",
        })

        # Add placeholder 'address'
        df_header["address"] = ""

        desired_header_cols = [
            "page_id",
            "account_name",
            "account_number",
            "period",
            "total_debit",
            "total_credit",
            "total_debit_transaction",
            "total_credit_transaction",
            "address"
        ]
        # Only keep columns that exist, in the specified order
        df_header = df_header.reindex(columns=desired_header_cols).copy()

        # Clean float columns in header
        float_cols_hdr = [
            "total_debit",
            "total_credit",
            "total_debit_transaction",
            "total_credit_transaction"
        ]
        for col in float_cols_hdr:
            if col in df_header:
                df_header[col] = TTBStatementExtractor.clean_float_column(df_header[col])

        # ── TRANSACTION CLEANUP ───────────────────────────────────────────────────
        if not df_transactions.empty:
            # 1) Split 'details' column into actual date + remainder
            split_series = df_transactions["details"].apply(
                lambda dt: pd.Series(TTBStatementExtractor.split_details_into_date_and_details(dt))
            )
            df_transactions = df_transactions.copy()
            df_transactions[["date", "details"]] = split_series

            # 2) Normalize the 'date' strings (Thai/ENG → YYYY-MM-DD)
            df_transactions["date"] = df_transactions["date"].apply(
                TTBStatementExtractor.normalize_thai_or_eng_date
            )

        # 3) Rename 'withdrawal'→'debit', 'deposit'→'credit'
        df_transactions = df_transactions.rename(columns={
            "withdrawal": "debit",
            "deposit":    "credit"
        })

        # 4) Add placeholder 'code'
        df_transactions["code"] = ""

        desired_txn_cols = [
            "page_id",
            "date",
            "time",
            "code",
            "channel",
            "debit",
            "credit",
            "balance",
            "description",
            "transaction_type"
        ]
        df_transactions = df_transactions.reindex(columns=desired_txn_cols).copy()

        # 5) Clean float columns in transactions
        float_cols_txn = ["debit", "credit", "balance"]
        for col in float_cols_txn:
            if col in df_transactions:
                df_transactions[col] = TTBStatementExtractor.clean_float_column(df_transactions[col])

        return df_header, df_transactions

    def run(self, pdf_path: str, password: Optional[str] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Opens the PDF at `pdf_path` (with optional `password`), extracts raw headers
        and transactions, cleans them, and returns (df_header_clean, df_transactions_clean).
        """
        with pdfplumber.open(pdf_path, password=password) as pdf:
            pages = pdf.pages

            raw_headers_df = TTBStatementExtractor.extract_headers_from_pages(pages)
            raw_transactions_df = TTBStatementExtractor.extract_transactions_from_pages(pages)
            df_header_clean, df_transactions_clean = TTBStatementExtractor.clean_dataframes(
                raw_headers_df,
                raw_transactions_df
            )

        return df_header_clean, df_transactions_clean


# ────────────────────────────────────────────────────────────────────────────────
# Example "main" usage:
# (If you run `python ttb_statement_extractor.py` directly, it will print out tables.)
# ────────────────────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    import config

    extractor = TTBStatementExtractor()
    df_header_clean, df_transactions_clean = extractor.run(config.PDF_PATH, config.PASSWORD)

    # Print header DataFrame
    try:
        print("Header Table:")
        print(df_header_clean.to_string(index=False))
    except Exception as e:
        print(f"⚠️ Error printing header table: {e}")

    # Print last few transactions
    try:
        print("\nSample Transactions:")
        print(df_transactions_clean.tail().to_string(index=False))
    except Exception as e:
        print(f"⚠️ Error printing transactions: {e}")


Header Table:
page_id                    account_name account_number              period  total_debit  total_credit  total_debit_transaction  total_credit_transaction address
    1/4 นางสาว กัญญ์ชิสา บัวทองสุขวิทย์  235-2-61470-1 1 Dec 24 - 8 Mar 25     96561.77      87472.52                    121.0                       5.0        
    2/4                            None           None                None          NaN           NaN                      NaN                       NaN        
    3/4                            None           None                None          NaN           NaN                      NaN                       NaN        
    4/4                            None           None                None          NaN           NaN                      NaN                       NaN        

Sample Transactions:
page_id       date  time code channel  debit  credit  balance description      transaction_type
    4/4 2024-12-02 12:29       Mobile  400.0     NaN  4839.67  

In [None]:
# ────────────────────────────────────────────────────────────────────────────────
# Example "main" usage:
# (If you run `python ttb_extractor.py` directly, it will print out tables.)
# ────────────────────────────────────────────────────────────────────────────────

if __name__ == "__main__":
    import config

    extractor = TTBExtractor()
    df_header_clean, df_transactions_clean = extractor.run(config.PDF_PATH, config.PASSWORD)

    # Print header DataFrame
    try:
        print("Header Table:")
        print(df_header_clean.to_string(index=False))
    except Exception as e:
        print(f"⚠️ Error printing header table: {e}")

    # Print last few transactions
    try:
        print("\nSample Transactions:")
        print(df_transactions_clean.tail().to_string(index=False))
    except Exception as e:
        print(f"⚠️ Error printing transactions: {e}")


In [1]:
#!/usr/bin/env python3
import os, traceback
import numpy as np
import pandas as pd
from typing import Tuple, List, Dict, Any, Optional
import pdfplumber
from ttb_without_detail_extract import TTBStatementExtractor
# — your existing imports & extraction functions here —
# from your_module import extract_header, extract_transactions

INPUT_FOLDER = "/Users/if658228/Downloads/TTB/no_note"
PASSWORD: Optional[str] = None

def process_folder(input_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame, List[dict]]:
    tx_list, hdr_list, failures = [], [], []
    for fn in os.listdir(input_folder):
        if not fn.lower().endswith(".pdf"):
            continue
        path = os.path.join(input_folder, fn)
        try:
            with pdfplumber.open(path, password=PASSWORD) as pdf:
                pages = pdf.pages
                df_hdr, df_tx = TTBStatementExtractor.run(path,None)

                df_hdr["source_file"] = fn
                df_tx ["source_file"] = fn
                tx_list.append(df_tx)
                hdr_list.append(df_hdr)
        except Exception as e:
            failures.append({
                "file": fn,
                "error": str(e),
                "traceback": traceback.format_exc()
            })
    all_tx  = pd.concat(tx_list, ignore_index=True) if tx_list else pd.DataFrame()
    all_hdr = pd.concat(hdr_list, ignore_index=True) if hdr_list else pd.DataFrame()
    return all_tx, all_hdr, failures

def validate_bbl(df_tx_all: pd.DataFrame, df_hdr_all: pd.DataFrame) -> pd.DataFrame:
    # STEP 1: ensure tx columns are numeric
    tx_clean = df_tx_all.copy()
    tx_clean[['debit','credit']] = (
        tx_clean[['debit','credit']]
        .replace('', np.nan)
        .astype(float)
    )

    # STEP 1.5: ensure header columns are numeric
    for col in ['total_debit_transaction',
                'total_debit',
                'total_credit_transaction',
                'total_credit']:
        df_hdr_all[col] = (
            pd.to_numeric(
                df_hdr_all[col].replace('', np.nan).astype(str).str.replace(',', ''),
                errors='coerce'
            )
        )

    # (then proceed with sums/counts as before)
    sums = (
        tx_clean
        .groupby('source_file')[['debit','credit']]
        .sum(min_count=1)
        .rename(columns={'debit':'sum_debit','credit':'sum_credit'})
    )

    counts = (
        tx_clean
        .groupby('source_file')[['debit','credit']]
        .count()
        .rename(columns={'debit':'count_debit_tx','credit':'count_credit_tx'})
    )

    hdr = (
        df_hdr_all
        .dropna(subset=['total_debit_transaction','total_credit_transaction'])
        .groupby('source_file')
        .agg({
            'total_debit_transaction':'sum',
            'total_debit':'sum',
            'total_credit_transaction':'sum',
            'total_credit':'sum'
        })
    ).rename(columns={
        'total_debit_transaction':'total_debit_txns',
        'total_debit':'total_debit',
        'total_credit_transaction':'total_credit_txns',
        'total_credit':'total_credit'
    })

    cmp = (
        hdr
        .join(sums, how='left')
        .join(counts, how='left')
        .reset_index()
        .rename(columns={'source_file':'file'})
    )

    summary = cmp.assign(
        debit_amount_match = lambda d: np.isclose(d['total_debit'], d['sum_debit'], atol=1e-2),
        credit_amount_match = lambda d: np.isclose(d['total_credit'], d['sum_credit'], atol=1e-2),
        transaction_count_debit_match = lambda d: d['total_debit_txns'] == d['count_debit_tx'],
        transaction_count_credit_match = lambda d: d['total_credit_txns'] == d['count_credit_tx']
    )[
        ['file',
         'total_debit','sum_debit','debit_amount_match',
         'total_credit','sum_credit','credit_amount_match',
         'total_credit_txns','total_debit_txns','count_debit_tx','count_credit_tx',
         'transaction_count_debit_match','transaction_count_credit_match']
    ]

    return summary


if __name__ == "__main__":
    df_tx_all, df_hdr_all, failures = process_folder(INPUT_FOLDER)
    print(f"Processed transactions: {df_tx_all.shape}, headers: {df_hdr_all.shape}")
    if failures:
        print(f"\n⚠️ {len(failures)} failures; inspect `failures` list.")
    if df_tx_all.empty or df_hdr_all.empty:
        print("No data to validate; exiting.")
        exit(1)
    print(df_tx_all,df_hdr_all)
    summary = validate_bbl(df_tx_all, df_hdr_all)
    print("\n--- Validation Summary per File ---")
    print(summary.to_string(index=False))

    bad = summary.loc[~(summary.debit_amount_match
                        & summary.credit_amount_match
                        & summary.transaction_count_debit_match
                        & summary.transaction_count_credit_match)]
    if not bad.empty:
        bad = bad.assign(
            diff_debit = bad['sum_debit'] - bad['total_debit'],
            diff_credit    = bad['sum_credit']    - bad['total_credit']
        )
        print("\n❌ Files with mismatches:")
        print(bad.to_string(index=False))
    else:
        print("\n✅ All files validated successfully!")


Processed transactions: (89896, 11), headers: (2586, 10)
      page_id        date   time code channel   debit  credit   balance  \
0        1/11  2025-03-07  18:22       Mobile  6000.0     NaN  11044.10   
1        1/11  2025-03-07  12:23       Mobile    80.0     NaN  17044.10   
2        1/11  2025-03-07  12:21       Mobile   100.0     NaN  17124.10   
3        1/11  2025-03-07  12:18       Mobile    40.0     NaN  17224.10   
4        1/11  2025-03-06  16:58       Mobile   100.0     NaN  17264.10   
...       ...         ...    ...  ...     ...     ...     ...       ...   
89891     9/9  2024-12-03  08:56       Mobile   100.0     NaN    283.46   
89892     9/9  2024-12-03  08:08       Mobile     NaN   200.0    383.46   
89893     9/9  2024-12-02  18:34       Mobile   100.0     NaN    183.46   
89894     9/9  2024-12-01  07:58       Mobile   100.0     NaN    283.46   
89895     9/9  2024-12-01  07:54       Mobile   200.0     NaN    383.46   

      description           transaction_ty

In [2]:
bad

Unnamed: 0,file,total_debit,sum_debit,debit_amount_match,total_credit,sum_credit,credit_amount_match,total_credit_txns,total_debit_txns,count_debit_tx,count_credit_tx,transaction_count_debit_match,transaction_count_credit_match


In [3]:
summary

Unnamed: 0,file,total_debit,sum_debit,debit_amount_match,total_credit,sum_credit,credit_amount_match,total_credit_txns,total_debit_txns,count_debit_tx,count_credit_tx,transaction_count_debit_match,transaction_count_credit_match
0,108988-03995675-2568_0_TTB.pdf,307717.01,307717.01,True,300254.00,300254.00,True,40.0,288.0,288,40,True,True
1,108988-03995675-2568_1_TTB.pdf,307717.01,307717.01,True,300254.00,300254.00,True,40.0,288.0,288,40,True,True
2,108988-03995798-2568_0_TTB.pdf,814763.24,814763.24,True,817393.15,817393.15,True,55.0,302.0,302,55,True,True
3,108988-03995798-2568_1_TTB.pdf,814763.24,814763.24,True,817393.15,817393.15,True,55.0,302.0,302,55,True,True
4,108988-03995970-2568_0_TTB.pdf,1075915.27,1075915.27,True,1064839.84,1064839.84,True,152.0,488.0,488,152,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
264,108988-04052511-2568_0_TTB.pdf,281572.77,281572.77,True,275852.98,275852.98,True,80.0,231.0,231,80,True,True
265,108988-04073052-2568_0_TTB.pdf,122505.63,122505.63,True,121224.04,121224.04,True,93.0,433.0,433,93,True,True
266,108988-04099021-2568_0_TTB.pdf,371358.44,371358.44,True,368716.77,368716.77,True,52.0,172.0,172,52,True,True
267,108988-04105755-2568_0_TTB.pdf,628227.22,628227.22,True,618098.31,618098.31,True,122.0,306.0,306,122,True,True
