In [24]:
# extractor.py

import re
import pdfplumber
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Optional, Tuple
from tabulate import tabulate

# Import everything from config at once—our static methods will refer to config.<CONSTANT>
import config


class BBLStatementExtractor:
    """
    Extract headers and transactions from a BBL PDF statement.

    All helper methods are @staticmethod.  Only run() accepts
    a pdf_path and password as arguments.  Layout constants (crop boxes,
    column split positions, regex patterns, footer keys) are read from the
    ``config`` module.
    """

    # Compile regexes once at the class level so static methods can use them:
    _date_regex = re.compile(config.DATE_REGEX_PATTERN)
    _time_regex = re.compile(config.TIME_REGEX_PATTERN)
    _money_regex = re.compile(config.MONEY_REGEX_PATTERN)
    _page_regex = re.compile(config.PAGE_REGEX_PATTERN)

    # A footer number must start with a digit, so stray punctuation (such as a
    # lone ",") is never handed to float().
    _footer_number_regex = re.compile(r"\d[\d,]*(?:\.\d{2})?")

    # Footer line prefix -> (transaction-count field, total-amount field).
    _footer_total_fields = {
        "จํานวนรายการถอน": ("total_withdrawal_transaction", "total_withdrawal"),
        "จํานวนรายการฝาก": ("total_deposit_transaction", "total_deposit"),
    }

    # Keys of a raw transaction record, in the order the records are built.
    _transaction_record_columns = [
        "page_id",
        "date",
        "time",
        "description",
        "channel",
        "withdrawal",
        "deposit",
        "balance",
        "transaction_type",
    ]

    @staticmethod
    def extract_page_id(page: pdfplumber.page.Page) -> str:
        """
        Crop and parse the page ID for pagination like '1/5'.

        The bottom-right corner of the page (180 wide, 25 high) is cropped and
        searched with config.PAGE_REGEX_PATTERN; the first two regex groups
        are joined with '/'.

        Returns:
            The page ID string, or "" when nothing matches or the crop/text
            extraction raises (this lookup is deliberately best-effort).
        """
        try:
            width, height = page.width, page.height
            # coordinates: (x0, y0, x1, y1)
            crop_box = (width - 180, height - 25, width, height)
            raw_text = page.crop(crop_box).extract_text() or ""
            single_line = raw_text.replace("\n", " ")
            match = BBLStatementExtractor._page_regex.search(single_line)
            return f"{match.group(1)}/{match.group(2)}" if match else ""
        except Exception:
            return ""

    @staticmethod
    def extract_headers(pages: List[pdfplumber.page.Page]) -> List[Dict[str, Any]]:
        """
        Extract header information from each page.

        Each page is cropped at the boxes in config.CROPS (one field per box).
        When any of config.FOOTER_KEYS appears in the page text, the lines
        starting with the withdrawal/deposit footer labels are parsed for the
        transaction count (first number) and the total amount (second number).

        Returns:
            One dict per successfully processed page.  A page whose
            processing raises is reported on stdout and skipped.
        """
        header_records: List[Dict[str, Any]] = []
        number_regex = BBLStatementExtractor._footer_number_regex
        footer_fields = BBLStatementExtractor._footer_total_fields

        for page in pages:
            page_id = BBLStatementExtractor.extract_page_id(page)
            try:
                full_text = page.extract_text() or ""
                record: Dict[str, Any] = {"page_id": page_id}

                # For each field defined in config.CROPS, crop and strip text.
                for field_name, box in config.CROPS.items():
                    text = page.crop(box).extract_text() or ""
                    record[field_name] = text.strip().replace("\n", " ")

                # If any of the FOOTER_KEYS appear in the page text, parse totals.
                if any(key in full_text for key in config.FOOTER_KEYS):
                    for line in full_text.splitlines():
                        for prefix, (count_field, amount_field) in footer_fields.items():
                            if not line.startswith(prefix):
                                continue
                            # Only the first two numbers on the line are used.
                            numbers = [
                                float(token.replace(",", ""))
                                for token in number_regex.findall(line)[:2]
                            ]
                            record[count_field] = numbers[0] if numbers else None
                            record[amount_field] = numbers[1] if len(numbers) > 1 else None
                            break

                header_records.append(record)

            except Exception as error:
                print(f"⚠️  Skipping page {page_id} due to error: {error}")
                continue

        return header_records

    @staticmethod
    def _parse_transaction_row(
        row: List[Dict[str, Any]],
        page_id: str,
    ) -> Optional[Dict[str, Any]]:
        """
        Turn the words of one table row into a transaction record.

        Returns None when the row text contains any of config.FOOTER_KEYS
        (i.e. it is a totals line, not a transaction).
        """
        date_regex = BBLStatementExtractor._date_regex
        time_regex = BBLStatementExtractor._time_regex
        money_regex = BBLStatementExtractor._money_regex

        row_text = " ".join(w["text"] for w in row)
        if any(key in row_text for key in config.FOOTER_KEYS):
            return None

        # Sort words by their vertical (then horizontal) positions:
        sorted_row = sorted(row, key=lambda w: (w["top"], w["x0"]))

        # Find the date word in this row
        date_word = next(
            (w for w in sorted_row if date_regex.match(w["text"])),
            None,
        )
        date_value = (
            pd.to_datetime(
                date_word["text"],
                format="%d/%m/%y",
                dayfirst=True,
                errors="coerce",
            )
            if date_word
            else None
        )

        # Find a time word within 20 units of the date's 'top'
        # (or of 0 when the row has no date word):
        date_top = date_word["top"] if date_word else 0
        time_word = next(
            (
                w
                for w in sorted_row
                if time_regex.match(w["text"]) and abs(w["top"] - date_top) < 20
            ),
            None,
        )
        time_value = time_word["text"] if time_word else ""

        description_tokens: List[str] = []
        channel_via_tokens: List[str] = []
        amount_tokens: List[Tuple[str, float]] = []  # (text, right edge x1)
        balance_tokens: List[str] = []

        for w in sorted_row:
            text, x0, x1 = w["text"], w["x0"], w["x1"]

            # Skip over actual date/time tokens
            if date_regex.match(text) or time_regex.match(text):
                continue

            # Money-looking tokens: left of the CHANNEL/DC split they are a
            # withdrawal/deposit amount, otherwise the running balance.
            if money_regex.match(text):
                if x1 <= config.X_SPLIT_CHANNEL_DC + config.X_TOLERANCE:
                    amount_tokens.append((text, x1))
                else:
                    balance_tokens.append(text)
                continue

            # Everything else is description text unless it starts at or
            # beyond the channel/via column.
            if x0 <= config.X_SPLIT_CODE_CHANNEL + config.X_TOLERANCE:
                description_tokens.append(text)
            elif x0 >= config.X_SPLIT_CHANNEL_VIA - config.X_TOLERANCE:
                channel_via_tokens.append(text)
            else:
                description_tokens.append(text)

        withdrawal_amount = None
        deposit_amount = None
        # The right edge decides the column; a later token in the same column
        # overwrites an earlier one.
        for value_str, x1 in amount_tokens:
            value = float(value_str.replace(",", ""))
            if x1 <= config.X_SPLIT_WITHDRAW_DEP + config.X_TOLERANCE:
                withdrawal_amount = value
            else:
                deposit_amount = value

        balance_amount = (
            float(balance_tokens[-1].replace(",", "")) if balance_tokens else None
        )

        return {
            "page_id": page_id,
            "date": date_value,
            "time": time_value,
            # "description" is left blank; the printed description text is
            # stored under "transaction_type".
            "description": "",
            "channel": " ".join(channel_via_tokens).strip(),
            "withdrawal": withdrawal_amount,
            "deposit": deposit_amount,
            "balance": balance_amount,
            "transaction_type": " ".join(description_tokens).strip(),
        }

    @staticmethod
    def extract_transactions(pages: List[pdfplumber.page.Page]) -> List[Dict[str, Any]]:
        """
        Extract transaction rows from the table region on each page.

        All words in the table crop (config.TABLE_CROP_BOX, clipped to the
        page's right/bottom edges) are detected, a new row is started at every
        word matching the date regex, and each row is parsed into date, time,
        channel, withdrawal/deposit, balance and transaction type.

        Returns:
            One dict per parsed row across all pages.  If processing a page
            raises, the page is reported on stdout and the remaining rows of
            that page are skipped; rows already parsed are kept.
        """
        transaction_records: List[Dict[str, Any]] = []
        date_regex = BBLStatementExtractor._date_regex

        for page in pages:
            try:
                page_id = BBLStatementExtractor.extract_page_id(page)

                # Take the crop box defined in config.TABLE_CROP_BOX,
                # but make sure it fits within the page dimensions:
                crop_x0, crop_y0, crop_x1, crop_y1 = config.TABLE_CROP_BOX
                safe_crop_box: Tuple[float, float, float, float] = (
                    crop_x0,
                    crop_y0,
                    min(crop_x1, page.width),
                    min(crop_y1, page.height),
                )
                region = page.crop(safe_crop_box)

                # Extract words with a small tolerance so we can cluster them into rows:
                words = region.extract_words(
                    x_tolerance=3,
                    y_tolerance=3,
                    use_text_flow=True,
                )

                # Every 'top' coordinate of a date-like word starts a row:
                row_tops = sorted(
                    w["top"] for w in words if date_regex.match(w["text"])
                )
                print(f"Page {page_id}: found {len(row_tops)} date entries")

                if not row_tops:
                    # No date => no transactions on this page
                    continue

                # Each row spans from its own top - Y_MARGIN up to the next
                # row's top - Y_MARGIN; the last row ends 15 below its top.
                following_tops = row_tops[1:] + [None]
                intervals = [
                    (
                        top - config.Y_MARGIN,
                        next_top - config.Y_MARGIN if next_top is not None else top + 15,
                    )
                    for top, next_top in zip(row_tops, following_tops)
                ]

                # Allocate each word to the first interval containing its top;
                # words outside every interval are dropped.
                rows: List[List[Dict[str, Any]]] = [[] for _ in intervals]
                for w in words:
                    for idx, (top_min, top_max) in enumerate(intervals):
                        if top_min <= w["top"] < top_max:
                            rows[idx].append(w)
                            break

                for row in rows:
                    if not row:
                        continue
                    record = BBLStatementExtractor._parse_transaction_row(row, page_id)
                    if record is not None:
                        transaction_records.append(record)

                print(f"Page {page_id}: extracted {len(transaction_records)} transactions so far")

            except Exception as error:
                print(f"⚠️ Skipping transactions on page {getattr(page, 'page_number', '?')}: {error}")
                continue

        return transaction_records

    @staticmethod
    def _to_float_series(series: pd.Series) -> pd.Series:
        """
        Return *series* as float64.

        Numeric columns are cast directly.  Round-tripping them through str
        would corrupt values rendered in scientific notation ("1e-05" would
        become 105.0) and drop minus signs.  Any other column is treated as
        text: every character except digits and "." is removed and empty
        results become NaN.
        """
        if pd.api.types.is_numeric_dtype(series):
            return series.astype(float)
        return (
            series
            .astype(str)
            .str.replace(r"[^\d.]", "", regex=True)
            .replace({"": np.nan})
            .astype(float)
        )

    @staticmethod
    def clean_all_floats(
        header_dataframe: pd.DataFrame,
        transaction_dataframe: pd.DataFrame,
    ) -> None:
        """
        Convert the amount/total columns of both DataFrames to float.

        This mutates header_dataframe and transaction_dataframe in-place.
        Columns that are absent are skipped.  Transaction columns handled:
        withdrawal, deposit, balance.  Header columns handled: the four
        total_withdrawal*/total_deposit* fields.
        """
        float_columns_transactions = ["withdrawal", "deposit", "balance"]
        for column in float_columns_transactions:
            if column in transaction_dataframe.columns:
                transaction_dataframe[column] = BBLStatementExtractor._to_float_series(
                    transaction_dataframe[column]
                )

        float_columns_headers = [
            "total_withdrawal_transaction",
            "total_withdrawal",
            "total_deposit_transaction",
            "total_deposit",
        ]
        for column in float_columns_headers:
            if column in header_dataframe.columns:
                header_dataframe[column] = BBLStatementExtractor._to_float_series(
                    header_dataframe[column]
                )

    @staticmethod
    def build_and_clean_dataframes(
        header_records: List[Dict[str, Any]],
        transaction_records: List[Dict[str, Any]],
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Build the final header and transaction DataFrames.

        1. Build DataFrames from raw header_records and transaction_records.
        2. Drop blank or malformed page_ids from the headers.
        3. Rename columns so 'withdrawal'→'debit', 'deposit'→'credit', etc.
        4. Keep only transactions with a withdrawal or a deposit, reorder the
           columns, then clean numeric fields via clean_all_floats().

        Empty record lists yield empty DataFrames with the expected columns
        instead of raising KeyError.

        Returns:
            (header_dataframe, transaction_dataframe)
        """
        header_dataframe = pd.DataFrame(header_records)
        transaction_dataframe = pd.DataFrame(transaction_records)

        # A frame built from no records has no columns at all; give it the
        # columns the steps below index into.
        if header_dataframe.empty:
            header_dataframe = pd.DataFrame(columns=["page_id"])
        if transaction_dataframe.empty:
            transaction_dataframe = pd.DataFrame(
                columns=BBLStatementExtractor._transaction_record_columns
            )

        # Drop any header rows where page_id is blank or whitespace
        header_dataframe = header_dataframe[
            header_dataframe["page_id"].str.strip() != ""
        ].copy()

        # Ensure an 'address' column always exists, without overwriting one
        # that was actually extracted.
        if "address" not in header_dataframe.columns:
            header_dataframe["address"] = ""

        # Rename total fields
        header_dataframe = header_dataframe.rename(
            columns={
                "total_withdrawal": "total_debit",
                "total_deposit": "total_credit",
                "total_withdrawal_transaction": "total_debit_transaction",
                "total_deposit_transaction": "total_credit_transaction",
            }
        ).reset_index(drop=True)

        # Filter out any page_ids that don't start with a digit (malformed)
        header_dataframe = header_dataframe[
            header_dataframe["page_id"].str.match(r"^\d", na=False)
        ].copy()

        # Keep only rows in transaction_dataframe that have either a withdrawal or deposit
        has_amount = (
            transaction_dataframe["withdrawal"].notnull()
            | transaction_dataframe["deposit"].notnull()
        )
        transaction_dataframe = transaction_dataframe[has_amount].copy()

        # Rename columns to standard names:
        transaction_dataframe = transaction_dataframe.rename(
            columns={"withdrawal": "debit", "deposit": "credit"}
        )

        # Reorder columns into a fixed schema
        transaction_dataframe = transaction_dataframe[
            [
                "page_id",
                "date",
                "time",
                "description",
                "channel",
                "debit",
                "credit",
                "balance",
                "transaction_type",
            ]
        ].copy()

        # Finally, clean all the numeric columns in both DataFrames.  The
        # transaction columns are already renamed here, so only 'balance' and
        # any header totals that kept their original names are affected.
        BBLStatementExtractor.clean_all_floats(header_dataframe, transaction_dataframe)

        return header_dataframe, transaction_dataframe

    @staticmethod
    def run(
        pdf_path: str,
        password: Optional[str] = None,
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Execute extraction and return cleaned DataFrames.

        This is the only method that requires you to pass in the PDF path and password.

        Args:
            pdf_path: Path of the statement PDF.
            password: Password forwarded to pdfplumber.open(); None for
                unprotected files.

        Returns:
            (header_dataframe, transaction_dataframe) as produced by
            build_and_clean_dataframes().
        """
        # All page access happens inside the 'with' so the PDF is still open.
        with pdfplumber.open(pdf_path, password=password) as pdf:
            pages = pdf.pages
            header_list = BBLStatementExtractor.extract_headers(pages)
            transaction_list = BBLStatementExtractor.extract_transactions(pages)

        return BBLStatementExtractor.build_and_clean_dataframes(
            header_list, transaction_list
        )


# Example usage:
# ---------------------------------------------
# from extractor import BBLStatementExtractor
#
# pdf_file = "/path/to/your/BBL_statement.pdf"
# pwd = "your_optional_password"
# headers_df, transactions_df = BBLStatementExtractor.run(pdf_file, pwd)
#
# print(f"Total headers extracted: {len(headers_df)}")
# print(tabulate(headers_df, headers='keys', tablefmt='psql', showindex=False))
# print(f"Total transactions extracted: {len(transactions_df)}")
# print(tabulate(transactions_df, headers='keys', tablefmt='psql', showindex=False))


In [19]:
headers_df

Unnamed: 0,page_id,account_name,account_number,period,total_debit_transaction,total_debit,total_credit_transaction,total_credit,address
0,1/2,น.ส. มัณฑนา ปณะจิตร,093-7-33805-1,01/05/2023 - 18/08/2023,,,,,
1,2/2,น.ส. มัณฑนา ปณะจิตร,093-7-33805-1,01/05/2023 - 18/08/2023,44.0,62325.0,23.0,62324.83,


In [20]:
transactions_df

Unnamed: 0,page_id,date,time,description,channel,debit,credit,balance,transaction_type
1,1/2,2023-05-12,,,Auto,,9625.54,9626.23,SALARY
2,1/2,2023-05-12,,,mPhone,9626.0,,0.23,TRF. PROMPTPAY
3,1/2,2023-05-19,,,mPhone,,5000.00,5000.23,TRF FR OTH BK
4,1/2,2023-05-19,,,mPhone,5000.0,,0.23,TRF TO OTH BK
5,1/2,2023-05-24,,,mPhone,,500.00,500.23,TRF FR OTH BK
...,...,...,...,...,...,...,...,...,...
63,2/2,2023-08-11,,,mPhone,1000.0,,4679.52,TRF. PROMPTPAY
64,2/2,2023-08-11,,,mPhone,,1000.00,5679.52,TRF FR OTH BK
65,2/2,2023-08-12,,,mPhone,179.0,,5500.52,TRF. PROMPTPAY
66,2/2,2023-08-12,,,mPhone,200.0,,5300.52,TRF. PROMPTPAY


In [1]:
#!/usr/bin/env python3
import os, traceback
import numpy as np
import pandas as pd
from typing import Tuple, List, Dict, Any, Optional
import pdfplumber
from bbl_extractor import BBLStatementExtractor

# — your existing imports & extraction functions here —
# from your_module import extract_header, extract_transactions

INPUT_FOLDER = "/Users/if658228/Downloads/OneDrive_1_5-20-2025/agentic_extraction/Dataset04/BBL"
PASSWORD: Optional[str] = None

def process_folder(input_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame, List[dict]]:
    """
    Run BBLStatementExtractor over every PDF directly inside *input_folder*.

    Files whose name does not end in ".pdf" (case-insensitive) are ignored.
    Each resulting DataFrame gets a "source_file" column holding the file name.

    Args:
        input_folder: Directory to scan (not recursive).

    Returns:
        (all_transactions, all_headers, failures).  The two DataFrames are the
        per-file results concatenated (empty DataFrames when no file
        succeeded); failures is a list of dicts with keys "file", "error" and
        "traceback", one per file that raised.
    """
    tx_list, hdr_list, failures = [], [], []
    for fn in os.listdir(input_folder):
        if not fn.lower().endswith(".pdf"):
            continue
        path = os.path.join(input_folder, fn)
        try:
            # run() opens the PDF itself, so no separate pdfplumber.open() is
            # needed here; the configured PASSWORD is forwarded so protected
            # files are readable by the extractor too.
            df_hdr, df_tx = BBLStatementExtractor.run(path, PASSWORD)
            df_hdr["source_file"] = fn
            df_tx["source_file"] = fn

            tx_list.append(df_tx)
            hdr_list.append(df_hdr)
        except Exception as e:
            # Per-file boundary: record the failure and keep processing.
            failures.append({
                "file": fn,
                "error": str(e),
                "traceback": traceback.format_exc()
            })
    all_tx = pd.concat(tx_list, ignore_index=True) if tx_list else pd.DataFrame()
    all_hdr = pd.concat(hdr_list, ignore_index=True) if hdr_list else pd.DataFrame()
    return all_tx, all_hdr, failures

def validate_bbl(df_tx_all: pd.DataFrame, df_hdr_all: pd.DataFrame) -> pd.DataFrame:
    """
    Compare, per source file, the extracted transactions with the header totals.

    For every file that has at least one header row with both transaction
    counts present, the summed debit/credit amounts and the number of
    non-null debit/credit entries are checked against the header totals.
    Amounts match when they agree within np.isclose(..., atol=1e-2); counts
    must be equal.

    Args:
        df_tx_all: Transactions with 'source_file', 'debit' and 'credit'
            columns ('' is treated as missing).
        df_hdr_all: Headers with 'source_file' and the four 'total_*' columns.

    Returns:
        One row per validated file with the totals, the computed sums and
        counts, and four boolean *_match columns.
    """
    amount_columns = ['debit', 'credit']
    header_renames = {
        'total_debit_transaction': 'total_debit_txns',
        'total_debit': 'total_debit',
        'total_credit_transaction': 'total_credit_txns',
        'total_credit': 'total_credit',
    }

    # Work on a copy so the caller's frame keeps its original dtypes.
    transactions = df_tx_all.copy()
    transactions[amount_columns] = (
        transactions[amount_columns].replace('', np.nan).astype(float)
    )
    amounts_by_file = transactions.groupby('source_file')[amount_columns]

    # min_count=1 keeps an all-missing column as NaN instead of 0.
    sums = amounts_by_file.sum(min_count=1).rename(
        columns={'debit': 'sum_debit', 'credit': 'sum_credit'}
    )
    counts = amounts_by_file.count().rename(
        columns={'debit': 'count_debit_tx', 'credit': 'count_credit_tx'}
    )

    # Only header rows that carry both transaction counts contribute totals.
    rows_with_totals = df_hdr_all.dropna(
        subset=['total_debit_transaction', 'total_credit_transaction']
    )
    header_totals = (
        rows_with_totals
        .groupby('source_file')
        .agg(dict.fromkeys(header_renames, 'sum'))
        .rename(columns=header_renames)
    )

    # Left joins: only files present in the header totals are reported.
    merged = header_totals.join(sums, how='left').join(counts, how='left')
    merged = merged.reset_index().rename(columns={'source_file': 'file'})

    merged['debit_amount_match'] = np.isclose(
        merged['total_debit'], merged['sum_debit'], atol=1e-2
    )
    merged['credit_amount_match'] = np.isclose(
        merged['total_credit'], merged['sum_credit'], atol=1e-2
    )
    merged['transaction_count_debit_match'] = (
        merged['total_debit_txns'] == merged['count_debit_tx']
    )
    merged['transaction_count_credit_match'] = (
        merged['total_credit_txns'] == merged['count_credit_tx']
    )

    report_columns = [
        'file',
        'total_debit', 'sum_debit', 'debit_amount_match',
        'total_credit', 'sum_credit', 'credit_amount_match',
        'total_credit_txns', 'total_debit_txns',
        'count_debit_tx', 'count_credit_tx',
        'transaction_count_debit_match', 'transaction_count_credit_match',
    ]
    return merged[report_columns]

# Script entry point: extract every PDF in INPUT_FOLDER, validate the extracted
# transactions against the statement totals, and print a per-file report.
if __name__ == "__main__":
    df_tx_all, df_hdr_all, failures = process_folder(INPUT_FOLDER)
    print(f"Processed transactions: {df_tx_all.shape}, headers: {df_hdr_all.shape}")
    if failures:
        print(f"\n⚠️ {len(failures)} failures; inspect `failures` list.")
    if df_tx_all.empty or df_hdr_all.empty:
        print("No data to validate; exiting.")
        # `exit` is the interactive helper injected by the `site` module and
        # is not meant for programs; raise SystemExit directly instead.
        raise SystemExit(1)
    print(df_tx_all,df_hdr_all)
    summary = validate_bbl(df_tx_all, df_hdr_all)
    print("\n--- Validation Summary per File ---")
    print(summary.to_string(index=False))

    # A file is "bad" when any of the four checks failed.
    bad = summary.loc[~(summary.debit_amount_match
                        & summary.credit_amount_match
                        & summary.transaction_count_debit_match
                        & summary.transaction_count_credit_match)]
    if not bad.empty:
        # Signed differences: extracted sum minus the total from the header.
        bad = bad.assign(
            diff_debit = bad['sum_debit'] - bad['total_debit'],
            diff_credit    = bad['sum_credit']    - bad['total_credit']
        )
        print("\n❌ Files with mismatches:")
        print(bad.to_string(index=False))
    else:
        print("\n✅ All files validated successfully!")


Page 1/5: found 36 date entries
Page 1/5: extracted 36 transactions so far
Page 2/5: found 36 date entries
Page 2/5: extracted 72 transactions so far
Page 3/5: found 36 date entries
Page 3/5: extracted 108 transactions so far
Page 4/5: found 36 date entries
Page 4/5: extracted 144 transactions so far
Page 5/5: found 2 date entries
Page 5/5: extracted 146 transactions so far
Page 1/2: found 36 date entries
Page 1/2: extracted 36 transactions so far
Page 2/2: found 7 date entries
Page 2/2: extracted 43 transactions so far
Page 1/1: found 18 date entries
Page 1/1: extracted 18 transactions so far
Page 1/3: found 36 date entries
Page 1/3: extracted 36 transactions so far
Page 2/3: found 36 date entries
Page 2/3: extracted 72 transactions so far
Page 3/3: found 7 date entries
Page 3/3: extracted 79 transactions so far
Page 1/9: found 36 date entries
Page 1/9: extracted 36 transactions so far
Page 2/9: found 36 date entries
Page 2/9: extracted 72 transactions so far
Page 3/9: found 36 date e

In [2]:
summary

Unnamed: 0,file,total_debit,sum_debit,debit_amount_match,total_credit,sum_credit,credit_amount_match,total_credit_txns,total_debit_txns,count_debit_tx,count_credit_tx,transaction_count_debit_match,transaction_count_credit_match
0,108988-02009141-2566_1_BBL.pdf,478572.75,478572.75,True,478619.38,478619.38,True,340.0,1183.0,1183,340,True,True
1,108988-02009467-2566_1_BBL.pdf,707682.26,707682.26,True,706536.75,706536.75,True,148.0,797.0,797,148,True,True
2,108988-02009558-2566_1_BBL.pdf,519756.79,519756.79,True,519700.46,519700.46,True,291.0,392.0,392,291,True,True
3,108988-02009782-2566_3_BBL.pdf,57072.89,57072.89,True,47087.89,47087.89,True,5.0,29.0,29,5,True,True
4,108988-02009839-2566_1_BBL.pdf,393083.29,393083.29,True,393118.90,393118.90,True,140.0,304.0,304,140,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,มีรูปถ่าย.pdf,250976.37,250976.37,True,328275.50,328275.50,True,216.0,209.0,209,216,True,True
285,มีรูปแคป.pdf,257830.26,996966.47,False,257797.43,257797.43,True,19.0,77.0,730,19,False,True
286,มีสแกน.pdf,102382.35,102382.35,True,102324.62,102324.62,True,15.0,102.0,102,15,True,True
287,มีเอกสารอื่นปน4.pdf,515179.94,515179.94,True,514141.68,514141.68,True,181.0,338.0,338,181,True,True


In [3]:
bad

Unnamed: 0,file,total_debit,sum_debit,debit_amount_match,total_credit,sum_credit,credit_amount_match,total_credit_txns,total_debit_txns,count_debit_tx,count_credit_tx,transaction_count_debit_match,transaction_count_credit_match,diff_debit,diff_credit
282,มีscbปน.pdf,96212.53,361330.71,False,85172.55,1713216.32,False,35.0,67.0,452,419,False,False,265118.18,1628044.0
283,มีscbปน2.pdf,52595.0,193814.14,False,52595.02,52595.02,True,6.0,7.0,153,6,False,True,141219.14,7.275958e-12
285,มีรูปแคป.pdf,257830.26,996966.47,False,257797.43,257797.43,True,19.0,77.0,730,19,False,True,739136.21,0.0


In [4]:
failures

[]

In [9]:
df_tx_all

Unnamed: 0,page_id,date,time,description,channel,debit,credit,balance,transaction_type,source_file
0,1/5,2023-03-03,,,mPhone,250.0,,2450.3,PMT FOR GOODS,108988-02024502-2566_1_BBL.pdf
1,1/5,2023-03-04,,,mPhone,300.0,,2150.3,PMT. PROMPTPAY,108988-02024502-2566_1_BBL.pdf
2,1/5,2023-03-04,,,mPhone,20.0,,2130.3,TRF. PROMPTPAY,108988-02024502-2566_1_BBL.pdf
3,1/5,2023-03-04,,,mPhone,20.0,,2110.3,PMT. PROMPTPAY,108988-02024502-2566_1_BBL.pdf
4,1/5,2023-03-05,,,mPhone,100.0,,2010.3,TRF. PROMPTPAY,108988-02024502-2566_1_BBL.pdf
...,...,...,...,...,...,...,...,...,...,...
84476,6/6,2023-08-02,,,ATMoth,800.0,,0.5,CASH ATM W/D,108988-02023058-2566_2_BBL.pdf
84477,6/6,2023-08-07,,,mPhone,,2000.0,2000.5,TRF FR OTH BK,108988-02023058-2566_2_BBL.pdf
84478,6/6,2023-08-07,,,mPhone,500.0,,1500.5,TRF TO OTH BK,108988-02023058-2566_2_BBL.pdf
84479,6/6,2023-08-07,,,mPhone,500.0,,1000.5,TRF TO OTH BK,108988-02023058-2566_2_BBL.pdf


In [10]:
df_hdr_all

Unnamed: 0,page_id,account_name,account_number,period,total_debit_transaction,total_debit,total_credit_transaction,total_credit,address,source_file
0,1/5,นาย ธนากร อุนจิตต,087-7-56931-9,01/03/2023 - 16/08/2023,,,,,,108988-02024502-2566_1_BBL.pdf
1,2/5,นาย ธนากร อุนจิตต,087-7-56931-9,01/03/2023 - 16/08/2023,,,,,,108988-02024502-2566_1_BBL.pdf
2,3/5,นาย ธนากร อุนจิตต,087-7-56931-9,01/03/2023 - 16/08/2023,,,,,,108988-02024502-2566_1_BBL.pdf
3,4/5,นาย ธนากร อุนจิตต,087-7-56931-9,01/03/2023 - 16/08/2023,,,,,,108988-02024502-2566_1_BBL.pdf
4,5/5,นาย ธนากร อุนจิตต,087-7-56931-9,01/03/2023 - 16/08/2023,127.0,102104.33,18.0,99404.03,,108988-02024502-2566_1_BBL.pdf
...,...,...,...,...,...,...,...,...,...,...
2504,2/6,นาย พิพัฒพงค กมลผัน,068-8-11675-5,01/05/2023 - 18/08/2023,,,,,,108988-02023058-2566_2_BBL.pdf
2505,3/6,นาย พิพัฒพงค กมลผัน,068-8-11675-5,01/05/2023 - 18/08/2023,,,,,,108988-02023058-2566_2_BBL.pdf
2506,4/6,นาย พิพัฒพงค กมลผัน,068-8-11675-5,01/05/2023 - 18/08/2023,,,,,,108988-02023058-2566_2_BBL.pdf
2507,5/6,นาย พิพัฒพงค กมลผัน,068-8-11675-5,01/05/2023 - 18/08/2023,,,,,,108988-02023058-2566_2_BBL.pdf
