In [None]:
import re
from typing import Optional, List, Dict, Any, Tuple

import pandas as pd
import pdfplumber

import config  # Imports all constants and regex patterns


class SCBwithnoteStatementExtractor:
    """
    SCBwithnoteStatementExtractor encapsulates:
      1. Header extraction (account name, account number, period, totals)
      2. Transaction extraction (date, time, code, channel, debit, credit, balance, description)
      3. Cleanup to coerce numeric strings into floats
    """

    @staticmethod
    def clean_page_id(raw_text: str) -> str:
        """
        Extracts and normalizes page ID from a string of text.
        Returns "n/m" if found, else "".
        """
        found_numbers = re.findall(r"\d+", raw_text)
        if len(found_numbers) >= 2:
            candidate = f"{found_numbers[0]}/{found_numbers[1]}"
            if re.fullmatch(r"\d+/\d+", candidate):
                return candidate
        return ""

    @staticmethod
    def extract_page_id_from_page(page: pdfplumber.page.Page) -> str:
        """
        Crops a small region where the page ID is printed (e.g. "1/7"), then cleans it.
        """
        cropped = page.crop(config.PAGE_ID_CROP_BOX)
        raw_text = cropped.extract_text() or ""
        return SCBwithnoteStatementExtractor.clean_page_id(raw_text.strip())

    @staticmethod
    def find_time_for_row(
        date_word: Dict[str, Any],
        all_words: List[Dict[str, Any]]
    ) -> str:
        """
        Given a dict for a date word, search below it (within 20 pixels)
        for a TIME_REGEX match and return that time string.
        """
        if date_word is None:
            return ""
        date_top = date_word.get("top", 0)
        for word in all_words:
            text = word.get("text", "")
            if config.TIME_REGEX.match(text):
                vertical_offset = word["top"] - date_top
                if 0 < vertical_offset <= 20:
                    return text
        return ""

    @staticmethod
    def extract_transactions_from_pages(
        pages: List[pdfplumber.page.Page]
    ) -> pd.DataFrame:
        """
        Goes through all pages, finds transaction rows, and builds a DataFrame of:
          [page_id, date, time, code, channel, debit, credit, balance, description]
        Any errors on a given page are caught and printed; extraction then continues.
        """
        transaction_records: List[Dict[str, Any]] = []

        for page_index, page in enumerate(pages, start=1):
            try:
                page_id = SCBwithnoteStatementExtractor.extract_page_id_from_page(page)
                table_region = page.crop(config.TABLE_CROP_BOX)
                all_words = table_region.extract_words(use_text_flow=False)

                # Identify Y‐positions of every word whose text matches a date and is within the date column x‐range
                date_tops = sorted(
                    word["top"]
                    for word in all_words
                    if config.DATE_REGEX.match(word["text"])
                    and config.DATE_COLUMN_X0 <= word["x0"] <= config.DATE_COLUMN_X1
                )
                if not date_tops:
                    # No transactions on this page
                    continue

                # Build vertical intervals for each row
                row_intervals: List[Tuple[float, float]] = []
                for idx, y in enumerate(date_tops):
                    top_bound = y - config.Y_MARGIN
                    if idx + 1 < len(date_tops):
                        bottom_bound = date_tops[idx + 1] - config.Y_MARGIN
                    else:
                        bottom_bound = y + 15  # Enough to catch associated words
                    row_intervals.append((top_bound, bottom_bound))

                # Group words into their respective rows
                rows: List[List[Dict[str, Any]]] = [[] for _ in row_intervals]
                for word in all_words:
                    word_top = word["top"]
                    for row_id, (row_top, row_bottom) in enumerate(row_intervals):
                        if row_top <= word_top < row_bottom:
                            rows[row_id].append(word)
                            break

                # For each row, parse columns
                for row_words in rows:
                    if not row_words:
                        continue

                    # Merge all text in the row to skip footers if found
                    row_text_combined = " ".join(w["text"] for w in row_words)
                    if any(keyword in row_text_combined for keyword in config.FOOTER_KEYWORDS):
                        continue  # skip summary/footer row

                    # Sort by vertical then horizontal so left‐to‐right reading order is preserved
                    row_sorted = sorted(row_words, key=lambda w: (w["top"], w["x0"]))

                    # Find the "date" word and then any time immediately below
                    date_word = next(
                        (
                            w for w in row_sorted
                            if config.DATE_REGEX.match(w["text"])
                            and config.DATE_COLUMN_X0 <= w["x0"] <= config.DATE_COLUMN_X1
                        ),
                        None
                    )
                    date_string = date_word["text"] if date_word else ""
                    time_string = (
                        SCBwithnoteStatementExtractor.find_time_for_row(date_word, all_words) if date_word else ""
                    )

                    # Prepare containers for each column
                    code_tokens: List[str] = []
                    channel_tokens: List[str] = []
                    description_tokens: List[str] = []
                    debit_credit_words: List[Dict[str, Any]] = []
                    balance_words: List[Dict[str, Any]] = []

                    # Classify each word by its x‐position
                    for word in row_sorted:
                        text = word["text"]
                        x0 = word["x0"]

                        # Skip date/time itself
                        if config.DATE_REGEX.match(text) or config.TIME_REGEX.match(text):
                            continue

                        # Monetary values: decide if it's debit/credit vs balance
                        if config.MONEY_REGEX.match(text):
                            if x0 <= config.X_SPLIT_CHANNEL_DEBIT_CREDIT + config.X_TOLERANCE:
                                debit_credit_words.append(word)
                            else:
                                balance_words.append(word)
                            continue

                        # Otherwise, decide if this is code, channel, or description
                        if x0 <= config.X_SPLIT_CODE_CHANNEL + config.X_TOLERANCE:
                            code_tokens.append(text)
                        elif x0 <= config.X_SPLIT_CHANNEL_DEBIT_CREDIT + config.X_TOLERANCE:
                            channel_tokens.append(text)
                        else:
                            description_tokens.append(text)

                    # Combine code + channel fields
                    combined_code_channel = "/".join(code_tokens + channel_tokens)
                    code_value, channel_value = (
                        (combined_code_channel.split("/", 1) + [""])[:2]
                    )

                    # Determine withdrawal and deposit amounts
                    withdrawal_amount: Optional[float] = None
                    deposit_amount: Optional[float] = None
                    for word in debit_credit_words:
                        numeric_value = float(word["text"].replace(",", ""))
                        # If the right edge (x1) is to the left of the withdrawal/deposit split, treat as withdrawal
                        if word["x1"] <= config.X_SPLIT_WITHDRAWAL_DEPOSIT + config.X_TOLERANCE:
                            withdrawal_amount = numeric_value
                        else:
                            deposit_amount = numeric_value

                    # Pick the rightmost monetary word for balance (if present)
                    balance_amount: Optional[float] = None
                    if balance_words:
                        rightmost = max(balance_words, key=lambda w: w["x0"])
                        balance_amount = float(rightmost["text"].replace(",", ""))

                    # Assemble the record dictionary
                    record: Dict[str, Any] = {
                        "page_id": page_id,
                        "date": pd.to_datetime(date_string, format="%d/%m/%y", dayfirst=True, errors="coerce"),
                        "time": time_string,
                        "code": code_value,
                        "channel": channel_value,
                        "debit": withdrawal_amount,
                        "credit": deposit_amount,
                        "balance": balance_amount,
                        "description": " ".join(description_tokens),
                    }

                    # If page_id is blank, make every field blank
                    if page_id == "":
                        record = {key: "" for key in record}

                    transaction_records.append(record)

            except Exception as error:
                print(f"⚠️  Skipping page {page_index} in transaction extraction due to error: {error}")
                continue

        transaction_dataframe = pd.DataFrame(transaction_records)
        return transaction_dataframe

    @staticmethod
    def extract_headers_from_pages(
        pages: List[pdfplumber.page.Page]
    ) -> pd.DataFrame:
        """
        Extracts header information (account_name, account_number, period),
        plus any "Total amount" or "Total items" if a footer exists.
        """
        header_rows_list: List[Dict[str, Any]] = []

        for page_index, page in enumerate(pages, start=1):
            try:
                page_id = SCBwithnoteStatementExtractor.extract_page_id_from_page(page)
                full_page_text = page.extract_text() or ""
                footer_present = any(
                    kw in full_page_text.lower() for kw in config.FOOTER_KEYWORDS_LOWER
                )

                header_data: Dict[str, Any] = {"page_id": page_id}

                # Extract each header field from its crop box
                for field_name, bbox in config.CROP_BOXES.items():
                    cropped_field = page.crop(bbox)
                    raw_field_text = cropped_field.extract_text() or ""
                    header_data[field_name] = raw_field_text.strip().replace("\n", " ")

                # If a footer exists, parse totals from any line starting with "Total amount" or "Total items"
                if footer_present:
                    for line in full_page_text.splitlines():
                        if line.startswith("Total amount"):
                            found_numbers = re.findall(r"[\d,]+(?:\.\d{2})?", line)
                            header_data.update(
                                {
                                    "total_amount_debit": found_numbers[0].replace(",", "") if len(found_numbers) > 0 else None,
                                    "total_amount_credit": found_numbers[1].replace(",", "") if len(found_numbers) > 1 else None,
                                }
                            )
                        elif line.startswith("Total items"):
                            found_integers = re.findall(r"\d+", line)
                            header_data.update(
                                {
                                    "total_items_debit": found_integers[0] if len(found_integers) > 0 else None,
                                    "total_items_credit": found_integers[1] if len(found_integers) > 1 else None,
                                }
                            )

                # If page_id is blank, zero‐out every header field
                if page_id == "":
                    header_data = {key: "" for key in header_data}

                header_rows_list.append(header_data)

            except Exception as error:
                print(f"⚠️  Skipping page {page_index} in header extraction due to error: {error}")
                continue

        headers_dataframe = pd.DataFrame(header_rows_list)
        # Add an empty 'address' column (to match previous structure)
        headers_dataframe["address"] = ""
        return headers_dataframe

    @staticmethod
    def clean_float_column(series: pd.Series) -> pd.Series:
        """
        Strips all non-numeric characters (except the dot) from a string Series,
        then converts to float. Non-convertible strings become NaN.
        """
        stripped = series.astype(str).apply(lambda s: re.sub(r"[^0-9\.]", "", s))
        return pd.to_numeric(stripped, errors="coerce")

    def clean_dataframes(
        self, transaction_dataframe: pd.DataFrame, header_dataframe: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        1. Renames header columns and fills missing values
        2. Filters out invalid header rows (where page_id does not start with a digit)
        3. Renames transaction columns, fills missing, and ensures numeric columns are float dtype
        4. Always use .copy() when slicing
        """
        # ── CLEAN HEADERS ──────────────────────────────────────────────────────
        header_dataframe = header_dataframe.rename(
            columns={
                "total_amount_debit":  "total_debit",
                "total_amount_credit": "total_credit",
                "total_items_debit":   "total_debit_transaction",
                "total_items_credit":  "total_credit_transaction",
            }
        ).fillna("")

        # Keep only rows where page_id starts with a digit, then copy
        header_dataframe = (
            header_dataframe[
                header_dataframe["page_id"].str.match(r"^\d", na=False)
            ]
            .copy()
            .reset_index(drop=True)
        )

        # ── CLEAN TRANSACTIONS ─────────────────────────────────────────────────
        transaction_dataframe = (
            transaction_dataframe.rename(
                columns={"withdrawal": "debit", "deposit": "credit"}
            )
            .fillna("")  # Fill NaN with empty string before coercion
            .assign(transaction_type="")  # Add an extra column if needed later
        )

        # Keep only valid transactions (page_id not blank), then copy
        transaction_dataframe = (
            transaction_dataframe[transaction_dataframe["page_id"] != ""]
            .copy()
        )

        # Convert debit, credit, balance to floats
        columns_to_float_tx = ["debit", "credit", "balance"]
        for column_name in columns_to_float_tx:
            if column_name in transaction_dataframe:
                transaction_dataframe[column_name] = SCBwithnoteStatementExtractor.clean_float_column(
                    transaction_dataframe[column_name]
                )

        # Convert numeric header columns to floats
        header_columns_to_float = [
            "total_debit",
            "total_credit",
            "total_debit_transaction",
            "total_credit_transaction",
        ]
        for column_name in header_columns_to_float:
            if column_name in header_dataframe:
                header_dataframe[column_name] = SCBwithnoteStatementExtractor.clean_float_column(
                    header_dataframe[column_name]
                )

        return header_dataframe, transaction_dataframe

    def run(
        self, pdf_path: Optional[str] = None, password: Optional[str] = None
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Opens the PDF, runs transaction & header extraction, then cleans both DataFrames.
        Returns (cleaned_header_dataframe, cleaned_transaction_dataframe).
        """
        pdf_path = pdf_path 
        password = password 

        with pdfplumber.open(pdf_path, password=password) as pdf:
            page_list = pdf.pages
            raw_transaction_dataframe = SCBwithnoteStatementExtractor.extract_transactions_from_pages(page_list)
            raw_header_dataframe = SCBwithnoteStatementExtractor.extract_headers_from_pages(page_list)

        cleaned_header_dataframe, cleaned_transaction_dataframe = self.clean_dataframes(
            raw_transaction_dataframe, raw_header_dataframe
        )
        return cleaned_header_dataframe, cleaned_transaction_dataframe


def main():
    """
    Standalone entry point:
      1. Instantiates SCBwithnoteStatementExtractor
      2. Processes the configured PDF
      3. Prints header DataFrame and last 10 transactions for debugging
    """
    extractor = SCBwithnoteStatementExtractor()
    headers_df, transactions_df = extractor.run()

    print("=== Header DataFrame ===")
    print(headers_df.to_string(index=False))
    print("\n=== Last 10 Transactions ===")
    print(transactions_df.tail(10).to_string(index=False))

    return headers_df, transactions_df





In [4]:
if __name__ == "__main__":
    headers_df, transactions_df = main()

AttributeError: module 'config' has no attribute 'PDF_PATH'

In [2]:
headers_df

Unnamed: 0,page_id,account_name,account_number,period,total_debit,total_credit,total_debit_transaction,total_credit_transaction,address
0,1/3,นางสาว เจนจิรา มงคลไทร,264-427505-1,01/05/2023-31/05/2023,,,,,
1,2/3,นางสาว เจนจิรา มงคลไทร,264-427505-1,01/05/2023-31/05/2023,,,,,
2,3/3,นางสาว เจนจิรา มงคลไทร,264-427505-1,01/05/2023-31/05/2023,15677.87,24382.0,56.0,7.0,
3,1/4,นางสาว เจนจิรา มงคลไทร,264-427505-1,01/06/2023-30/06/2023,,,,,
4,2/4,นางสาว เจนจิรา มงคลไทร,264-427505-1,01/06/2023-30/06/2023,,,,,
5,3/4,นางสาว เจนจิรา มงคลไทร,264-427505-1,01/06/2023-30/06/2023,,,,,
6,1/6,นางสาว เจนจิรา มงคลไทร,264-427505-1,01/07/2023-31/07/2023,,,,,
7,2/6,นางสาว เจนจิรา มงคลไทร,264-427505-1,01/07/2023-31/07/2023,,,,,
8,3/6,นางสาว เจนจิรา มงคลไทร,264-427505-1,01/07/2023-31/07/2023,,,,,
9,4/6,นางสาว เจนจิรา มงคลไทร,264-427505-1,01/07/2023-31/07/2023,,,,,


In [3]:
transactions_df

Unnamed: 0,page_id,date,time,code,channel,debit,credit,balance,description,transaction_type
0,1/3,2023-05-02,07:34,X2,ENET,30.00,,27.05,DESC :PromptPay x1697 น.ส.กนกวรรณ บุญธวัชสุขเจ...,
1,1/3,2023-05-02,08:51,X1,ENET,,1500.0,1527.05,DESC :กรุงศรีอยุธยา (BAY) /X336035 NOTE :-,
2,1/3,2023-05-02,12:13,X2,ENET,45.00,,1482.05,DESC :PromptPay x1697 น.ส.กนกวรรณ บุญธวัชสุขเจ...,
3,1/3,2023-05-02,17:58,X2,ENET,90.00,,1392.05,DESC :จ่ายบิล SCB มณี SHOP (ร้านหมูสด) NOTE :-,
4,1/3,2023-05-02,18:01,X2,ENET,85.00,,1307.05,DESC :PromptPay x1284 นายภูดิศ ราญมีชัย NOTE :-,
...,...,...,...,...,...,...,...,...,...,...
242,6/6,2023-07-31,18:31,X1,ENET,,2000.0,3146.61,DESC :กสิกรไทย (KBANK) /X751395 NOTE :-,
243,6/6,2023-07-31,18:32,X2,ENET,2010.84,,1135.77,"DESC :จ่ายบิล MONIX CO.,LTD. NOTE :-",
244,6/6,2023-07-31,18:32,X2,ENET,1000.00,,135.77,DESC :โอนไป SCB x5043 นางสาว กนกภรณ์ ยังเพ็ง N...,
245,6/6,2023-07-31,18:38,X1,ENET,,1000.0,1135.77,DESC :รับโอนจาก SCB x5043 นางสาว กนกภรณ์ ยังเพ...,


In [1]:
#!/usr/bin/env python3
import os, traceback
import numpy as np
import pandas as pd
from typing import Tuple, List, Dict, Any, Optional
import pdfplumber
from scb_with_note_extractor import SCBwithnoteStatementExtractor

# — your existing imports & extraction functions here —
# from your_module import extract_header, extract_transactions

INPUT_FOLDER = "/Users/if658228/Downloads/OneDrive_1_5-20-2025/agentic_extraction/Dataset04/SCB/with_note"
PASSWORD: Optional[str] = None

def process_folder(input_folder: str) -> Tuple[pd.DataFrame, pd.DataFrame, List[dict]]:
    tx_list, hdr_list, failures = [], [], []
    for fn in os.listdir(input_folder):
        if not fn.lower().endswith(".pdf"):
            continue
        path = os.path.join(input_folder, fn)
        try:
            with pdfplumber.open(path, None) as pdf:
                pages = pdf.pages
                df_hdr, df_tx = SCBwithnoteStatementExtractor.run(path,None)
                df_hdr["source_file"] = fn
                df_tx ["source_file"] = fn
                tx_list.append(df_tx)
                hdr_list.append(df_hdr)
        except Exception as e:
            failures.append({
                "file": fn,
                "error": str(e),
                "traceback": traceback.format_exc()
            })
    all_tx  = pd.concat(tx_list, ignore_index=True) if tx_list else pd.DataFrame()
    all_hdr = pd.concat(hdr_list, ignore_index=True) if hdr_list else pd.DataFrame()
    return all_tx, all_hdr, failures

def validate_bbl(df_tx_all: pd.DataFrame, df_hdr_all: pd.DataFrame) -> pd.DataFrame:
    # STEP 1: ensure tx columns are numeric
    tx_clean = df_tx_all.copy()
    tx_clean[['debit','credit']] = (
        tx_clean[['debit','credit']]
        .replace('', np.nan)
        .astype(float)
    )

    # STEP 1.5: ensure header columns are numeric
    for col in ['total_debit_transaction',
                'total_debit',
                'total_credit_transaction',
                'total_credit']:
        df_hdr_all[col] = (
            pd.to_numeric(
                df_hdr_all[col].replace('', np.nan).astype(str).str.replace(',', ''),
                errors='coerce'
            )
        )

    # (then proceed with sums/counts as before)
    sums = (
        tx_clean
        .groupby('source_file')[['debit','credit']]
        .sum(min_count=1)
        .rename(columns={'debit':'sum_debit','credit':'sum_credit'})
    )

    counts = (
        tx_clean
        .groupby('source_file')[['debit','credit']]
        .count()
        .rename(columns={'debit':'count_debit_tx','credit':'count_credit_tx'})
    )

    hdr = (
        df_hdr_all
        .dropna(subset=['total_debit_transaction','total_credit_transaction'])
        .groupby('source_file')
        .agg({
            'total_debit_transaction':'sum',
            'total_debit':'sum',
            'total_credit_transaction':'sum',
            'total_credit':'sum'
        })
    ).rename(columns={
        'total_debit_transaction':'total_debit_txns',
        'total_debit':'total_debit',
        'total_credit_transaction':'total_credit_txns',
        'total_credit':'total_credit'
    })

    cmp = (
        hdr
        .join(sums, how='left')
        .join(counts, how='left')
        .reset_index()
        .rename(columns={'source_file':'file'})
    )

    summary = cmp.assign(
        debit_amount_match = lambda d: np.isclose(d['total_debit'], d['sum_debit'], atol=1e-2),
        credit_amount_match = lambda d: np.isclose(d['total_credit'], d['sum_credit'], atol=1e-2),
        transaction_count_debit_match = lambda d: d['total_debit_txns'] == d['count_debit_tx'],
        transaction_count_credit_match = lambda d: d['total_credit_txns'] == d['count_credit_tx']
    )[
        ['file',
         'total_debit','sum_debit','debit_amount_match',
         'total_credit','sum_credit','credit_amount_match',
         'total_credit_txns','total_debit_txns','count_debit_tx','count_credit_tx',
         'transaction_count_debit_match','transaction_count_credit_match']
    ]

    return summary


if __name__ == "__main__":
    df_tx_all, df_hdr_all, failures = process_folder(INPUT_FOLDER)
    print(f"Processed transactions: {df_tx_all.shape}, headers: {df_hdr_all.shape}")
    if failures:
        print(f"\n⚠️ {len(failures)} failures; inspect `failures` list.")
    if df_tx_all.empty or df_hdr_all.empty:
        print("No data to validate; exiting.")
        exit(1)
    print(df_tx_all,df_hdr_all)
    summary = validate_bbl(df_tx_all, df_hdr_all)
    print("\n--- Validation Summary per File ---")
    print(summary.to_string(index=False))

    bad = summary.loc[~(summary.debit_amount_match
                        & summary.credit_amount_match
                        & summary.transaction_count_debit_match
                        & summary.transaction_count_credit_match)]
    if not bad.empty:
        bad = bad.assign(
            diff_debit = bad['sum_debit'] - bad['total_debit'],
            diff_credit    = bad['sum_credit']    - bad['total_credit']
        )
        print("\n❌ Files with mismatches:")
        print(bad.to_string(index=False))
    else:
        print("\n✅ All files validated successfully!")


⚠️  Skipping page 11 in transaction extraction due to error: Bounding box (0, 100, 594, 740) is not fully within parent page bounding box (0.0, 0.0, 841.0, 595.0)
Processed transactions: (68047, 11), headers: (3450, 10)
      page_id                 date   time code channel    debit   credit  \
0        1/20  2023-07-01 00:00:00  08:54   X1    ENET      NaN   2500.0   
1        1/20  2023-07-01 00:00:00  08:59   X2    ENET    500.0      NaN   
2        1/20  2023-07-01 00:00:00  09:01   X2    ENET    500.0      NaN   
3        1/20  2023-07-01 00:00:00  09:40   X2    ENET    500.0      NaN   
4        1/20  2023-07-01 00:00:00  10:25   X1    ENET      NaN    503.0   
...       ...                  ...    ...  ...     ...      ...      ...   
68042     2/2  2023-07-29 00:00:00  19:06   X2    ENET     52.0      NaN   
68043     2/2  2023-07-31 00:00:00  02:37   X1    BCMS      NaN  26587.0   
68044     2/2  2023-07-31 00:00:00  06:29   X2    ENET  26000.0      NaN   
68045     2/2  2023-

In [2]:
summary

Unnamed: 0,file,total_debit,sum_debit,debit_amount_match,total_credit,sum_credit,credit_amount_match,total_credit_txns,total_debit_txns,count_debit_tx,count_credit_tx,transaction_count_debit_match,transaction_count_credit_match
0,108988-02009181-2566_1_SCB.pdf,86407.13,86407.13,True,85866.08,85866.08,True,62.0,391.0,391,62,True,True
1,108988-02009188-2566_1_SCB.pdf,6826802.95,6826802.95,True,6936802.95,6936802.95,True,875.0,1001.0,1001,875,True,True
2,108988-02009439-2566_1_SCB.pdf,156056.28,156056.28,True,156626.78,156626.78,True,63.0,141.0,141,63,True,True
3,108988-02009617-2566_1_SCB.pdf,476218.84,476218.84,True,474662.40,474662.40,True,46.0,296.0,296,46,True,True
4,108988-02009692-2566_1_SCB.pdf,104929.21,104929.21,True,104336.17,104336.17,True,85.0,179.0,179,85,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,มีformatอื่นๆ 2.pdf,90793.00,90793.00,True,83855.88,83855.88,True,5.0,29.0,29,5,True,True
166,มีรูปถ่าย.pdf,182047.92,182047.92,True,180756.62,180756.62,True,50.0,146.0,146,50,True,True
167,มีหลายformat.pdf,89998.90,89998.90,True,89998.90,89998.90,True,5.0,35.0,35,5,True,True
168,มีหลายformat55.pdf,428156.17,428156.17,True,428156.17,428156.17,True,34.0,42.0,42,34,True,True


In [3]:
bad

Unnamed: 0,file,total_debit,sum_debit,debit_amount_match,total_credit,sum_credit,credit_amount_match,total_credit_txns,total_debit_txns,count_debit_tx,count_credit_tx,transaction_count_debit_match,transaction_count_credit_match,diff_debit,diff_credit
93,108988-02023748-2566_1_SCB.pdf,0.0,,False,15000.0,15000.0,True,2.0,0.0,0,2,True,True,,0.0
112,108988-02026056-2566_1_SCB.pdf,81803.58,138810.48,False,91149.0,141168.87,False,23.0,159.0,213,34,False,False,57006.9,50019.87
