In [6]:
import os
import re
from typing import Any, Dict, List, Optional, Tuple

from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore

In [7]:
import os

# Service-account key file used by the Google client libraries. The location
# is built from the current user's home directory rather than a hard-coded
# absolute path, so the notebook is not tied to a single machine account.
CREDENTIALS_FILE = os.path.join(
    os.path.expanduser("~"),
    "Documents",
    "gen-lang-client-0299904904-20d80b047ea0.json",
)

# Only fill the variable in when it is not already configured, so credentials
# exported in the shell (or by a deployment environment) are not clobbered.
os.environ.setdefault("GOOGLE_APPLICATION_CREDENTIALS", CREDENTIALS_FILE)

In [41]:
# Configuration for the Document AI request. Edit these values to point at
# your own project, processor, and input file before running the cells below.
project_id = "gen-lang-client-0299904904"
location = "us" # Format is "us" or "eu"
processor_id = "bf2685d686b2d8db" # Create processor before running sample
file_path = "statements/test_statement.pdf"
mime_type = "application/pdf" # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
# field_mask = "text,entities,pages.pageNumber"  # Optional. The fields to return in the Document object.
processor_version_id = "pretrained-bankstatement-v3.0-2022-05-16" # Optional. Processor version to use

In [None]:
def process_document_sample(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
    process_options: Optional["documentai.ProcessOptions"] = None,
) -> "documentai.Document":
    """Send a local file to a Document AI processor and return the parsed Document.

    Args:
        project_id: Google Cloud project that owns the processor.
        location: Processor location; also used to build the API endpoint
            (``{location}-documentai.googleapis.com``).
        processor_id: ID of the processor to call.
        file_path: Path of the local file to upload.
        mime_type: MIME type of the file, e.g. ``"application/pdf"``.
        field_mask: Optional comma-separated list of Document fields to return.
        processor_version_id: Optional processor version. When given, the
            request targets that version instead of the processor default.
        process_options: Optional additional processing configuration. It is
            only added to the request when provided.

    Returns:
        The ``Document`` contained in the processing response.

    Side effects:
        Prints the recognized document text to stdout.
    """
    # You must set the `api_endpoint` if you use a location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    if processor_version_id:
        # Full resource name of the processor version:
        # projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        # Full resource name of the processor:
        # projects/{project_id}/locations/{location}/processors/{processor_id}
        name = client.processor_path(project_id, location, processor_id)

    # Read the whole file into memory; the API takes the raw bytes inline.
    with open(file_path, "rb") as document_file:
        file_content = document_file.read()

    raw_document = documentai.RawDocument(content=file_content, mime_type=mime_type)

    request_params = {
        "name": name,
        "raw_document": raw_document,
        "field_mask": field_mask,
    }
    # For more information:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/ProcessOptions
    if process_options is not None:
        request_params["process_options"] = process_options

    request = documentai.ProcessRequest(**request_params)
    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    document = result.document

    # Read the text recognition output from the processor
    print("The document contains the following text:")
    print(document.text)
    return document

In [43]:
# Run the processor on the configured statement. process_document_sample
# returns the parsed Document itself, so `result` holds a Document.
result = process_document_sample(
    project_id=project_id,
    location=location,
    processor_id=processor_id,
    file_path=file_path,
    mime_type=mime_type,
    processor_version_id=processor_version_id
    )

The document contains the following text:
Costco Anywhere Visa® Card by Citi
citi COSTCO
WHOLESALE
MOHIT AGGARWAL
Member Since 2014 Account number ending in: 8633
Billing Period: 07/02/25-08/01/25
Billing Inquiries and Customer Service
PO Box 790046 ST. LOUIS, MO 63179-0046
1-855-378-6467, (TTY: 711)
www.citicards.com
AUGUST STATEMENT
Minimum payment due:
New balance as of 08/01/25:
Payment due date:
$41.00
$910.51
08/27/25
Account Summary
Previous balance
$1,041.44
Payments
-$1,041.44
Credits
-$36.28
Purchases
+$946.79
Cash advances
+$0.00
Fees
+$0.00
Interest
+$0.00
New balance
$910.51
Credit Limit
Credit Limit
$7,000
Includes $1,400.00 cash advance limit
Available Credit Limit
$6,089
Includes $1,400 available for cash advance
date listed above, you may have to pay a late fee of up to $41 and your APRS
may be increased up to the Penalty APR of 29.99.
For information about credit counseling services, call 1-877-337-8187 (TTY: 711).
00EE98
citi
4100 3901 2345 6789
L WALKER
VISA
Costco

In [60]:
# Bank Statement Parser for your specific format

import pandas as pd
from typing import Dict, List, Any
from google.cloud import documentai

def extract_entity_text(entity: documentai.Document.Entity, document_text: str) -> str:
    """Return the text an entity points to inside ``document_text``.

    When the entity has no text anchor, or the anchor has no segments, the
    entity's ``mention_text`` is returned instead (or ``""`` if that is empty).
    Otherwise every anchored segment is sliced out of ``document_text``, the
    pieces are concatenated without a separator, and the result is stripped.
    """
    anchor = entity.text_anchor
    segments = anchor.text_segments if anchor else None
    if not segments:
        return entity.mention_text or ""

    def _segment_text(segment) -> str:
        # A falsy start index means "from the beginning"; a falsy end index
        # means "through the end of the document text".
        lo = int(segment.start_index) if segment.start_index else 0
        hi = int(segment.end_index) if segment.end_index else len(document_text)
        return document_text[lo:hi]

    return "".join(_segment_text(segment) for segment in segments).strip()

def _find_known_bank_name(text: str, known_bank_names: List[str]) -> str:
    """Return the title-cased first known bank name found in ``text``, else 'Not Found'."""
    lowered_text = text.lower()
    for candidate in known_bank_names:
        # Require the match to start at a word boundary so that, for example,
        # "chase" is no longer found inside "purchase"; prefixes such as
        # "citi" in "citibank" still match.
        if re.search(r"\b" + re.escape(candidate.lower()), lowered_text):
            return candidate.title()
    return 'Not Found'


def parse_bank_statement(
    document: "documentai.Document",
    primary_client_name: str = "MOHIT AGGARWAL",
    all_cardholders: Optional[List[str]] = None,
    known_bank_names: Optional[List[str]] = None,
) -> Tuple[Dict[str, Any], Dict[str, List[Any]]]:
    """Group a document's entities by type and derive basic statement info.

    Args:
        document: Processed document whose ``entities`` and ``text`` are read.
        primary_client_name: Name of the account holder. It is stored in the
            result and also used to reject a ``bank_name`` entity whose text
            is actually the client's name.
        all_cardholders: Names of every cardholder on the account. Defaults to
            ``["MOHIT AGGARWAL", "HIMANI SOOD"]``.
        known_bank_names: Bank names searched for in the document text when no
            usable ``bank_name`` entity exists. Defaults to a built-in list.

    Returns:
        A ``(statement_info, entities_by_type)`` tuple. ``statement_info`` has
        the keys ``bank_name``, ``primary_client_name`` and
        ``all_cardholders``; ``entities_by_type`` maps each entity type to the
        list of entities of that type, in document order.
    """
    if known_bank_names is None:
        known_bank_names = [
            "citi", "capital one", "chase", "bank of america",
            "discover", "wells fargo", "american express",
        ]
    if all_cardholders is None:
        all_cardholders = ["MOHIT AGGARWAL", "HIMANI SOOD"]

    # Group entities by type
    entities_by_type: Dict[str, List[Any]] = {}
    for entity in document.entities:
        entities_by_type.setdefault(entity.type_, []).append(entity)

    # Use the model's bank_name entity only when it yields non-empty text that
    # is not the client's own name; otherwise search the full document text.
    bank_entities = entities_by_type.get('bank_name')
    extracted_bank_name = (
        extract_entity_text(bank_entities[0], document.text) if bank_entities else ""
    )
    looks_like_client = bool(primary_client_name) and primary_client_name in extracted_bank_name

    statement_info: Dict[str, Any] = {}
    if extracted_bank_name and not looks_like_client:
        statement_info['bank_name'] = extracted_bank_name
    else:
        statement_info['bank_name'] = _find_known_bank_name(document.text, known_bank_names)

    statement_info['primary_client_name'] = primary_client_name
    statement_info['all_cardholders'] = list(all_cardholders)

    return statement_info, entities_by_type



def parse_table_items(entities_by_type: Dict, document_text: str, all_cardholders: List[str]) -> List[Dict[str, Any]]:
    """
    Turn every ``table_item`` entity into a transaction record.

    Each record carries its position (``item_id``), the cardholder in effect
    when the item was read, the item's raw text, and one key per property of
    the item. The cardholder switches whenever an item's raw text contains one
    of ``all_cardholders`` and then stays in effect for the following items;
    until the first match it is "Unknown".
    """
    if 'table_item' not in entities_by_type:
        return []

    table_items = entities_by_type['table_item']
    print(f"Found {len(table_items)} table items")

    records = []
    active_cardholder = "Unknown"
    for position, item in enumerate(table_items):
        item_text = extract_entity_text(item, document_text)

        # Section headers are often merged into the first transaction of a
        # section, so a cardholder name inside the text marks a new section.
        active_cardholder = next(
            (holder for holder in all_cardholders if holder in item_text),
            active_cardholder,
        )

        record = {
            'item_id': position,
            'cardholder': active_cardholder,
            'raw_text': item_text
        }
        for child in (item.properties or ()):
            record[child.type_] = extract_entity_text(child, document_text)

        records.append(record)

    return records

# Main execution code
def analyze_and_extract_transactions(
    document: "documentai.Document",
    output_path: str = "bank_transactions_updated.csv",
) -> Tuple[Optional[pd.DataFrame], Dict[str, Any]]:
    """Parse a processed statement, print a summary, and export its transactions.

    Args:
        document: The processed Document (the object that has ``entities`` and
            ``text``), not the API response that wraps it.
        output_path: CSV file the transactions are written to.

    Returns:
        ``(df, statement_info)``. ``df`` holds one row per extracted table item
        with ``bank_name``, ``cardholder`` and ``item_id`` as the leading
        columns; it is ``None`` when no transactions were extracted.

    Side effects:
        Prints progress to stdout, renders the DataFrame with ``display`` and
        writes ``output_path`` when at least one transaction was found.
    """
    print("=== BANK STATEMENT ANALYSIS ===")

    # Parse basic statement info
    statement_info, entities_by_type = parse_bank_statement(document)

    print("\n=== STATEMENT INFO ===")
    for key, value in statement_info.items():
        print(f"{key}: {value}")

    # Extract transactions, passing in the list of known cardholders
    print("\n=== EXTRACTING TRANSACTIONS ===")
    transactions = parse_table_items(
        entities_by_type, document.text, statement_info['all_cardholders']
    )

    if not transactions:
        print("❌ No transactions extracted")
        return None, statement_info

    df = pd.DataFrame(transactions)

    # Add the bank_name from the statement_info to every transaction row
    df['bank_name'] = statement_info.get('bank_name', 'N/A')

    # Bring the identifying columns to the front; keep the rest in place.
    leading_columns = ['bank_name', 'cardholder', 'item_id']
    df = df[leading_columns + [col for col in df.columns if col not in leading_columns]]

    print("\n=== TRANSACTION SUMMARY ===")
    print(f"Total transactions found: {len(df)}")
    print(f"Columns: {list(df.columns)}")

    print("\n=== TRANSACTION DATA === ")
    display(df)

    df.to_csv(output_path, index=False)
    print(f"\n💾 Transactions saved to '{output_path}'")

    return df, statement_info

# To run the updated code, pass in the Document returned by
# process_document_sample (stored in `result` above). It is already the
# Document, not the API response that wraps it:
#
# df, info = analyze_and_extract_transactions(result)

In [61]:
# `result` is the Document returned by process_document_sample above.
df4, info4 = analyze_and_extract_transactions(result)

=== BANK STATEMENT ANALYSIS ===

=== STATEMENT INFO ===
bank_name: Citi
primary_client_name: MOHIT AGGARWAL
all_cardholders: ['MOHIT AGGARWAL', 'HIMANI SOOD']

=== EXTRACTING TRANSACTIONS ===
Found 20 table items

=== TRANSACTION SUMMARY ===
Total transactions found: 20
Columns: ['bank_name', 'cardholder', 'item_id', 'raw_text', 'table_item/transaction_withdrawal_date', 'table_item/transaction_withdrawal_description', 'table_item/transaction_withdrawal', 'table_item/transaction_deposit_description', 'table_item/transaction_deposit_date', 'table_item/transaction_deposit']

=== TRANSACTION DATA === 


Unnamed: 0,bank_name,cardholder,item_id,raw_text,table_item/transaction_withdrawal_date,table_item/transaction_withdrawal_description,table_item/transaction_withdrawal,table_item/transaction_deposit_description,table_item/transaction_deposit_date,table_item/transaction_deposit
0,Citi,Unknown,0,07/2206/3007/10ELECTRONIC PAYMENT-THANK YOU\nH...,07/22,ELECTRONIC PAYMENT-THANK YOU\nHEADWAY\nNEW YOR...,-$10.00,,,
1,Citi,Unknown,1,07/16-$16.28,07/16,,-$16.28,,,
2,Citi,MOHIT AGGARWAL,2,MOHIT AGGARWAL\nStandard Purchases\n07/05 07/0...,,MOHIT AGGARWAL\nStandard Purchases\n07/05 07/0...,$10.00,,,
3,Citi,MOHIT AGGARWAL,3,COSTCO WHSE #0678 RANCHO CUCAMOCA$312.41,,COSTCO WHSE #0678 RANCHO CUCAMOCA,$312.41,COSTCO WHSE #0678 RANCHO CUCAMOCA,,
4,Citi,MOHIT AGGARWAL,4,07/12TRADER JOE S #217 RANCHO CUCAMOCA\nSQ *HA...,,TRADER JOE S #217 RANCHO CUCAMOCA\nSQ *HANDEL'...,$66.64,TRADER JOE S #217 RANCHO CUCAMOCA\nSQ *HANDEL'...,07/12,
5,Citi,MOHIT AGGARWAL,5,07/12$13.50,,,$13.50,,07/12,
6,Citi,MOHIT AGGARWAL,6,CucamoCA,,CucamoCA,,CucamoCA,,
7,Citi,MOHIT AGGARWAL,7,07/13$10.00HEADWAY\nHEADWAY.CO NY,07/13,HEADWAY,$10.00,HEADWAY\nHEADWAY.CO NY,,
8,Citi,MOHIT AGGARWAL,8,07/13$1.995 on gas at CostcoDESI BAZAAR CASH A...,07/13,5 on gas at Costco,$1.99,5 on gas at Costco,,
9,Citi,MOHIT AGGARWAL,9,HOMEGOODS 275 RCH CUCAMONGACA07/13$18.31,07/13,HOMEGOODS 275 RCH CUCAMONGACA,$18.31,,,



üíæ Transactions saved to 'bank_transactions_updated.csv'


In [68]:
import pandas as pd
import numpy as np

def preprocess_transactions(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reduce a raw transaction DataFrame to a simplified schema.

    The withdrawal and deposit variants of the date, description and amount
    columns are merged (withdrawal value first, deposit value as fallback)
    into ``transaction_date``, ``description`` and ``amount``. The
    ``item_id`` and ``raw_text`` columns are removed, and rows whose amount
    is missing or equal to ``'$0.00'`` / ``'+$0.00'`` are dropped.

    The source columns are produced dynamically from the extracted
    properties, so any of them may be absent (for example a statement with no
    deposits). Absent columns are tolerated: the variant that exists is used
    on its own, and if neither exists the output column is created empty.

    Args:
        df: The input DataFrame with raw transaction data. It is not modified.

    Returns:
        A cleaned DataFrame with a simplified schema. The original row index
        is preserved (not reset) for the rows that remain.
    """
    # (preferred source column, fallback source column, output column name)
    column_plan = (
        ('table_item/transaction_withdrawal_date',
         'table_item/transaction_deposit_date',
         'transaction_date'),
        ('table_item/transaction_withdrawal_description',
         'table_item/transaction_deposit_description',
         'description'),
        ('table_item/transaction_withdrawal',
         'table_item/transaction_deposit',
         'amount'),
    )

    # Work on a copy so the caller's DataFrame is never modified.
    processed_df = df.copy().drop(columns=['item_id', 'raw_text'], errors='ignore')

    for primary, fallback, output_name in column_plan:
        has_primary = primary in processed_df.columns
        has_fallback = fallback in processed_df.columns

        if has_primary and has_fallback:
            # Coalesce: keep the withdrawal value, fill gaps from the deposit.
            processed_df[primary] = processed_df[primary].fillna(processed_df[fallback])
            processed_df = processed_df.drop(columns=[fallback])

        if has_primary:
            processed_df = processed_df.rename(columns={primary: output_name})
        elif has_fallback:
            processed_df = processed_df.rename(columns={fallback: output_name})
        else:
            # Neither variant was extracted; keep the schema stable anyway.
            processed_df[output_name] = float('nan')

    # Drop records where the final 'amount' is missing
    processed_df = processed_df.dropna(subset=['amount'])

    # Drop records where the amount is $0.00
    zero_values = ['$0.00', '+$0.00']
    return processed_df[~processed_df['amount'].isin(zero_values)]

In [69]:
# Collapse the raw extraction in `df4` into the simplified transaction schema.
cleaned_df3 = preprocess_transactions(df4)

In [70]:
# Show the cleaned transactions as this cell's output.
cleaned_df3

Unnamed: 0,bank_name,cardholder,transaction_date,description,amount
0,Citi,Unknown,07/22,ELECTRONIC PAYMENT-THANK YOU\nHEADWAY\nNEW YOR...,-$10.00
1,Citi,Unknown,07/16,,-$16.28
2,Citi,MOHIT AGGARWAL,,MOHIT AGGARWAL\nStandard Purchases\n07/05 07/0...,$10.00
3,Citi,MOHIT AGGARWAL,,COSTCO WHSE #0678 RANCHO CUCAMOCA,$312.41
4,Citi,MOHIT AGGARWAL,07/12,TRADER JOE S #217 RANCHO CUCAMOCA\nSQ *HANDEL'...,$66.64
5,Citi,MOHIT AGGARWAL,07/12,,$13.50
7,Citi,MOHIT AGGARWAL,07/13,HEADWAY,$10.00
8,Citi,MOHIT AGGARWAL,07/13,5 on gas at Costco,$1.99
9,Citi,MOHIT AGGARWAL,07/13,HOMEGOODS 275 RCH CUCAMONGACA,$18.31
10,Citi,MOHIT AGGARWAL,07/14,ALDI 79061 FONTANA CA,$8.91


In [52]:
# The row index only reflects positions in the unfiltered frame (it has gaps
# after filtering), so leave it out of the export, matching the raw CSV export.
cleaned_df3.to_csv('cleaned_citi.csv', index=False)