In [6]:
from typing import Optional

from google.api_core.client_options import ClientOptions
from google.cloud import documentai  # type: ignore
import os

In [7]:
import os

# Set the environment variable
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/mohit/Documents/gen-lang-client-0299904904-20d80b047ea0.json"


# ... rest of your code

In [8]:
# TODO(developer): Uncomment these variables before running the sample.
project_id = "gen-lang-client-0299904904"
location = "us" # Format is "us" or "eu"
processor_id = "bf2685d686b2d8db" # Create processor before running sample
file_path = "statements/test_statement.pdf"
mime_type = "application/pdf" # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types
# field_mask = "text,entities,pages.pageNumber"  # Optional. The fields to return in the Document object.
processor_version_id = "pretrained-bankstatement-v3.0-2022-05-16" # Optional. Processor version to use

In [14]:
def process_document_sample(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
    field_mask: Optional[str] = None,
    processor_version_id: Optional[str] = None,
) -> None:
    # You must set the `api_endpoint` if you use a location other than "us".
    opts = ClientOptions(api_endpoint=f"{location}-documentai.googleapis.com")

    client = documentai.DocumentProcessorServiceClient(client_options=opts)

    if processor_version_id:
        # The full resource name of the processor version, e.g.:
        # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
        name = client.processor_version_path(
            project_id, location, processor_id, processor_version_id
        )
    else:
        # The full resource name of the processor, e.g.:
        # `projects/{project_id}/locations/{location}/processors/{processor_id}`
        name = client.processor_path(project_id, location, processor_id)

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Load binary data
    raw_document = documentai.RawDocument(content=image_content, mime_type=mime_type)

    # For more information: https://cloud.google.com/document-ai/docs/reference/rest/v1/ProcessOptions
    # Optional: Additional configurations for processing.
    process_options = None

    # Configure the process request
    # request = documentai.ProcessRequest(
    #     name=name,
    #     raw_document=raw_document,
    #     # field_mask=field_mask,
    #     process_options=process_options,
    # )
    request_params = {
    "name": name,
    "raw_document": raw_document,
    "field_mask": field_mask,
    }

    if process_options:
        request_params["process_options"] = process_options

    request = documentai.ProcessRequest(**request_params)

    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    document = result.document

    # Read the text recognition output from the processor
    print("The document contains the following text:")
    print(document.text)
    return document

In [15]:
result = process_document_sample(
    project_id=project_id,
    location=location,
    processor_id=processor_id,
    file_path=file_path,
    mime_type=mime_type,
    processor_version_id=processor_version_id
    )

The document contains the following text:
Costco Anywhere Visa® Card by Citi
citi COSTCO
WHOLESALE
MOHIT AGGARWAL
Member Since 2014 Account number ending in: 8633
Billing Period: 07/02/25-08/01/25
Billing Inquiries and Customer Service
PO Box 790046 ST. LOUIS, MO 63179-0046
1-855-378-6467, (TTY: 711)
www.citicards.com
AUGUST STATEMENT
Minimum payment due:
New balance as of 08/01/25:
Payment due date:
$41.00
$910.51
08/27/25
Account Summary
Previous balance
$1,041.44
Payments
-$1,041.44
Credits
-$36.28
Purchases
+$946.79
Cash advances
+$0.00
Fees
+$0.00
Interest
+$0.00
New balance
$910.51
Credit Limit
Credit Limit
$7,000
Includes $1,400.00 cash advance limit
Available Credit Limit
$6,089
Includes $1,400 available for cash advance
date listed above, you may have to pay a late fee of up to $41 and your APRS
may be increased up to the Penalty APR of 29.99.
For information about credit counseling services, call 1-877-337-8187 (TTY: 711).
00EE98
citi
4100 3901 2345 6789
L WALKER
VISA
Costco 

In [16]:
# Quick entity exploration
entities_by_type = {}
for entity in result.entities:
    entity_type = entity.type_
    if entity_type not in entities_by_type:
        entities_by_type[entity_type] = []
    entities_by_type[entity_type].append(entity)

print(f"Found {len(result.entities)} entities of {len(entities_by_type)} different types:")
for entity_type, entity_list in entities_by_type.items():
    print(f"  {entity_type}: {len(entity_list)} items")


Found 37 entities of 11 different types:
  client_name: 2 items
  statement_start_date: 2 items
  ending_balance: 2 items
  bank_address: 1 items
  starting_balance: 1 items
  bank_name: 1 items
  account_number: 4 items
  account_type: 1 items
  statement_end_date: 2 items
  statement_date: 1 items
  table_item: 20 items


In [17]:
# Extract transaction data
transactions = []
for entity_type, entities in entities_by_type.items():
    if 'transaction' in entity_type.lower() or 'amount' in entity_type.lower() or 'date' in entity_type.lower():
        print(f"\nExamining {entity_type}:")
        for i, entity in enumerate(entities[:3]):  # Show first 3
            text = entity.mention_text or ""
            print(f"  {i}: '{text}'")

print("\n" + "="*50)
print("Next steps:")
print("1. Look at the entity types above")
print("2. Let me know which ones contain your transaction data") 
print("3. I'll help you write specific parsing code for your document format")


Examining statement_start_date:
  0: '07/02/25-'
  1: '07/03'

Examining statement_end_date:
  0: '08/01/25'
  1: '08/01/25'

Examining statement_date:
  0: '08/01/25'

Next steps:
1. Look at the entity types above
2. Let me know which ones contain your transaction data
3. I'll help you write specific parsing code for your document format


In [18]:
# Quick preview of table items - run this first to see the structure

def quick_preview_table_items(document):
    table_items = [entity for entity in document.entities if entity.type_ == 'table_item']
    
    print(f"Found {len(table_items)} table items:")
    
    for i, item in enumerate(table_items[:5]):  # Show first 5
        # Get the text
        if item.text_anchor and item.text_anchor.text_segments:
            text_parts = []
            for segment in item.text_anchor.text_segments:
                start = int(segment.start_index) if segment.start_index else 0
                end = int(segment.end_index) if segment.end_index else len(document.text)
                text_parts.append(document.text[start:end])
            item_text = "".join(text_parts).strip()
        else:
            item_text = item.mention_text or ""
        
        print(f"\nItem {i}: '{item_text}'")
        
        # Show properties if any
        if item.properties:
            print(f"  Properties:")
            for prop in item.properties:
                if prop.text_anchor and prop.text_anchor.text_segments:
                    prop_parts = []
                    for segment in prop.text_anchor.text_segments:
                        start = int(segment.start_index) if segment.start_index else 0
                        end = int(segment.end_index) if segment.end_index else len(document.text)
                        prop_parts.append(document.text[start:end])
                    prop_text = "".join(prop_parts).strip()
                else:
                    prop_text = prop.mention_text or ""
                
                print(f"    {prop.type_}: '{prop_text}'")



In [19]:
# Run this to see your table structure:
quick_preview_table_items(result)

Found 20 table items:

Item 0: '07/2206/3007/10ELECTRONIC PAYMENT-THANK YOU
HEADWAY
NEW YORK NY
HEADWAY
NEW YORK NY-$1,041.44-$10.00-$10.00'
  Properties:
    table_item/transaction_withdrawal_date: '07/10'
    table_item/transaction_withdrawal_date: '06/30'
    table_item/transaction_withdrawal_date: '07/22'
    table_item/transaction_withdrawal_description: 'ELECTRONIC PAYMENT-THANK YOU
HEADWAY
NEW YORK NY
HEADWAY
NEW YORK NY'
    table_item/transaction_withdrawal: '-$1,041.44'
    table_item/transaction_withdrawal: '-$10.00'
    table_item/transaction_withdrawal: '-$10.00'

Item 1: '07/16-$16.28'
  Properties:
    table_item/transaction_withdrawal_date: '07/16'
    table_item/transaction_withdrawal: '-$16.28'

Item 2: 'MOHIT AGGARWAL
Standard Purchases
07/05 07/05 HEADWAY
HEADWAY.CO NY$10.00'
  Properties:
    table_item/transaction_withdrawal_description: 'MOHIT AGGARWAL
Standard Purchases
07/05 07/05 HEADWAY
HEADWAY.CO NY'
    table_item/transaction_withdrawal: '$10.00'

Item 3: '

In [20]:
# Bank Statement Parser for your specific format

import pandas as pd
from typing import Dict, List, Any
from google.cloud import documentai

def extract_entity_text(entity: documentai.Document.Entity, document_text: str) -> str:
    """Extract text from entity text segments"""
    if not entity.text_anchor or not entity.text_anchor.text_segments:
        return entity.mention_text or ""
    
    text_parts = []
    for segment in entity.text_anchor.text_segments:
        start_index = int(segment.start_index) if segment.start_index else 0
        end_index = int(segment.end_index) if segment.end_index else len(document_text)
        text_parts.append(document_text[start_index:end_index])
    
    return "".join(text_parts).strip()

def parse_bank_statement(document: documentai.Document) -> Dict[str, Any]:
    """Parse bank statement into structured data"""
    
    # Group entities by type
    entities_by_type = {}
    for entity in document.entities:
        entity_type = entity.type_
        if entity_type not in entities_by_type:
            entities_by_type[entity_type] = []
        entities_by_type[entity_type].append(entity)
    
    # Extract basic statement info
    statement_info = {}
    
    # Client name
    if 'client_name' in entities_by_type:
        statement_info['client_name'] = [extract_entity_text(e, document.text) for e in entities_by_type['client_name']]
    
    # Bank info
    if 'bank_name' in entities_by_type:
        statement_info['bank_name'] = extract_entity_text(entities_by_type['bank_name'][0], document.text)
    
    if 'bank_address' in entities_by_type:
        statement_info['bank_address'] = extract_entity_text(entities_by_type['bank_address'][0], document.text)
    
    # Account info
    if 'account_number' in entities_by_type:
        statement_info['account_numbers'] = [extract_entity_text(e, document.text) for e in entities_by_type['account_number']]
    
    if 'account_type' in entities_by_type:
        statement_info['account_type'] = extract_entity_text(entities_by_type['account_type'][0], document.text)
    
    # Date info
    if 'statement_start_date' in entities_by_type:
        statement_info['statement_start_dates'] = [extract_entity_text(e, document.text) for e in entities_by_type['statement_start_date']]
    
    if 'statement_end_date' in entities_by_type:
        statement_info['statement_end_dates'] = [extract_entity_text(e, document.text) for e in entities_by_type['statement_end_date']]
    
    if 'statement_date' in entities_by_type:
        statement_info['statement_date'] = extract_entity_text(entities_by_type['statement_date'][0], document.text)
    
    # Balance info
    if 'starting_balance' in entities_by_type:
        statement_info['starting_balance'] = extract_entity_text(entities_by_type['starting_balance'][0], document.text)
    
    if 'ending_balance' in entities_by_type:
        statement_info['ending_balances'] = [extract_entity_text(e, document.text) for e in entities_by_type['ending_balance']]
    
    return statement_info, entities_by_type

def parse_table_items(entities_by_type: Dict, document_text: str) -> List[Dict[str, Any]]:
    """Parse table_item entities into transaction records"""
    
    if 'table_item' not in entities_by_type:
        return []
    
    transactions = []
    
    print(f"Found {len(entities_by_type['table_item'])} table items")
    
    for i, table_item in enumerate(entities_by_type['table_item']):
        transaction = {
            'item_id': i,
            'raw_text': extract_entity_text(table_item, document_text)
        }
        
        # Extract properties from each table item
        if table_item.properties:
            print(f"\nTable Item {i}:")
            print(f"  Raw text: '{transaction['raw_text']}'")
            print(f"  Properties ({len(table_item.properties)}):")
            
            for prop in table_item.properties:
                prop_type = prop.type_
                prop_value = extract_entity_text(prop, document_text)
                transaction[prop_type] = prop_value
                print(f"    {prop_type}: '{prop_value}'")
        else:
            print(f"Table Item {i}: '{transaction['raw_text']}' (no properties)")
        
        transactions.append(transaction)
    
    return transactions

def analyze_table_structure(entities_by_type: Dict, document_text: str):
    """Analyze the structure of table items to understand the format"""
    
    if 'table_item' not in entities_by_type:
        print("No table_item entities found")
        return
    
    print("=== TABLE STRUCTURE ANALYSIS ===")
    
    # Collect all property types
    all_property_types = set()
    items_with_properties = 0
    
    for table_item in entities_by_type['table_item']:
        if table_item.properties:
            items_with_properties += 1
            for prop in table_item.properties:
                all_property_types.add(prop.type_)
    
    print(f"Total table items: {len(entities_by_type['table_item'])}")
    print(f"Items with properties: {items_with_properties}")
    print(f"Unique property types found: {sorted(all_property_types)}")
    
    # Show a few examples
    print(f"\nFirst 5 table items with their properties:")
    for i, table_item in enumerate(entities_by_type['table_item'][:5]):
        item_text = extract_entity_text(table_item, document_text)
        print(f"\nItem {i}: '{item_text}'")
        
        if table_item.properties:
            for prop in table_item.properties:
                prop_type = prop.type_
                prop_value = extract_entity_text(prop, document_text)
                print(f"  {prop_type}: '{prop_value}'")
        else:
            print("  (no properties)")

# Main execution code
def analyze_and_extract_transactions(document):
    """Main function to analyze and extract transactions"""
    
    print("=== BANK STATEMENT ANALYSIS ===")
    
    # Parse basic statement info
    statement_info, entities_by_type = parse_bank_statement(document)
    
    print("\n=== STATEMENT INFO ===")
    for key, value in statement_info.items():
        print(f"{key}: {value}")
    
    # Analyze table structure
    print(f"\n=== TABLE ANALYSIS ===")
    analyze_table_structure(entities_by_type, document.text)
    
    # Extract transactions
    print(f"\n=== EXTRACTING TRANSACTIONS ===")
    transactions = parse_table_items(entities_by_type, document.text)
    
    if transactions:
        # Convert to DataFrame
        df = pd.DataFrame(transactions)
        
        print(f"\n=== TRANSACTION SUMMARY ===")
        print(f"Total transactions found: {len(transactions)}")
        print(f"Columns: {list(df.columns)}")
        
        # Display the data
        print(f"\n=== TRANSACTION DATA ===")
        display(df)
        
        # Save to CSV
        df.to_csv("bank_transactions.csv", index=False)
        print(f"\n💾 Transactions saved to 'bank_transactions.csv'")
        
        return df, statement_info
    else:
        print("❌ No transactions extracted")
        return None, statement_info

# Run the analysis (assuming you have 'document' from your previous processing)
# Uncomment the line below to run:
# df, info = analyze_and_extract_transactions(document)

In [21]:
df, info = analyze_and_extract_transactions(result)

=== BANK STATEMENT ANALYSIS ===

=== STATEMENT INFO ===
client_name: ['HIMANI SOOD', 'HIMANI SOOD']
bank_name: MOHIT AGGARWAL
bank_address: PO Box 790046 ST. LOUIS, MO 63179-0046
account_numbers: ['8633', '4100 3901 2345 6789', '8633', '1325']
account_type: Standard Purchases
statement_start_dates: ['07/02/25-', '07/03']
statement_end_dates: ['08/01/25', '08/01/25']
statement_date: 08/01/25
starting_balance: $1,041.44
ending_balances: ['$451.75', '$495.04']

=== TABLE ANALYSIS ===
=== TABLE STRUCTURE ANALYSIS ===
Total table items: 20
Items with properties: 20
Unique property types found: ['table_item/transaction_deposit', 'table_item/transaction_deposit_date', 'table_item/transaction_deposit_description', 'table_item/transaction_withdrawal', 'table_item/transaction_withdrawal_date', 'table_item/transaction_withdrawal_description']

First 5 table items with their properties:

Item 0: '07/2206/3007/10ELECTRONIC PAYMENT-THANK YOU
HEADWAY
NEW YORK NY
HEADWAY
NEW YORK NY-$1,041.44-$10.00-$

Unnamed: 0,item_id,raw_text,table_item/transaction_withdrawal_date,table_item/transaction_withdrawal_description,table_item/transaction_withdrawal,table_item/transaction_deposit_description,table_item/transaction_deposit_date,table_item/transaction_deposit
0,0,07/2206/3007/10ELECTRONIC PAYMENT-THANK YOU\nH...,07/22,ELECTRONIC PAYMENT-THANK YOU\nHEADWAY\nNEW YOR...,-$10.00,,,
1,1,07/16-$16.28,07/16,,-$16.28,,,
2,2,MOHIT AGGARWAL\nStandard Purchases\n07/05 07/0...,,MOHIT AGGARWAL\nStandard Purchases\n07/05 07/0...,$10.00,,,
3,3,COSTCO WHSE #0678 RANCHO CUCAMOCA$312.41,,COSTCO WHSE #0678 RANCHO CUCAMOCA,$312.41,COSTCO WHSE #0678 RANCHO CUCAMOCA,,
4,4,07/12TRADER JOE S #217 RANCHO CUCAMOCA\nSQ *HA...,,TRADER JOE S #217 RANCHO CUCAMOCA\nSQ *HANDEL'...,$66.64,TRADER JOE S #217 RANCHO CUCAMOCA\nSQ *HANDEL'...,07/12,
5,5,07/12$13.50,,,$13.50,,07/12,
6,6,CucamoCA,,CucamoCA,,CucamoCA,,
7,7,07/13$10.00HEADWAY\nHEADWAY.CO NY,07/13,HEADWAY,$10.00,HEADWAY\nHEADWAY.CO NY,,
8,8,07/13$1.995 on gas at CostcoDESI BAZAAR CASH A...,07/13,5 on gas at Costco,$1.99,5 on gas at Costco,,
9,9,HOMEGOODS 275 RCH CUCAMONGACA07/13$18.31,07/13,HOMEGOODS 275 RCH CUCAMONGACA,$18.31,,,



💾 Transactions saved to 'bank_transactions.csv'
