In [6]:
import os
import uuid
import re
import pandas as pd
from typing import Dict, Any, Optional, List
import logging
from pdf2image import convert_from_path
import pytesseract
from google.cloud import documentai
from google.oauth2 import service_account

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define deduction policies
deduction_policies = {
    "deductions": [
        {"category": "Home Office", "rate": 0.30, "max_limit": 5000},
        {"category": "Software & Subscriptions", "rate": 0.50, "max_limit": 3000},
        {"category": "Freelancer Platform Fees", "rate": 0.25, "max_limit": 2000},
        {"category": "Work Equipment", "rate": 0.50, "max_limit": 5000},
        {"category": "Internet & Phone", "rate": 0.40, "max_limit": 2500},
        {"category": "Education & Training", "rate": 1.0, "max_limit": 5000},
        {"category": "Business Travel", "rate": 1.0, "max_limit": 10000},
        {"category": "Client Entertainment", "rate": 0.20, "max_limit": 1500},
        {"category": "Office Supplies", "rate": 1.0, "max_limit": 5000},
        {"category": "Software & Tools", "rate": 1.0, "max_limit": 10000},
        {"category": "Meals & Entertainment", "rate": 0.5, "max_limit": 2000},
        {"category": "Rent & Utilities", "rate": 1.0, "max_limit": 15000},
        {"category": "Marketing & Advertising", "rate": 1.0, "max_limit": 10000},
        {"category": "Transportation", "rate": 1.0, "max_limit": 5000},
        {"category": "Healthcare", "rate": 0.0, "max_limit": 0},
        {"category": "Personal Shopping", "rate": 0.0, "max_limit": 0},
        {"category": "Groceries", "rate": 0.0, "max_limit": 0}
    ]
}

class ReceiptProcessor:
    def __init__(self, use_google_cloud: bool = False, credentials_path: Optional[str] = None, 
                 project_id: Optional[str] = None, processor_id: Optional[str] = None, 
                 location: str = "us"):

        self.use_google_cloud = use_google_cloud
        
        if use_google_cloud:
            if not all([credentials_path, project_id, processor_id]):
                raise ValueError("To use Google Cloud, you must provide credentials_path, project_id, and processor_id")
            
            self.project_id = project_id
            self.processor_id = processor_id
            self.location = location
            self.processor_name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
            
            try:
                self.credentials = service_account.Credentials.from_service_account_file(credentials_path)
                self.client = documentai.DocumentProcessorServiceClient(credentials=self.credentials)
                logging.info("Successfully connected to Google Cloud Document AI")
            except Exception as e:
                logging.error(f"Failed to initialize Google Cloud Document AI: {e}")
                raise
    
    def extract_text_from_pdf(self, pdf_path: str) -> str:
  
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
        
        logging.info(f"Extracting text from {pdf_path}")
        
        try:
            if self.use_google_cloud:
                return self._extract_text_using_google(pdf_path)
            else:
                return self._extract_text_locally(pdf_path)
        except Exception as e:
            logging.error(f"Error extracting text from PDF: {e}")
            return ""
    
    def _extract_text_using_google(self, pdf_path: str) -> str:
        """Use Google Cloud Document AI to extract text from PDF."""
        try:
            with open(pdf_path, "rb") as pdf_file:
                pdf_content = pdf_file.read()
            
            raw_document = documentai.RawDocument(content=pdf_content, mime_type="application/pdf")
            request = documentai.ProcessRequest(name=self.processor_name, raw_document=raw_document)
            result = self.client.process_document(request=request)
            
            return result.document.text if result.document.text else ""
        except Exception as e:
            logging.error(f"Google Cloud Document AI processing failed: {e}")
            raise
    
    def _extract_text_locally(self, pdf_path: str) -> str:
        """Use local tools (pdf2image + tesseract) to extract text from PDF."""
        try:
            images = convert_from_path(pdf_path)
            text = ""
            
            for i, image in enumerate(images):
                logging.info(f"Processing page {i+1} of {len(images)}")
                text += pytesseract.image_to_string(image)
                text += "\n\n"
            
            return text
        except Exception as e:
            logging.error(f"Local PDF processing failed: {e}")
            raise
    
    def parse_transaction_data(self, text: str, user_id: int = 1) -> Dict[str, Any]:
        logging.info("Parsing transaction data from extracted text")
        data = {
            "transaction_id": str(uuid.uuid4()),
            "user_id": user_id,
            "currency": "USD",  # Default
            "description": "",
            "category": "Other",
            "deduction_rate": 0.0,
            "max_limit": 0,
            "tax_deductible": False
        }
        
        if not text:
            logging.warning("No text provided for parsing")
            return data
        
        # Extract amount - look for dollar amounts
        amount_patterns = [
            r'(?:total|amount|price|due|payment)\s*(?::|is|of)?\s*\$?\s*(\d+(?:,\d+)*\.\d{2})',  # Total: $123.45
            r'\$\s*(\d+(?:,\d+)*\.\d{2})',  # $123.45
            r'(\d+(?:,\d+)*\.\d{2})\s*(?:usd|dollars|eur|euro)'  # 123.45 USD
        ]
        
        for pattern in amount_patterns:
            amount_match = re.search(pattern, text, re.IGNORECASE)
            if amount_match:
                # Remove commas and convert to float
                amount_str = amount_match.group(1).replace(',', '')
                data["amount"] = float(amount_str)
                break
        

        date_patterns = [
            r'\b(\d{4}-\d{2}-\d{2})\b',  # YYYY-MM-DD
            r'\b(\d{2}/\d{2}/\d{4})\b',  # MM/DD/YYYY
            r'\b(\d{2}-\d{2}-\d{4})\b',  # MM-DD-YYYY
            r'\b(\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4})\b' 
        ]
        
        for pattern in date_patterns:
            date_match = re.search(pattern, text, re.IGNORECASE)
            if date_match:
                data["date"] = date_match.group(1)
                break
        
        # Extract payment method
        payment_methods = [
            'Visa', 'MasterCard', 'Amex', 'American Express', 'Discover', 'Cash', 
            'Check', 'PayPal', 'Venmo', 'Apple Pay', 'Google Pay', 'Bank Transfer'
        ]
        
        payment_pattern = '|'.join(payment_methods)
        payment_match = re.search(f'\\b({payment_pattern})\\b', text, re.IGNORECASE)
        if payment_match:
            data["payment_method"] = payment_match.group(1)
        else:
            data["payment_method"] = "Unknown"
        
        # Extract merchant name
        merchant_patterns = [
            r'^([A-Z\s]+)[\n\r]',  # First line in all caps
            r'(?:merchant|vendor|store|shop|business):\s*([^\n\r]+)',  # Labeled as merchant
            r'(?:receipt from|purchased at|shop at):\s*([^\n\r]+)'  # Context clues
        ]
        
        for pattern in merchant_patterns:
            merchant_match = re.search(pattern, text, re.IGNORECASE)
            if merchant_match:
                data["merchant"] = merchant_match.group(1).strip()
                break
        
        if "merchant" not in data or not data["merchant"]:
            # Fallback: use first non-empty line
            lines = [line.strip() for line in text.split('\n') if line.strip()]
            if lines:
                data["merchant"] = lines[0]
            else:
                data["merchant"] = "Unknown"
        
        # Extract description - use first few lines
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        if len(lines) > 1:
            data["description"] = ' '.join(lines[0:3])
        
        # Extract category based on keywords
        text_lower = text.lower()
        for policy in deduction_policies["deductions"]:
            category = policy["category"]
            keywords = self._get_category_keywords(category)
            
            for keyword in keywords:
                if keyword.lower() in text_lower:
                    data["category"] = category
                    data["deduction_rate"] = policy["rate"]
                    data["max_limit"] = policy["max_limit"]
                    data["tax_deductible"] = policy["rate"] > 0
                    break
            
            if data["category"] != "Other":
                break
        
        # Extract currency
        currency_match = re.search(r'\b(USD|EUR|GBP|CAD|AUD|JPY)\b', text, re.IGNORECASE)
        if currency_match:
            data["currency"] = currency_match.group(1).upper()
        
        # Extract country
        countries = ['USA', 'United States', 'Canada', 'UK', 'United Kingdom', 
                    'Germany', 'France', 'Italy', 'Spain', 'Japan', 'China']
        country_pattern = '|'.join(countries)
        country_match = re.search(f'\\b({country_pattern})\\b', text, re.IGNORECASE)
        if country_match:
            data["country"] = country_match.group(1)
        else:
            data["country"] = "Unknown"
        
        return data
    
    def _get_category_keywords(self, category: str) -> List[str]:
        """Get keywords associated with a specific expense category."""
        category_keywords = {
            "Home Office": ["home office", "desk", "chair", "office furniture", "filing cabinet"],
            "Software & Subscriptions": ["software", "subscription", "license", "SaaS", "app", "service"],
            "Freelancer Platform Fees": ["upwork", "fiverr", "freelancer", "platform fee", "service fee"],
            "Work Equipment": ["laptop", "computer", "monitor", "keyboard", "mouse", "printer", "scanner"],
            "Internet & Phone": ["internet", "phone", "mobile", "data plan", "wifi", "broadband"],
            "Education & Training": ["course", "training", "workshop", "seminar", "conference", "book", "education"],
            "Business Travel": ["flight", "hotel", "accommodation", "taxi", "uber", "airbnb", "travel"],
            "Client Entertainment": ["restaurant", "meal", "entertainment", "client meeting"],
            "Office Supplies": ["pen", "paper", "ink", "toner", "stapler", "notebook", "supplies"],
            "Software & Tools": ["software", "tools", "application", "app", "program"],
            "Meals & Entertainment": ["meal", "lunch", "dinner", "restaurant", "food", "coffee"],
            "Rent & Utilities": ["rent", "lease", "utility", "electricity", "water", "gas", "heating"],
            "Marketing & Advertising": ["marketing", "advertising", "promotion", "ad", "campaign"],
            "Transportation": ["transportation", "bus", "train", "subway", "metro", "fuel", "gas", "parking"],
            "Healthcare": ["healthcare", "medical", "doctor", "hospital", "prescription", "medicine"],
            "Personal Shopping": ["clothing", "shoes", "accessory", "gift", "personal"],
            "Groceries": ["grocery", "supermarket", "food store", "produce"]
        }
        
        return category_keywords.get(category, [category.lower()])
    
    def process_receipt(self, pdf_path: str, user_id: int = 1) -> Dict[str, Any]:

        extracted_text = self.extract_text_from_pdf(pdf_path)
        if not extracted_text:
            logging.warning(f"No text extracted from {pdf_path}")
            return {}
            
        transaction_data = self.parse_transaction_data(extracted_text, user_id)
        return transaction_data
    
    def process_multiple_receipts(self, pdf_dir: str, user_id: int = 1) -> pd.DataFrame:
    
        transactions = []
        
        if not os.path.isdir(pdf_dir):
            raise NotADirectoryError(f"Directory not found: {pdf_dir}")
        
        pdf_files = [f for f in os.listdir(pdf_dir) if f.lower().endswith('.pdf')]
        logging.info(f"Found {len(pdf_files)} PDF files in {pdf_dir}")
        
        for pdf_file in pdf_files:
            pdf_path = os.path.join(pdf_dir, pdf_file)
            logging.info(f"Processing {pdf_path}")
            
            try:
                transaction = self.process_receipt(pdf_path, user_id)
                if transaction:
                    transaction['filename'] = pdf_file
                    transactions.append(transaction)
            except Exception as e:
                logging.error(f"Error processing {pdf_path}: {e}")
        
        if not transactions:
            logging.warning("No transactions processed successfully")
            return pd.DataFrame()
            
        return pd.DataFrame(transactions)


# Example usage
if __name__ == "__main__":
    # 1. Using local processing (no Google Cloud)
    processor = ReceiptProcessor(use_google_cloud=False)
    
    #Process a single receipt
    transaction = processor.process_receipt("../../data/forms/1040-NR.pdf")
    print(transaction)
    
    # Process multiple receipts
    
    # 2. Using Google Cloud Document AI (uncomment and provide your details)
    processor = ReceiptProcessor(
        use_google_cloud=True,
        credentials_path="../../documentai.json",
        project_id="ai-finance-project-5e11b",
        processor_id="bf8ffc1bef694966"
     )
    transaction = processor.process_receipt("../../data/forms/1040-NR.pdf")
    print(transaction)

2025-02-28 01:54:48,567 - INFO - Extracting text from ../../data/forms/1040-NR.pdf
2025-02-28 01:54:48,573 - ERROR - Local PDF processing failed: Unable to get page count. Is poppler installed and in PATH?
2025-02-28 01:54:48,575 - ERROR - Error extracting text from PDF: Unable to get page count. Is poppler installed and in PATH?
2025-02-28 01:54:48,622 - INFO - Successfully connected to Google Cloud Document AI
2025-02-28 01:54:48,624 - INFO - Extracting text from ../../data/forms/1040-NR.pdf


{}


2025-02-28 01:54:49,972 - ERROR - Google Cloud Document AI processing failed: 401 Request had invalid authentication credentials. Expected OAuth 2 access token, login cookie or other valid authentication credential. See https://developers.google.com/identity/sign-in/web/devconsole-project. [reason: "ACCESS_TOKEN_EXPIRED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "documentai.googleapis.com"
}
metadata {
  key: "method"
  value: "google.cloud.documentai.v1.DocumentProcessorService.ProcessDocument"
}
]
2025-02-28 01:54:49,973 - ERROR - Error extracting text from PDF: 401 Request had invalid authentication credentials. Expected OAuth 2 access token, login cookie or other valid authentication credential. See https://developers.google.com/identity/sign-in/web/devconsole-project. [reason: "ACCESS_TOKEN_EXPIRED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "documentai.googleapis.com"
}
metadata {
  key: "method"
  value: "google.cloud.documentai.v1.Document

{}


In [29]:
image_path = "../../data/forms/1040-NR.pdf"
transaction = process_receipt(image_path)  # No need for credentials_path
df = pd.DataFrame([transaction])
print(df)

TypeError: process_receipt() missing 1 required positional argument: 'credentials_path'

In [None]:
df = processor.process_multiple_receipts("path_to_receipts_dir")
    print(df)
    df.to_csv("transactions.csv", index=False)
    
    # 2. Using Google Cloud Document AI (uncomment and provide your details)
    processor = ReceiptProcessor(
        use_google_cloud=True,
        credentials_path="path_to_credentials.json",
        project_id="your_project_id",
        processor_id="your_processor_id"
     )
    transaction = processor.process_receipt("path_to_receipt.pdf")
    print(transaction)

IndentationError: unexpected indent (3903784213.py, line 2)

In [25]:
image_path = "../../data/forms/1040-NR.pdf"
credentials_path = "../../credentials.json"
transaction = process_receipt(image_path, credentials_path)
df = pd.DataFrame([transaction])
print(df)

Unauthenticated: 401 Request had invalid authentication credentials. Expected OAuth 2 access token, login cookie or other valid authentication credential. See https://developers.google.com/identity/sign-in/web/devconsole-project. [reason: "ACCESS_TOKEN_EXPIRED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "documentai.googleapis.com"
}
metadata {
  key: "method"
  value: "google.cloud.documentai.v1.DocumentProcessorService.ProcessDocument"
}
]

In [5]:
def process_receipts(image_paths):
    """Process multiple receipt images and save structured data."""
    results = []

    for image_path in image_paths:
        try:
            receipt_data = scan_receipt(image_path)  # OCR extraction
            results.append(receipt_data)
            print(f"Processed {image_path}: {receipt_data}")
        except Exception as e:
            print(f"Error processing {image_path}: {e}")

    # Convert results to DataFrame
    df = pd.DataFrame(results)

    # Load deduction policy for categories
    with open("../../data/deduction_policy.json", "r") as f:
        deduction_policy = json.load(f)

    # Encode categorical values
    label_encoders = {col: LabelEncoder() for col in ["category", "merchant", "payment_method"]}

    for col in label_encoders:
        df[col] = label_encoders[col].fit_transform(df[col])

    # Normalize amount
    df["amount"] = (df["amount"] - df["amount"].min()) / (df["amount"].max() - df["amount"].min())

    # Save to CSV
    df.to_csv("receipts_data.csv", index=False)
    print("Receipts saved to receipts_data.csv ✅")

    return df

# Example usage
image_files = ["receipt1.jpg", "receipt2.png"]  # List of receipt images
processed_data = process_receipts(image_files)

# Print structured results
print(json.dumps(processed_data, indent=2))

Error processing receipt1.jpg: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.
Error processing receipt2.png: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.


NameError: name 'json' is not defined