In [1]:
import re
from google.cloud import vision
import io
import pandas as pd

def scan_receipt(image_path):
    client = vision.ImageAnnotatorClient()
    
    with io.open(image_path, "rb") as image_file:
        content = image_file.read()
    
    image = vision.Image(content=content)
    response = client.text_detection(image=image)
    
    if response.error.message:
        raise Exception(f"Google Vision API Error: {response.error.message}")
    
    extracted_text = response.text_annotations[0].description if response.text_annotations else ""
    
    # Extract amount (Assuming format "$XX.XX" or "XX.XX")
    amount_match = re.search(r"\$\d+\.\d{2}|\d+\.\d{2}", extracted_text)
    amount = float(amount_match.group()) if amount_match else 0.0
    
    # Expanded category detection with more refined keywords
    category_map = {
        "Home Office": ["home office", "workspace", "coworking", "desk", "chair"],
        "Software & Subscriptions": ["software", "subscription", "saas", "adobe", "microsoft", "google workspace", "zoom", "notion"],
        "Freelancer Platform Fees": ["upwork", "fiverr", "freelancer.com", "peopleperhour"],
        "Work Equipment": ["laptop", "monitor", "keyboard", "mouse", "printer", "scanner", "webcam"],
        "Internet & Phone": ["internet", "wifi", "broadband", "phone", "cellular", "data plan"],
        "Education & Training": ["course", "training", "udemy", "coursera", "edx", "bootcamp", "certification", "workshop"],
        "Business Travel": ["uber", "taxi", "airline", "hotel", "flight", "rental car", "bus", "train ticket", "airbnb"],
        "Client Entertainment": ["restaurant", "food", "dining", "cafe", "lounge", "bar", "club"],
        "Office Supplies": ["staples", "office depot", "stationery", "paper", "printer ink", "folders", "envelopes"],
        "Software & Tools": ["development tools", "github", "figma", "notion", "slack", "dropbox", "gitlab"],
        "Meals & Entertainment": ["restaurant", "dining", "snacks", "coffee", "fast food", "takeout", "delivery"],
        "Healthcare": ["doctor", "medical", "insurance", "clinic", "dentist", "pharmacy", "hospital"],
        "Personal Shopping": ["mall", "clothing", "fashion", "shoes", "jewelry", "watches"],
        "Rent & Utilities": ["rent", "electricity", "water bill", "gas bill", "landlord", "lease", "mortgage"],
        "Marketing & Advertising": ["facebook ads", "google ads", "linkedin ads", "instagram ads", "youtube ads", "seo"],
        "Transportation": ["gas station", "metro", "bus ticket", "car rental", "parking", "toll", "public transport"],
        "Groceries": ["supermarket", "grocery", "whole foods", "walmart", "aldi", "kroger", "costco", "trader joe's"],
        "Insurance": ["health insurance", "car insurance", "business insurance", "liability insurance"],
        "Legal & Accounting": ["lawyer", "attorney", "legal fees", "accounting", "tax services", "bookkeeping"],
        "Subscriptions": ["netflix", "spotify", "amazon prime", "newspaper", "magazine", "membership"],
        "Donations & Charity": ["charity", "donation", "ngo", "fundraiser"],
        "Hardware & Maintenance": ["repair", "fix", "replacement", "maintenance", "service fee"],
        "Childcare & Dependent Care": ["daycare", "babysitter", "nanny", "childcare"],
        "Fitness & Wellness": ["gym", "yoga", "meditation", "fitness class", "personal trainer"]
    }
    
    category = "Unknown"
    for cat, keywords in category_map.items():
        if any(keyword.lower() in extracted_text.lower() for keyword in keywords):
            category = cat
            break

    return {"category": category, "amount": amount, "raw_text": extracted_text}


In [5]:
def process_receipts(image_paths):
    """Process multiple receipt images and save structured data."""
    results = []

    for image_path in image_paths:
        try:
            receipt_data = scan_receipt(image_path)  # OCR extraction
            results.append(receipt_data)
            print(f"Processed {image_path}: {receipt_data}")
        except Exception as e:
            print(f"Error processing {image_path}: {e}")

    # Convert results to DataFrame
    df = pd.DataFrame(results)

    # Load deduction policy for categories
    with open("../../data/deduction_policy.json", "r") as f:
        deduction_policy = json.load(f)

    # Encode categorical values
    label_encoders = {col: LabelEncoder() for col in ["category", "merchant", "payment_method"]}

    for col in label_encoders:
        df[col] = label_encoders[col].fit_transform(df[col])

    # Normalize amount
    df["amount"] = (df["amount"] - df["amount"].min()) / (df["amount"].max() - df["amount"].min())

    # Save to CSV
    df.to_csv("receipts_data.csv", index=False)
    print("Receipts saved to receipts_data.csv ✅")

    return df

# Example usage
image_files = ["receipt1.jpg", "receipt2.png"]  # List of receipt images
processed_data = process_receipts(image_files)

# Print structured results
print(json.dumps(processed_data, indent=2))

Error processing receipt1.jpg: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.
Error processing receipt2.png: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.


NameError: name 'json' is not defined