In [None]:
# !pip install weave google-generativeai


In [None]:
import os
import random
import json
import datetime
from typing import TypedDict, Optional
import enum
import google.generativeai as genai
import time

In [None]:
# import weave
# import wandb
from dotenv import load_dotenv


In [None]:
# Call python-dotenv's loader first so the lookup below can see any values it
# provides, then pass the Gemini API key from the environment to the SDK.
load_dotenv()
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
# wandb.login(key=os.environ.get("WANDB_API_KEY"))
# weave.init(project_name="expense-bot")

In [None]:
def upload_to_gemini(path, mime_type=None):
    """Upload a local file to Gemini and return the resulting file object.

    Reference: https://ai.google.dev/gemini-api/docs/prompting_with_media

    Args:
        path: Location of the file to upload.
        mime_type: MIME type forwarded unchanged to ``genai.upload_file``.

    Returns:
        Whatever ``genai.upload_file`` returns for the uploaded file.
    """
    uploaded = genai.upload_file(path, mime_type=mime_type)
    name, uri = uploaded.display_name, uploaded.uri
    # Log the upload so the remote URI can be matched to the local file.
    print(f"Uploaded file '{name}' as: {uri}")
    return uploaded

In [None]:
# Function to read already processed images from output files
def get_processed_images(*output_files):
    """Collect the image names already recorded in the given JSONL output files.

    Each line is expected to be a JSON object with an ``image`` key. Anything
    else is ignored so a partially written or hand-edited file cannot abort the
    resume scan: missing files, blank lines, malformed JSON, JSON values that
    are not objects (lists, strings, numbers), and empty ``image`` values.

    Args:
        *output_files: Paths of JSONL files to scan.

    Returns:
        set: The truthy ``image`` values found across all readable files.
    """
    processed_images = set()
    for output_file in output_files:
        if not os.path.exists(output_file):
            continue
        with open(output_file, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    # Blank or truncated line (e.g. an interrupted write).
                    continue
                # Valid JSON is not necessarily an object; only dicts have keys.
                if not isinstance(data, dict):
                    continue
                image_name = data.get('image')
                if image_name:
                    processed_images.add(image_name)
    return processed_images

def process_receipts(
    directory,
    num_images=None,
    validation_split=0.2,
    output_train_file="train_output.jsonl",
    output_validation_file="validation_output.jsonl",
    monthly_budget=None,
    alert_for_marketprice_changes=None,
):
    """Processes images in the directory, extracting information from receipts.

    Images are split into a training and a validation set. Each image is
    uploaded to Gemini together with a prompt describing the expense
    categories, and the model's JSON answer is appended as one line to the
    matching JSONL output file. Images already present in either output file
    are skipped, so an interrupted run can be resumed.

    Args:
        directory (str): Path to the directory containing receipt images.
        num_images (int, optional): Number of images to process. If None, process all images.
        validation_split (float): Proportion of data to use for validation (e.g., 0.2 for 20%).
        output_train_file (str): Filename for training data output.
        output_validation_file (str): Filename for validation data output.
        monthly_budget (float, optional): Monthly budget; a quarter of it is
            quoted in the prompt as the weekly budget. If None, the user is
            asked once via ``input()``.
        alert_for_marketprice_changes (bool | str, optional): Alert
            preference, either a bool or the text "yes"/"no" (any case). If
            None, the user is asked once via ``input()``.

    Raises:
        ValueError: If ``validation_split`` is outside [0, 1], if the budget
            entered cannot be converted to a float, or if the alert
            preference is neither "yes" nor "no".
    """
    if not 0 <= validation_split <= 1:
        raise ValueError("validation_split must be between 0 and 1.")

    # Supported image extensions and the MIME type to upload each one with.
    image_mime_types = {
        '.jpeg': 'image/jpeg',
        '.jpg': 'image/jpeg',
        '.png': 'image/png',
    }

    def mime_type_for(file_name):
        """Return the MIME type for a supported image name, else None."""
        lowered = file_name.lower()
        for extension, mime_type in image_mime_types.items():
            if lowered.endswith(extension):
                return mime_type
        return None

    # List the image files. os.listdir order is arbitrary, so sort before
    # applying num_images: a re-run then selects the same subset and the
    # resume logic below can skip what is already done.
    image_files = sorted(f for f in os.listdir(directory) if mime_type_for(f) is not None)

    # If num_images is specified, limit the number of images
    if num_images is not None:
        image_files = image_files[:num_images]

    # Shuffle the image files
    random.shuffle(image_files)

    # Split into training and validation sets
    split_index = int(len(image_files) * (1 - validation_split))
    train_images = image_files[:split_index]
    validation_images = image_files[split_index:]

    print(f"Total images: {len(image_files)}")
    print(f"Training images: {len(train_images)}")
    print(f"Validation images: {len(validation_images)}")

    # Collect the budget and the alert preference ONCE, before any image is
    # processed, so the user is not prompted again for every image or retry.
    if monthly_budget is None:
        monthly_budget = float(input("Enter your monthly budget: "))
    weekly_budget = monthly_budget / 4

    if alert_for_marketprice_changes is None:
        alert_for_marketprice_changes = input("Do you want alert for market price changes? (Yes/No): ")

    if isinstance(alert_for_marketprice_changes, bool):
        alert_preference = alert_for_marketprice_changes
    else:
        answer = str(alert_for_marketprice_changes).strip().lower()
        if answer == "yes":
            alert_preference = True
        elif answer == "no":
            alert_preference = False
        else:
            raise ValueError("Invalid input! Please enter 'Yes' or 'No'.")

    print(f"Alert for market price changes: {alert_preference}")

    # Get today's date
    today = datetime.date.today()

    # Define the Categories and Subcategories
    expense_categories = [
        {
            "category": "Office and Administrative",
            "subcategories": [
                "Stationery and Printing",
                "Postage and Courier",
                "Telephone and Internet",
                "Office Rent",
                "Utilities (Electricity, Gas, Water)",
                "Office Insurance",
                "Office Maintenance and Repairs",
                "Office Equipment (under £1,000)",
                "Software Subscriptions",
                "Domain and Hosting Fees"
            ]
        },
        {
            "category": "Employee Costs",
            "subcategories": [
                "Salaries and Wages",
                "Employer's National Insurance Contributions",
                "Pension Contributions",
                "Staff Training",
                "Employee Benefits",
                "Recruitment Costs",
                "Staff Uniforms",
                "Staff Entertainment"
            ]
        },
        {
            "category": "Travel and Transportation",
            "subcategories": [
                "Vehicle Fuel",
                "Vehicle Insurance",
                "Vehicle Maintenance and Repairs",
                "Vehicle Road Tax",
                "Parking Fees",
                "Public Transportation Fares",
                "Accommodation for Business Trips",
                "Meals During Business Travel",
                "Mileage Allowance"
            ]
        },
        {
            "category": "Professional Services",
            "subcategories": [
                "Accountancy Fees",
                "Legal Fees",
                "Consulting Fees",
                "IT Support Services"
            ]
        },
        {
            "category": "Marketing and Advertising",
            "subcategories": [
                "Advertising Costs",
                "Marketing Materials",
                "Website Development and Maintenance",
                "Social Media Marketing",
                "Trade Show Expenses",
                "Promotional Items"
            ]
        },
        {
            "category": "Financial Expenses",
            "subcategories": [
                "Bank Charges",
                "Credit Card Fees",
                "Loan Interest",
                "Merchant Fees",
                "Bad Debts"
            ]
        },
        {
            "category": "Insurance",
            "subcategories": [
                "Professional Indemnity Insurance",
                "Public Liability Insurance",
                "Employers' Liability Insurance",
                "Contents Insurance"
            ]
        },
        {
            "category": "Subscriptions and Memberships",
            "subcategories": [
                "Professional Body Memberships",
                "Trade Association Fees",
                "Magazine and Journal Subscriptions"
            ]
        },
        {
            "category": "Research and Development",
            "subcategories": [
                "R&D Materials",
                "R&D Equipment",
                "R&D Consulting Fees"
            ]
        },
        {
            "category": "Depreciation and Capital Allowances",
            "subcategories": [
                "Depreciation of Assets",
                "Annual Investment Allowance",
                "Writing Down Allowance"
            ]
        },
        {
            "category": "Rent and Rates",
            "subcategories": [
                "Business Premises Rent",
                "Business Rates"
            ]
        },
        {
            "category": "Repairs and Maintenance",
            "subcategories": [
                "Building Repairs",
                "Equipment Maintenance"
            ]
        },
        {
            "category": "Cost of Goods Sold",
            "subcategories": [
                "Raw Materials",
                "Direct Labor",
                "Manufacturing Overheads"
            ]
        },
        {
            "category": "Charitable Donations",
            "subcategories": [
                "Donations to Registered Charities"
            ]
        },
        {
            "category": "Miscellaneous Expenses",
            "subcategories": [
                "Cleaning Services",
                "Security Services",
                "Health and Safety Expenses",
                "Waste Disposal"
            ]
        },
        {
            "category": "Home Office Expenses",
            "subcategories": [
                "Proportion of Rent or Mortgage Interest",
                "Proportion of Utilities",
                "Proportion of Council Tax",
                "Proportion of Home Insurance"
            ]
        }
    ]

    # VAT Percentages per Category
    vat_percentages = {
        "Office and Administrative": 20.0,
        "Employee Costs": 20.0,
        "Travel and Transportation": 0.0,
        "Professional Services": 20.0,
        "Marketing and Advertising": 20.0,
        "Financial Expenses": 0.0,
        "Insurance": 0.0,
        "Subscriptions and Memberships": 0.0,
        "Research and Development": 20.0,
        "Depreciation and Capital Allowances": 0.0,
        "Rent and Rates": 20.0,
        "Repairs and Maintenance": 20.0,
        "Cost of Goods Sold": 20.0,
        "Charitable Donations": 0.0,
        "Miscellaneous Expenses": 20.0,
        "Home Office Expenses": 0.0
    }

    # Define the schema using TypedDict
    class TransactionDetails(TypedDict):
        transaction_date: str  # YYYY-MM-DD format
        transaction_timestamp: str  # YYYY-MM-DD HH:MM:SS format
        amount: float  # numeric value
        currency: str  # e.g., "GBP"
        description: str
        transaction_type: str  # e.g., "Expense"
        category: str
        subcategory: str
        payment_method: str  # e.g., "Card"
        merchant: str
        transaction_id: Optional[str]  # Optional
        vat_percentage: float  # e.g., 20.0
        original_currency: Optional[str]  # If not GBP
        original_amount: Optional[float]  # If not GBP

    # Create the model with the response schema
    generation_config = genai.GenerationConfig(
        temperature=0.2,
        top_p=0.9,
        top_k=40,
        max_output_tokens=1024,
        response_mime_type="application/json",
        response_schema=TransactionDetails,
    )

    model = genai.GenerativeModel(
        model_name="gemini-1.5-pro",
        generation_config=generation_config,
    )

    # The prompt is identical for every image, so build it once. Its lines are
    # deliberately flush-left: any indentation here would end up in the text
    # sent to the model.
    prompt = f"""Analyze this receipt image and extract transaction details.
Today's date is {today.strftime('%Y-%m-%d')}.
Select the category and subcategory from the predefined list of expense categories and subcategories.

Weekly Budget: £{weekly_budget:.2f}

Expense Categories and Subcategories:
{json.dumps(expense_categories, indent=2)}

VAT Percentages per Category:
{json.dumps(vat_percentages, indent=2)}

Important:
- Amount must be a number (not a string)
- Date must be in YYYY-MM-DD format
- Transaction timestamp must be in YYYY-MM-DD HH:MM:SS format
    - If time is not available, set time to "00:00:00"
- If currency is not GBP, provide the original amount and currency
- Include the VAT percentage based on the category as per the list above

Return the data in the JSON format specified by the response schema.
"""

    # Get the set of already processed images
    processed_images = get_processed_images(output_train_file, output_validation_file)

    max_retries = 5

    # Function to process a list of images and write outputs to a file
    def process_images(image_list, output_file):
        # Open the output file in append mode to avoid overwriting existing data
        with open(output_file, 'a') as f:
            for image_file in image_list:
                if image_file in processed_images:
                    print(f"Image '{image_file}' has already been processed. Skipping.")
                    continue

                image_path = os.path.join(directory, image_file)
                # Upload with the MIME type matching the extension (PNG files
                # must not be labelled as JPEG).
                mime_type = mime_type_for(image_file)

                retry_delay = 1  # Start with 1 second delay

                for attempt in range(1, max_retries + 1):
                    try:
                        # Upload the image to Gemini (upload_to_gemini logs the URI)
                        uploaded_file = upload_to_gemini(image_path, mime_type=mime_type)

                        # Start a chat session and send the image plus the prompt
                        chat_session = model.start_chat()
                        response = chat_session.send_message([uploaded_file, prompt])

                        # Get the AI's response text
                        ai_response = response.text.strip()

                        # Create the JSON object for this image
                        result = {
                            "prefix": "",
                            "suffix": ai_response,  # Store the AI response as is
                            "image": image_file
                        }

                        # Write the result to the JSONL file and flush so the
                        # record survives an interrupted run.
                        json_line = json.dumps(result)
                        f.write(json_line + '\n')
                        f.flush()

                        # Print the response
                        print(f"\nProcessed image '{image_file}':\n{json_line}\n")

                        # Add image to processed_images set
                        processed_images.add(image_file)

                        # Break out of the retry loop if successful
                        break

                    except Exception as e:
                        # Deliberately broad: one bad image or a transient API
                        # error must not stop the whole batch.
                        print(f"Error processing image '{image_file}': {e}")
                        if attempt < max_retries:
                            print(f"Retrying ({attempt}/{max_retries}) after {retry_delay} seconds...")
                            time.sleep(retry_delay)
                            retry_delay += 2  # Incremental backoff

                else:
                    # Loop finished without a successful attempt: log the failure
                    print(f"Failed to process image '{image_file}' after {max_retries} retries.")

    # Process training images
    print("\nProcessing training images...\n")
    process_images(train_images, output_train_file)
    print(f"Training data written to {output_train_file}")

    # Process validation images
    print("\nProcessing validation images...\n")
    process_images(validation_images, output_validation_file)
    print(f"Validation data written to {output_validation_file}")


In [None]:
def weekly_budget_alert(output_file, weekly_budget):
    """Calculates weekly expenses and alerts if the budget is exceeded.

    Reads a JSONL file in which each record carries the transaction details
    under ``suffix``, either as a JSON-encoded string or as a dict. Amounts of
    transactions dated within the last seven days are summed and compared
    with ``weekly_budget``; the outcome is printed.

    Records that cannot be used are skipped instead of aborting the whole
    calculation: blank lines and records with no ``transaction_date`` are
    ignored silently, while malformed JSON, non-object payloads, unparsable
    dates and non-numeric amounts are counted and reported.

    Args:
        output_file (str): Path of the JSONL file to read.
        weekly_budget (float): Budget to compare the weekly total against.
    """
    from datetime import datetime, timedelta

    today = datetime.now()
    one_week_ago = today - timedelta(days=7)

    total_spent = 0.0
    skipped_records = 0
    try:
        with open(output_file, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                try:
                    data = json.loads(line)
                    details = data.get("suffix", {}) if isinstance(data, dict) else None
                    # "suffix" may hold the model's reply as a JSON string
                    # rather than a dict; decode it before reading fields.
                    if isinstance(details, str):
                        details = json.loads(details)
                except json.JSONDecodeError:
                    skipped_records += 1
                    continue

                if not isinstance(details, dict):
                    skipped_records += 1
                    continue

                raw_date = details.get("transaction_date")
                if not raw_date:
                    continue

                try:
                    transaction_date = datetime.strptime(raw_date, "%Y-%m-%d")
                    # A missing or null amount counts as zero.
                    amount = float(details.get("amount") or 0)
                except (TypeError, ValueError):
                    skipped_records += 1
                    continue

                # Only count transactions that fall within the past week
                if one_week_ago <= transaction_date <= today:
                    total_spent += amount
    except FileNotFoundError:
        print(f"File '{output_file}' not found.")
        return

    if skipped_records:
        print(f"Skipped {skipped_records} unreadable record(s) in '{output_file}'.")

    print(f"Weekly Spending: £{total_spent:.2f}")
    if total_spent > weekly_budget:
        print(f" Alert: You have exceeded your weekly budget of £{weekly_budget:.2f} by £{total_spent - weekly_budget:.2f}!")
    else:
        print(f" You are within your weekly budget of £{weekly_budget:.2f}.")

# Example usage
# weekly_budget is a required argument; the figure below is only an
# illustrative amount in GBP -- replace it with your own weekly budget.
weekly_budget_alert("train_output.jsonl", weekly_budget=250.0)


In [None]:
pip install beautifulsoup4 requests
from bs4 import BeautifulSoup
import requests

def monitor_market_prices(regular_items, url, timeout=10):
    """
    Scrapes market prices from the given URL and alerts if there are changes
    for regularly purchased items.

    The page is expected to contain ``<div class="item">`` elements, each
    holding a ``<span class="item-name">`` and a ``<span class="price">``.
    This layout is a placeholder and must be adapted to the real site.
    Entries missing either span, or whose price cannot be parsed, are skipped.

    Args:
        regular_items (dict): A dictionary of items and their expected prices, e.g., {"item_name": 10.0}.
        url (str): The URL of the market price page to scrape.
        timeout (float): Seconds to wait for the server before giving up, so
            an unresponsive site cannot hang the notebook.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching market prices: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Simulate scraping: This needs to be adapted to the actual site's structure
    market_prices = {}
    for item in soup.find_all("div", class_="item"):
        name_tag = item.find("span", class_="item-name")
        price_tag = item.find("span", class_="price")
        # find() returns None when the element is absent; skip incomplete entries.
        if name_tag is None or price_tag is None:
            continue

        name = name_tag.text.strip()
        # Drop the currency sign and thousands separators before converting.
        price_text = price_tag.text.strip().replace("£", "").replace(",", "")
        try:
            market_prices[name] = float(price_text)
        except ValueError:
            print(f"Could not parse price '{price_text}' for '{name}'. Skipping.")

    # Compare scraped prices with regular items
    for item, expected_price in regular_items.items():
        current_price = market_prices.get(item)
        # Compare against None explicitly: a price of 0.0 is still a price.
        if current_price is None:
            print(f"'{item}' not found in the market price data.")
        elif current_price != expected_price:
            print(f" Price Alert: '{item}' price changed from £{expected_price:.2f} to £{current_price:.2f}.")

# Example usage: expected prices for the items bought regularly, checked
# against a placeholder URL.
regular_items = dict(Milk=1.50, Bread=1.00, Eggs=2.00)
monitor_market_prices(regular_items, url="https://example.com/market-prices")


In [None]:
# Run the extraction over the receipt dataset with an 80/20 train/validation
# split. Add num_images=3 to the call for a quick trial run.
process_receipts(directory="./Receipt.v1i.paligemma/dataset", validation_split=0.2)


Total images: 1193
Training images: 954
Validation images: 239

Processing training images...

Uploaded file '0036_jpg.rf.463e814e9a2a40548e9c31c8085cf785.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/bckj03q0j4q4
Uploaded file '0036_jpg.rf.463e814e9a2a40548e9c31c8085cf785.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/bckj03q0j4q4

Processed image '0036_jpg.rf.463e814e9a2a40548e9c31c8085cf785.jpg':
{"prefix": "", "suffix": "{\"amount\": 991.7, \"category\": \"Cost of Goods Sold\", \"currency\": \"INR\", \"description\": \"Invoice from Crane Pool & Schmidt Pte Ltd\", \"merchant\": \"Crane Pool & Schmidt Pte Ltd\", \"transaction_date\": \"2022-05-15\", \"transaction_timestamp\": \"2022-05-15 00:00:00\", \"vat_percentage\": 20.0}", "image": "0036_jpg.rf.463e814e9a2a40548e9c31c8085cf785.jpg"}

Uploaded file '93_jpg.rf.88f2f31c8afa52963367cc94b06225aa.jpg' as: https://generativelanguage.googleapis.com/v1beta/files/dvb2vky379xv
Uploaded file '93_jpg.rf.88f