<a href="https://colab.research.google.com/github/phuocnguyen90/Random-projects/blob/main/Receipt_OCR_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Install system dependencies for Tesseract OCR and PDF processing
!sudo apt-get update
!sudo apt-get install tesseract-ocr tesseract-ocr-vie poppler-utils -y

# Install Python libraries
!pip install streamlit pyngrok pytesseract pdf2image paddleocr paddlepaddle openai gspread google-auth python-dotenv llama-parse Pillow -q

# Pillow is usually pre-installed, but explicit install ensures it's there.
# Use paddlepaddle (CPU) for simplicity in Colab unless you have a GPU runtime and want GPU acceleration (paddlepaddle-gpu)
# llama-parse is used here as an example API parser. Replace/add others if needed.
# openai is used for the LLM extraction part. Replace/add others (e.g., anthropic, google-generativeai) if needed.

In [None]:
# Cell 2: Get AND Validate Credentials before writing .env

import os
import json
from getpass import getpass
import gspread
from google.oauth2.service_account import Credentials
from google.auth.exceptions import DefaultCredentialsError
import openai
import sys # To check if running in Colab

print("--- Credential Input ---")
print("Please enter your details below.")
print("NOTE: Ensure Cell 1 (Installations) has finished and you have uploaded your Google Service Account JSON file.")

# --- 1. Get User Inputs ---
openai_api_key = getpass('Enter your OpenAI API Key (leave blank if not using): ')
llamaparse_api_key = getpass('Enter your LlamaParse API Key (optional, leave blank if not using): ')
ngrok_auth_token = getpass('Enter your NGROK Auth Token (leave blank if not using): ')
google_sheet_name = input('Enter the exact name of your Google Sheet: ')
google_creds_path = input('Enter the path to your Google Service Account JSON key file (e.g., "my-google-creds.json"): ')

# --- 2. Perform Validations ---
print("\n--- Credential Validation ---")

# Flags to track success
openai_ok = False
llamaparse_provided = bool(llamaparse_api_key) # Just track if provided, full validation is hard here
gsheet_name_ok = False
gcreds_file_ok = False
gcreds_valid_format = False
gsheet_access_ok = False
apis_enabled_ok = True # Assume okay initially, check during gsheet access

# --- OpenAI Key Check ---
if openai_api_key:
    print("Checking OpenAI API Key...")
    try:
        # Simple test: List models (low cost)
        temp_client = openai.OpenAI(api_key=openai_api_key)
        temp_client.models.list()
        openai_ok = True
        print("✅ OpenAI API Key seems valid.")
    except openai.AuthenticationError:
        print("❌ ERROR: OpenAI API Key is invalid (AuthenticationError).")
    except Exception as e:
        print(f"⚠️ WARNING: Could not fully validate OpenAI key ({type(e).__name__}). Check connectivity or key later.")
        # We might still proceed but warn the user
else:
    print("ℹ️ INFO: OpenAI API Key not provided. LLM extraction will be disabled.")
    openai_ok = True # Treat as 'ok' in terms of not blocking .env creation

# --- LlamaParse Key Check (Basic) ---
if llamaparse_provided:
    # Hard to validate without making a real parsing call. Just acknowledge.
    print("ℹ️ INFO: LlamaParse API Key provided (basic check only).")
else:
    print("ℹ️ INFO: LlamaParse API Key not provided. LlamaParse option will be disabled.")

# --- Google Sheet Name Check ---
if google_sheet_name and google_sheet_name.strip():
    gsheet_name_ok = True
    print(f"✅ Google Sheet Name provided: '{google_sheet_name}'")
else:
    print("❌ ERROR: Google Sheet Name cannot be empty.")

# --- Google Credentials File Existence Check ---
# Check if running in Colab or similar environment where path matters directly
is_colab = 'google.colab' in sys.modules
if is_colab and not google_creds_path.startswith('/content/'):
     print(f"⚠️ WARNING: Credential path '{google_creds_path}' doesn't start with '/content/'. Ensure it's the correct path in the Colab file browser.")

if os.path.exists(google_creds_path):
    gcreds_file_ok = True
    print(f"✅ Google Credentials file found at: '{google_creds_path}'")

    # --- Google Credentials Format and Auth Check ---
    print("Checking Google Credentials and Sheet access...")
    try:
        scopes = ["https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/drive"]
        creds = Credentials.from_service_account_file(google_creds_path, scopes=scopes)
        gcreds_valid_format = True
        print("✅ Google Credentials file format seems valid.")

        # --- Google Sheet Access Check ---
        try:
            client = gspread.authorize(creds)
            # Try opening the sheet - this checks sharing and sheet existence
            spreadsheet = client.open(google_sheet_name)
            # Optionally, try accessing a property to be more certain
            _ = spreadsheet.sheet1.title
            gsheet_access_ok = True
            print(f"✅ Successfully accessed Google Sheet: '{google_sheet_name}'")

        except gspread.exceptions.SpreadsheetNotFound:
            print(f"❌ ERROR: Google Sheet '{google_sheet_name}' not found. Check the name and ensure it's shared with the service account email: {creds.service_account_email}")
        except gspread.exceptions.APIError as e:
            apis_enabled_ok = False # Mark API enablement as failed
            if 'insufficient authentication scopes' in str(e) or '403' in str(e):
                 print(f"❌ ERROR: Google API Error (403 - Likely scopes/permissions). Ensure 'Google Sheets API' and 'Google Drive API' are ENABLED in your Google Cloud project.")
                 print(f"Service account email: {creds.service_account_email}")
            else:
                 print(f"❌ ERROR: Google API Error accessing sheet: {e}")
                 print(f"Service account email: {creds.service_account_email}")
        except Exception as e: # Catch other potential gspread errors
            print(f"❌ ERROR: Failed to authorize or access Google Sheet: {e}")
            print(f"Service account email: {creds.service_account_email if 'creds' in locals() else 'N/A'}")


    except FileNotFoundError: # Should not happen if os.path.exists passed, but belt-and-suspenders
        print(f"❌ ERROR: Google Credentials file disappeared after initial check? Path: '{google_creds_path}'")
        gcreds_file_ok = False # Correct flag
    except (json.JSONDecodeError, ValueError) as e: # Catch bad JSON format or structure issues
        print(f"❌ ERROR: Google Credentials file is not valid JSON or has incorrect structure: {e}")
    except DefaultCredentialsError as e:
        print(f"❌ ERROR: Google Authentication failed. Could not find credentials: {e}")
    except Exception as e: # Catch-all for other credential loading issues
        print(f"❌ ERROR: Failed to load Google Credentials: {e}")

else:
    print(f"❌ ERROR: Google Credentials file not found at '{google_creds_path}'. Please upload it and ensure the path is correct.")
    # Ensure subsequent checks depending on the file don't run implicitly

# --- 3. Decide and Write .env File ---
print("\n--- Summary & .env Creation ---")

# Define essential checks for writing the .env file
# We MUST have the GSheet name and a valid-looking, accessible GCreds file + sheet access
essentials_ok = gsheet_name_ok and gcreds_file_ok and gcreds_valid_format and gsheet_access_ok and apis_enabled_ok

if essentials_ok:
    print("✅ All essential Google credentials validated successfully.")
    if not openai_ok and openai_api_key:
        print("⚠️ WARNING: OpenAI API Key validation failed or was skipped. LLM features might not work.")

    try:
        with open(".env", "w") as f:
            if openai_api_key: # Only write if provided
                f.write(f"OPENAI_API_KEY={openai_api_key}\n")
            if llamaparse_api_key: # Only write if provided
                f.write(f"LLAMAPARSE_API_KEY={llamaparse_api_key}\n")
            if ngrok_auth_token:
                f.write(f"NGROK_AUTH_TOKEN={ngrok_auth_token}\n")
            f.write(f"GOOGLE_SHEET_NAME={google_sheet_name}\n")
            f.write(f"GOOGLE_CREDS_PATH={google_creds_path}\n")
        print("✅ .env file created successfully.")
        print("You can now proceed to run Cell 3 (Write app.py) and Cell 4 (Run Streamlit).")
    except Exception as e:
        print(f"❌ ERROR: Failed to write .env file: {e}")

else:
    print("❌ ERROR: Essential credential validation failed. Cannot write .env file.")
    print("Please review the errors above, correct the inputs or configurations (like API enablement or sheet sharing), and rerun this cell.")
    if not gsheet_name_ok: print("   - Issue: Google Sheet Name missing.")
    if not gcreds_file_ok: print("   - Issue: Google Credentials file path incorrect or file missing.")
    if gcreds_file_ok and not gcreds_valid_format: print("   - Issue: Google Credentials file format invalid.")
    if gcreds_valid_format and not apis_enabled_ok: print("   - Issue: Google Sheets/Drive API likely not enabled in Cloud project.")
    if gcreds_valid_format and apis_enabled_ok and not gsheet_access_ok: print("   - Issue: Google Sheet not found or not shared correctly with service account.")

--- Credential Input ---
Please enter your details below.
NOTE: Ensure Cell 1 (Installations) has finished and you have uploaded your Google Service Account JSON file.
Enter your OpenAI API Key (leave blank if not using): ··········
Enter your LlamaParse API Key (optional, leave blank if not using): ··········
Enter the exact name of your Google Sheet: receipt
Enter the path to your Google Service Account JSON key file (e.g., "my-google-creds.json"): /content/elite-height-454109-g4-33c0427ef366.json

--- Credential Validation ---
Checking OpenAI API Key...
✅ OpenAI API Key seems valid.
ℹ️ INFO: LlamaParse API Key provided (basic check only).
✅ Google Sheet Name provided: 'receipt'
✅ Google Credentials file found at: '/content/elite-height-454109-g4-33c0427ef366.json'
Checking Google Credentials and Sheet access...
✅ Google Credentials file format seems valid.
✅ Successfully accessed Google Sheet: 'receipt'

--- Summary & .env Creation ---
✅ All essential Google credentials validated su

In [None]:
%%writefile app.py
import streamlit as st
import os
import json
from PIL import Image
import pytesseract
from pdf2image import convert_from_bytes
from paddleocr import PaddleOCR
import openai
import gspread
from google.oauth2.service_account import Credentials
from llama_parse import LlamaParse # Example API Parser
from dotenv import load_dotenv
import io
import datetime
import pandas as pd
import numpy as np # Paddle requires numpy
import asyncio # For potential async needs like LlamaParse

# --- Page Config MUST BE THE FIRST STREAMLIT COMMAND ---
st.set_page_config(layout="wide") # <-- MOVED HERE

# --- Configuration & Initialization ---
load_dotenv() # Load environment variables from .env file

# Get credentials from environment variables
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LLAMAPARSE_API_KEY = os.getenv("LLAMAPARSE_API_KEY")
GOOGLE_SHEET_NAME = os.getenv("GOOGLE_SHEET_NAME")
GOOGLE_CREDS_PATH = os.getenv("GOOGLE_CREDS_PATH")
NGROK_AUTH_TOKEN = os.getenv("NGROK_AUTH_TOKEN")

# --- Helper Functions (Updated for Vietnamese) ---
# Define functions before they are called during initialization if needed

def process_image_tesseract(image_bytes):
    """Performs OCR on image bytes using Tesseract for Vietnamese."""
    try:
        image = Image.open(io.BytesIO(image_bytes))
        # Use 'vie' for Vietnamese language pack
        text = pytesseract.image_to_string(image, lang='vie')
        return text
    except Exception as e:
        st.error(f"Tesseract OCR failed: {e}")
        return None

def process_pdf_tesseract(pdf_bytes):
    """Converts PDF to images and performs OCR using Tesseract for Vietnamese."""
    text = ""
    try:
        images = convert_from_bytes(pdf_bytes)
        for i, image in enumerate(images):
            st.write(f"Processing PDF page {i+1} with Tesseract (Vietnamese)...")
            # Use 'vie' for Vietnamese language pack
            text += pytesseract.image_to_string(image, lang='vie') + "\n\n"
        return text
    except Exception as e:
        st.error(f"Tesseract PDF processing failed: {e}")
        return None

def process_image_paddle(image_bytes, paddle_ocr_instance):
    """Performs OCR on image bytes using PaddleOCR (already configured for Vietnamese)."""
    if paddle_ocr_instance is None:
        st.error("PaddleOCR is not initialized.")
        return None
    try:
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        image_np = np.array(image)
        result = paddle_ocr_instance.ocr(image_np, cls=True)
        text = ""
        if result and result[0]:
            for line in result[0]:
                # line[1][0] is the recognized text
                text += line[1][0] + "\n"
        return text
    except Exception as e:
        st.error(f"PaddleOCR failed: {e}")
        return None

def process_pdf_paddle(pdf_bytes, paddle_ocr_instance):
    """Converts PDF to images and performs OCR using PaddleOCR (already configured for Vietnamese)."""
    if paddle_ocr_instance is None:
        st.error("PaddleOCR is not initialized.")
        return None
    text = ""
    try:
        images = convert_from_bytes(pdf_bytes)
        for i, image in enumerate(images):
            st.write(f"Processing PDF page {i+1} with PaddleOCR (Vietnamese)...")
            image_np = np.array(image.convert("RGB"))
            result = paddle_ocr_instance.ocr(image_np, cls=True)
            if result and result[0]:
                for line in result[0]:
                    text += line[1][0] + "\n"
            text += "\n\n" # Add separator between pages
        return text
    except Exception as e:
        st.error(f"PaddleOCR PDF processing failed: {e}")
        return None

def process_file_llamaparse(file_bytes, filename, parser_instance):
    """Uses LlamaParse API to extract text from image or PDF."""
    if parser_instance is None:
        st.error("LlamaParse API key not found or parser not initialized.")
        return None
    try:
        temp_filepath = f"./temp_{filename}"
        with open(temp_filepath, "wb") as f:
            f.write(file_bytes)

        # Using asyncio properly within Streamlit can be tricky.
        # This approach tries to get/create an event loop.
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

        documents = loop.run_until_complete(parser_instance.aload_data(temp_filepath))

        os.remove(temp_filepath) # Clean up temp file
        if documents:
            return documents[0].text # Assuming result is a list of documents
        else:
            return None
    except Exception as e:
        st.error(f"LlamaParse API call failed: {e}")
        # Ensure cleanup even on failure
        if 'temp_filepath' in locals() and os.path.exists(temp_filepath):
            os.remove(temp_filepath)
        return None

def extract_data_with_llm(text):
    """Uses OpenAI's LLM to extract structured data from Vietnamese receipt text."""
    if not OPENAI_API_KEY:
        st.error("OpenAI API Key not configured.")
        return None
    if not text or not text.strip():
        st.warning("No text provided for extraction.")
        return None

    # Updated prompt for Vietnamese context
    prompt = f"""
    You are an expert assistant specialized in extracting information from Vietnamese receipts.
    The following text was extracted from a receipt, likely in Vietnamese.
    Extract the key information and format the output as a single JSON object.
    Use the exact English keys provided below. If a value is not found, use null or an empty string "".

    Keys to extract:
    - 'buyer_name': Name of the customer/buyer (Tên khách hàng).
    - 'buyer_address': Address of the customer/buyer (Địa chỉ khách hàng).
    - 'buyer_contact': Phone number or email of the customer/buyer (SĐT/Email khách hàng).
    - 'receipt_date': Date the receipt was issued (Ngày hóa đơn). Format this as YYYY-MM-DD. If the date is like DD/MM/YYYY or DD-MM-YYYY, convert it.
    - 'store_name': Name of the store/vendor (Tên cửa hàng / Đơn vị bán).
    - 'store_address': Address of the store/vendor (Địa chỉ cửa hàng).
    - 'total_amount': The final total amount paid (Tổng cộng / Tổng thanh toán). Provide only the numerical value, removing currency symbols like 'đ' or 'VND' and thousand separators like '.' or ','.
    - 'items': A list of items purchased. Each item MUST be an object with 'description' (Tên hàng / Mô tả), 'quantity' (Số lượng - SL), and 'price' (Đơn giá or Thành tiền). Extract numerical values for quantity and price.

    Important Notes:
    - The text is in Vietnamese. Pay attention to Vietnamese names, addresses, and date formats (DD/MM/YYYY).
    - For 'total_amount', 'quantity', and 'price', extract only numbers. Handle separators (like '.' for thousands in VND) correctly. For example, '50.000 đ' should become 50000.
    - Output only the JSON object, nothing else before or after it.

    Receipt Text (Vietnamese):
    ---
    {text}
    ---

    JSON Output:
    """

    try:
        response = openai.chat.completions.create(
            model="gpt-4o", # Consider GPT-4o or GPT-4 Turbo for better Vietnamese understanding
            messages=[
                {"role": "system", "content": "You are an expert receipt data extraction assistant specializing in Vietnamese documents."},
                {"role": "user", "content": prompt}
            ],
            response_format={"type": "json_object"}, # Enforce JSON output
            temperature=0.1, # Lower temperature for more deterministic output
        )
        content = response.choices[0].message.content
        extracted_data = json.loads(content)
        return extracted_data
    except json.JSONDecodeError:
        st.error(f"LLM returned invalid JSON. Raw content:\n```\n{content}\n```")
        return {"raw_llm_output": content} # Return raw content for debugging
    except Exception as e:
        st.error(f"OpenAI API call failed: {e}")
        return None

# Find this function within %%writefile app.py

def connect_to_gsheet():
    """Connects to Google Sheet using service account credentials."""
    if not GOOGLE_CREDS_PATH or not GOOGLE_SHEET_NAME:
        # Use st.toast for less intrusive messages if preferred
        st.error("Google Sheet name or credentials path not configured.")
        return None, None
    try:
        if not os.path.exists(GOOGLE_CREDS_PATH):
             st.error(f"Google credentials file not found at: {GOOGLE_CREDS_PATH}. Please upload it to the Colab environment.")
             return None, None

        # --- THIS IS THE LINE TO CHANGE ---
        scopes = ["https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/drive"] # <-- ADDED DRIVE SCOPE
        # --- END OF CHANGE ---

        creds = Credentials.from_service_account_file(GOOGLE_CREDS_PATH, scopes=scopes)
        client = gspread.authorize(creds)
        try:
             sheet = client.open(GOOGLE_SHEET_NAME).sheet1
             return sheet, client
        except gspread.exceptions.SpreadsheetNotFound:
             st.error(f"Google Sheet '{GOOGLE_SHEET_NAME}' not found. Ensure the name is exact and the sheet is shared with the service account email.")
             return None, None
        except gspread.exceptions.APIError as api_e:
             # Check if it's specifically a scope error even with both scopes
             if 'insufficient authentication scopes' in str(api_e) or '403' in str(api_e):
                 st.error(f"❌ Google API Error (403): Still insufficient scopes reported. Double-check 'Google Sheets API' and 'Google Drive API' are ENABLED in Google Cloud Console for project '{creds.project_id}'.")
             else:
                 st.error(f"Google API Error accessing sheet '{GOOGLE_SHEET_NAME}': {api_e}. Check permissions and API enablement.")
             return None, None

    except Exception as e:
        st.error(f"Failed to connect to Google Sheets: {e}")
        return None, None

# Make sure the rest of Cell 3 remains the same

def append_to_gsheet(sheet, data):
    """Appends extracted data as a new row in the Google Sheet."""
    try:
        headers = [
            'Extraction Date', 'Buyer Name', 'Buyer Address', 'Buyer Contact',
            'Receipt Date', 'Store Name', 'Store Address', 'Total Amount',
            'Items JSON'
        ]
        header_row = []
        try:
            header_row = sheet.row_values(1)
        except gspread.exceptions.APIError as e:
            st.warning(f"Could not read header row: {e}. Assuming sheet is empty or permission issue.")

        if not header_row:
             sheet.append_row(headers)
             st.info("Added header row to empty Google Sheet.")
        elif header_row != headers:
             st.warning("Sheet headers don't match expected headers. Appending based on defined order. Please check your Google Sheet columns.")

        row = [
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            data.get('buyer_name', ''),
            data.get('buyer_address', ''),
            data.get('buyer_contact', ''),
            data.get('receipt_date', ''),
            data.get('store_name', ''),
            data.get('store_address', ''),
            data.get('total_amount', ''),
            json.dumps(data.get('items', []), ensure_ascii=False)
        ]
        sheet.append_row(row)
        return True
    except gspread.exceptions.APIError as e:
         st.error(f"Google Sheets API error during append: {e}")
         return False
    except Exception as e:
        st.error(f"Failed to append data to Google Sheet: {e}")
        return False

# --- Initialize Clients AFTER set_page_config ---
# We wrap initialization in functions or use st.cache_resource for better practice

@st.cache_resource # Cache the initialized OCR object
def get_paddle_ocr():
    try:
        paddle_ocr_instance = PaddleOCR(use_angle_cls=True, lang='vi', use_gpu=False, show_log=False)
        print("PaddleOCR initialized successfully for Vietnamese.") # Use print for logs during init
        return paddle_ocr_instance
    except Exception as e:
        st.warning(f"Could not initialize PaddleOCR: {e}. PaddleOCR option will be disabled.")
        return None

@st.cache_resource # Cache the initialized LlamaParse object
def get_llama_parser():
    if LLAMAPARSE_API_KEY:
        try:
            parser_instance = LlamaParse(api_key=LLAMAPARSE_API_KEY, result_type="text")
            print("LlamaParse parser initialized successfully.")
            return parser_instance
        except Exception as e:
             st.warning(f"Could not initialize LlamaParse: {e}. LlamaParse option disabled.")
             return None
    else:
        # No need for a warning here, handled later in the UI logic
        # st.sidebar.warning("LlamaParse API Key not found. LlamaParse option disabled.")
        return None

# Initialize clients using the cached functions
paddle_ocr_instance = get_paddle_ocr()
llama_parser_instance = get_llama_parser()

# Configure OpenAI client (safe to do here)
if OPENAI_API_KEY:
    openai.api_key = OPENAI_API_KEY
else:
    # We'll show this warning in the UI instead of here
    # st.warning("OpenAI API Key not found. LLM extraction will not work.")
    pass


# --- Streamlit App UI ---

st.title("🧾 Vietnamese Receipt OCR & Data Extraction")
st.info("Configured for Vietnamese language receipts (using Tesseract 'vie', PaddleOCR 'vi').")

# Initialize session state (safe to do after set_page_config)
if 'ocr_text' not in st.session_state:
    st.session_state.ocr_text = None
if 'extracted_data' not in st.session_state:
    st.session_state.extracted_data = None
if 'file_processed' not in st.session_state:
    st.session_state.file_processed = False
if 'confirmed_data' not in st.session_state:
    st.session_state.confirmed_data = None

# --- Sidebar for Configuration ---
with st.sidebar:
    st.header("Configuration")
    uploaded_file = st.file_uploader("Upload Receipt (Image or PDF)", type=["png", "jpg", "jpeg", "pdf"])

    # Determine available OCR options based on successful initialization
    ocr_options = ['Tesseract (Local)'] # Tesseract is assumed available via apt-get
    if paddle_ocr_instance:
        ocr_options.insert(1, 'PaddleOCR (Local)')
    else:
        st.sidebar.warning("PaddleOCR option disabled (initialization failed).")

    if llama_parser_instance:
        ocr_options.append('LlamaParse (API)')
    elif LLAMAPARSE_API_KEY: # Key provided but init failed
         st.sidebar.warning("LlamaParse option disabled (initialization failed).")
    else: # Key not provided
         st.sidebar.info("LlamaParse option disabled (API key not configured).")


    if not ocr_options:
         st.sidebar.error("No OCR engines available!")
         ocr_method = None
    else:
         # Set default index carefully, e.g., default to Paddle if available, else Tesseract
         default_ocr_index = 0
         if 'PaddleOCR (Local)' in ocr_options:
              default_ocr_index = ocr_options.index('PaddleOCR (Local)')

         ocr_method = st.radio(
             "Choose OCR Method:",
             options=ocr_options,
             index=default_ocr_index,
             help="Select the engine to extract text. Tesseract & Paddle configured for Vietnamese."
         )

    run_ocr = st.button("1. Run OCR", disabled=(uploaded_file is None or ocr_method is None))

    st.markdown("---")
    # Disable extraction button if no OCR text OR if OpenAI key is missing
    openai_ready = bool(OPENAI_API_KEY)
    run_extraction = st.button("2. Extract Data with LLM",
                               disabled=(st.session_state.ocr_text is None or not openai_ready))
    if not openai_ready:
         st.sidebar.error("OpenAI API Key not configured. LLM Extraction disabled.")


    st.markdown("---")
    # Google Sheet Config Info & Warnings
    st.header("Google Sheet Export")
    st.info(f"Sheet Name: {GOOGLE_SHEET_NAME or 'Not Set'}")
    st.info(f"Credentials: {GOOGLE_CREDS_PATH or 'Not Set'}")
    creds_file_exists = os.path.exists(GOOGLE_CREDS_PATH) if GOOGLE_CREDS_PATH else False
    gsheet_ready_for_export = GOOGLE_SHEET_NAME and GOOGLE_CREDS_PATH and creds_file_exists
    if not gsheet_ready_for_export:
         warning_msg = "Export disabled: "
         if not GOOGLE_SHEET_NAME: warning_msg += "Sheet Name missing. "
         if not GOOGLE_CREDS_PATH: warning_msg += "Creds Path missing. "
         if GOOGLE_CREDS_PATH and not creds_file_exists: warning_msg += f"Creds file not found. "
         st.warning(warning_msg)


# --- Main Area ---
col1, col2 = st.columns(2)

with col1:
    st.subheader("Uploaded File & OCR Text")
    if uploaded_file is not None:
        file_bytes = uploaded_file.getvalue()
        file_name = uploaded_file.name
        file_type = uploaded_file.type

        if file_type.startswith("image"):
            try:
                st.image(file_bytes, caption="Uploaded Receipt Image", use_column_width=True)
            except Exception as img_e:
                st.warning(f"Could not display image preview: {img_e}")
        elif file_type == "application/pdf":
            st.info(f"Uploaded PDF: {file_name}.")

        if run_ocr and ocr_method:
            st.session_state.ocr_text = None
            st.session_state.extracted_data = None
            st.session_state.confirmed_data = None
            st.session_state.file_processed = True

            with st.spinner(f"Running {ocr_method} (Vietnamese)..."):
                if ocr_method == 'Tesseract (Local)':
                    if file_type.startswith("image"):
                        st.session_state.ocr_text = process_image_tesseract(file_bytes)
                    elif file_type == "application/pdf":
                        st.session_state.ocr_text = process_pdf_tesseract(file_bytes)
                elif ocr_method == 'PaddleOCR (Local)':
                    # Pass the cached instance
                    if file_type.startswith("image"):
                        st.session_state.ocr_text = process_image_paddle(file_bytes, paddle_ocr_instance)
                    elif file_type == "application/pdf":
                        st.session_state.ocr_text = process_pdf_paddle(file_bytes, paddle_ocr_instance)
                elif ocr_method == 'LlamaParse (API)':
                    # Pass the cached instance
                     st.session_state.ocr_text = process_file_llamaparse(file_bytes, file_name, llama_parser_instance)

            if st.session_state.ocr_text and st.session_state.ocr_text.strip():
                st.success("OCR Completed!")
            else:
                st.error("OCR failed or produced no text. Try a different OCR method or check the file.")
                st.session_state.ocr_text = None

    if st.session_state.ocr_text:
        with st.expander("Show OCR Text", expanded=False):
            st.text_area("OCR Output", st.session_state.ocr_text, height=300, key="ocr_output_area")
    elif st.session_state.file_processed and not st.session_state.ocr_text:
          st.warning("No OCR text was generated from the file.")


with col2:
    st.subheader("Extracted Data & Confirmation")

    if run_extraction and st.session_state.ocr_text:
        st.session_state.extracted_data = None
        st.session_state.confirmed_data = None
        with st.spinner("Calling LLM for data extraction (Vietnamese context)..."):
            st.session_state.extracted_data = extract_data_with_llm(st.session_state.ocr_text)

        if st.session_state.extracted_data:
             if "raw_llm_output" in st.session_state.extracted_data:
                 st.warning("LLM did not return valid JSON. Cannot populate form.")
                 with st.expander("Show Raw LLM Output"):
                     st.code(st.session_state.extracted_data.get("raw_llm_output", ""), language=None)
                 st.session_state.extracted_data = {}
             else:
                 st.success("Data Extraction Attempted by LLM.")
                 st.write("Review and edit the extracted data below:")
        else:
             st.error("LLM Data Extraction Failed.")

    # Check if data exists and is not the raw error dict
    if st.session_state.extracted_data and "raw_llm_output" not in st.session_state.extracted_data:
        with st.form("confirmation_form"):
            st.write("### Confirm Extracted Details")
            confirmed = {}

            # --- Form Fields ---
            c1, c2 = st.columns(2)
            with c1:
                confirmed['buyer_name'] = st.text_input("Buyer Name", value=st.session_state.extracted_data.get('buyer_name', ''))
                confirmed['buyer_address'] = st.text_area("Buyer Address", value=st.session_state.extracted_data.get('buyer_address', ''), height=100)
                confirmed['buyer_contact'] = st.text_input("Buyer Contact", value=st.session_state.extracted_data.get('buyer_contact', ''))
                # Date handling
                default_date_str = st.session_state.extracted_data.get('receipt_date', '')
                default_date = None
                if default_date_str:
                    try:
                        default_date = datetime.datetime.strptime(default_date_str, '%Y-%m-%d').date()
                    except ValueError:
                        st.warning(f"LLM date '{default_date_str}' not YYYY-MM-DD. Please verify.")
                        # Attempt common formats if needed, or leave blank for manual input
                confirmed['receipt_date'] = st.date_input("Receipt Date", value=default_date)

            with c2:
                confirmed['store_name'] = st.text_input("Store Name", value=st.session_state.extracted_data.get('store_name', ''))
                confirmed['store_address'] = st.text_area("Store Address", value=st.session_state.extracted_data.get('store_address', ''), height=100)
                # Total Amount handling
                default_total_val = st.session_state.extracted_data.get('total_amount') # Keep as is from JSON initially
                default_total_float = 0.0
                if default_total_val is not None:
                    try:
                        default_total_float = float(default_total_val)
                    except (ValueError, TypeError):
                         st.warning(f"Could not parse total amount: '{default_total_val}'. Defaulting to 0.0.")
                confirmed['total_amount'] = st.number_input("Total Amount (VND)", value=default_total_float, format="%.0f", step=1.0)


            st.write("### Items Purchased")
            items_list = st.session_state.extracted_data.get('items', [])
            if not isinstance(items_list, list):
                 st.warning(f"Items data is not a list (found {type(items_list)}). Displaying empty editor.")
                 items_list = []

            try:
                items_df = pd.DataFrame(items_list)
            except Exception as df_err: # Broad exception for complex DataFrame init issues
                st.error(f"Could not create DataFrame from items list: {df_err}. Items might be malformed.")
                items_df = pd.DataFrame(columns=['description', 'quantity', 'price'])

            required_cols = ['description', 'quantity', 'price']
            for col in required_cols:
                if col not in items_df.columns:
                    items_df[col] = pd.NA # Use pandas NA for missing values

            # Coerce to appropriate types, handling potential errors
            items_df['description'] = items_df['description'].astype(str).fillna('')
            items_df['quantity'] = pd.to_numeric(items_df['quantity'], errors='coerce').fillna(0).astype(int)
            items_df['price'] = pd.to_numeric(items_df['price'], errors='coerce').fillna(0.0).astype(float)

            # Ensure column order
            items_df = items_df[required_cols]

            edited_items_df = st.data_editor(
                items_df,
                num_rows="dynamic",
                column_config={
                     "quantity": st.column_config.NumberColumn("Quantity", format="%d", step=1),
                     "price": st.column_config.NumberColumn("Price (VND)", format="%.0f", step=1.0),
                     "description": st.column_config.TextColumn("Description", width="large")
                },
                key="items_editor",
                use_container_width=True # Make editor wider
             )
            confirmed['items'] = edited_items_df.to_dict('records')

            # Final data prep before saving
            if confirmed.get('receipt_date'):
                confirmed['receipt_date'] = confirmed['receipt_date'].strftime('%Y-%m-%d')
            else:
                 confirmed['receipt_date'] = ''

            # Ensure total amount is stored appropriately (e.g., as number or string if GSheet prefers)
            # Keeping it numeric as obtained from number_input is usually fine for gspread
            # confirmed['total_amount'] = confirmed.get('total_amount', 0.0)


            # --- Form Submission ---
            submit_button = st.form_submit_button(
                 "Confirm & Save to Google Sheet",
                 disabled=not gsheet_ready_for_export # Use the flag calculated earlier
             )

            if submit_button:
                st.session_state.confirmed_data = confirmed
                st.write("Data confirmed. Attempting to save to Google Sheet...")

                sheet, client = connect_to_gsheet()
                if sheet:
                    with st.spinner("Appending data to Google Sheet..."):
                        success = append_to_gsheet(sheet, st.session_state.confirmed_data)
                        if success:
                            st.success(f"Data successfully appended to Google Sheet '{GOOGLE_SHEET_NAME}'!")
                            # Consider clearing state or using st.rerun() here if desired
                        else:
                            st.error("Failed to save data to Google Sheet. Check previous error messages.")
                else:
                    st.error("Cannot save data. Google Sheet connection failed. Check configuration and permissions.")

    elif run_extraction and not st.session_state.ocr_text:
         st.warning("Please run OCR first to generate text for extraction.")

Writing app.py


In [5]:
# Install dependencies

from pyngrok import ngrok

# If you have an authtoken, uncomment and set it:

ngrok.set_auth_token(NGROK_AUTH_TOKEN)
# Start Streamlit in background
import subprocess
import threading

def run_streamlit():
    subprocess.run(['streamlit', 'run', '--server.port', '8501', 'app.py'])

thread = threading.Thread(target=run_streamlit, daemon=True)
thread.start()

# Set up ngrok tunnel
public_url = ngrok.connect(addr='8501', proto='http')
print("Your Streamlit app is available at:", public_url)

# Keep the Colab runtime alive
import time
while True:
    time.sleep(10)

Your Streamlit app is available at: NgrokTunnel: "https://5bbb-34-48-54-26.ngrok-free.app" -> "http://localhost:8501"


KeyboardInterrupt: 