In [1]:
#!pip install openai
#!pip install dotenv

#### This code read file column wise, which leads to problems where some columns have more rows that others

In [32]:
import pandas as pd
import os
import base64
from typing import List
from openai import OpenAI
from pydantic import BaseModel

from dotenv import load_dotenv
HOME_DIR = os.path.expanduser("~")
load_dotenv(f"{HOME_DIR}/.env")

# Step 1: Define the structured response model
class TableData(BaseModel):
    Country: List[str]
    OrgName: List[str]
    OrgType: List[str]
    Description: List[str]
    Amount: List[str]


# Step 2: Load API client
def get_openai_client() -> OpenAI:
    return OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Step 3: Read and encode image to base64
def encode_image_to_base64(image_path: str) -> str:
    with open(image_path, "rb") as img_file:
        return base64.b64encode(img_file.read()).decode("utf-8")

# Step 4: Build the message for GPT
def build_vision_prompt(prompt: str, base64_image: str) -> list:
    return [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{base64_image}"
                    }
                }
            ]
        }
    ]
# Step 5: Define the main prompt text
def get_default_prompt() -> str:
    prompt = """
        You are a data extraction assistant. Your task is to extract structured tabular data from an image of a table.
        The image contains rows with the following five columns:
            - Country
            - Partner Organization Name
            - Organization Type (e.g., NGO, Government)
            - Project Description
            - Amount (USD)
        
        Follow these instructions:
        - Read the entire table from the image.
        - Accurately extract the text in each row into a list.
        - Make sure that all columns have thesame number of rows.
        - Remove any artifacts, duplicated text, or OCR errors.
        - If any data is unclear or incomplete, make a best-effort inference and mark it with [inferred]. 
        - Do not guess numerical values.
        - Only return the JSON object matching the format.
    """
    return prompt


# Step 6: Make structured GPT call
def parse_table_data_from_image(image_path: str, model_name="gpt-4o") -> TableData:
    client = get_openai_client() 
    base64_img = encode_image_to_base64(image_path)
    prompt = get_default_prompt()
    messages = build_vision_prompt(prompt, base64_img)

    completion = client.beta.chat.completions.parse(
        model=model_name,
        messages=messages,
        temperature=0,
        response_format=TableData
    )

    return completion.choices[0].message.parsed


In [41]:
i=3
image_path = f"../data/unfpa_partners/p{i}.png"
response = parse_table_data_from_image(image_path)

In [42]:
table_dict = response.model_dump()
columns = [
    "Country",
    "OrgName",
    "OrgType",
    "Description",
    "Amount"
]
df = pd.DataFrame(table_dict)
#df.columns = columns
df

ValueError: All arrays must be of the same length

In [38]:
df.to_csv(f'../data/unfpa_partners/p{i}.csv', index=False)