In [24]:
import json
from pathlib import Path
import base64
from getpass import getpass

from IPython.display import Image
import instructor
from openai import OpenAI
import pandas as pd
from pydantic import BaseModel
from datetime import date
import os
from dotenv import load_dotenv

# Let python-dotenv populate the environment, then read the API key from the
# OPENAI_API_KEY variable and build the client used by the plain-JSON extraction.
load_dotenv()
openai_key = os.environ.get("OPENAI_API_KEY")
openai_client = OpenAI(api_key=openai_key)


In [20]:
# Input and output folders under "dane_gaz".
# NOTE: the following cell rebinds both names to the "zad_dom" folders, so these
# values only matter if that cell is skipped.
RAW_DATA_PATH = Path("dane_gaz", "raw")
PROCESSED_DATA_PATH = Path("dane_gaz", "processed")

In [25]:
# Folders used by the cells below: PNG inputs are globbed from RAW_DATA_PATH and
# JSON results are written to (and later read back from) PROCESSED_DATA_PATH.
RAW_DATA_PATH = Path("zad_dom", "raw_inv")
PROCESSED_DATA_PATH = Path("zad_dom", "processed_inv")

In [26]:
# List the input images. glob() yields entries in an OS-dependent order, so the
# paths are sorted to keep the listing stable between runs (the structured
# extraction loop further down iterates over sorted paths as well).
for image_path in sorted(RAW_DATA_PATH.glob("*.png")):
    print(image_path)

zad_dom\raw_inv\zad_domowe__invoice_1.png
zad_dom\raw_inv\zad_domowe__invoice_10.png
zad_dom\raw_inv\zad_domowe__invoice_2.png
zad_dom\raw_inv\zad_domowe__invoice_3.png
zad_dom\raw_inv\zad_domowe__invoice_4.png
zad_dom\raw_inv\zad_domowe__invoice_5.png
zad_dom\raw_inv\zad_domowe__invoice_6.png
zad_dom\raw_inv\zad_domowe__invoice_7.png
zad_dom\raw_inv\zad_domowe__invoice_8.png
zad_dom\raw_inv\zad_domowe__invoice_9.png


In [None]:
# Preview one input image. The file name is taken from what is actually present
# in RAW_DATA_PATH rather than hard-coded, so the cell keeps working when the
# data folder changes.
sample_image_path = min(RAW_DATA_PATH.glob("*.png"), default=None)
if sample_image_path is None:
    raise FileNotFoundError(f"No PNG files found in {RAW_DATA_PATH}")
Image(filename=str(sample_image_path))

In [16]:
def prepare_image_for_open_ai(image_path: Path) -> str:
    """Encode an image file as a base64 ``data:`` URL.

    Args:
        image_path: Location of the image file to read. The payload is always
            labelled ``image/png``, whatever the file actually contains.

    Returns:
        A string of the form ``data:image/png;base64,<payload>``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Path(...) keeps plain string paths working, as open() did before.
    raw_bytes = Path(image_path).read_bytes()
    encoded = base64.b64encode(raw_bytes).decode("utf-8")
    return f"data:image/png;base64,{encoded}"


In [None]:
# Prompt sent with every image: lists the expected fields and asks for bare JSON.
SIMPLE_EXTRACTION_PROMPT = """
wyciągnij wszystkie informacje zawarte na fakturze.
Dane przedstaw w formacie JSON.
Oczekuję następujących informacji:
{
    "razem_sprzedaż_okres_rozliczeniowy_data_od": ...,
    "razem_sprzedaż_okres_rozliczeniowy_data_do": ...,
    "zużycie_m3": ...,
    "zużycie_kWh": ...,
    "do_zapłaty": ...,
    "termin_płatności": ...,
}
tylko dane jako JSON, bez żadnych komentarzy.
"""

# The results are written below; make sure the target folder exists first.
PROCESSED_DATA_PATH.mkdir(parents=True, exist_ok=True)

# Sorted so that images are processed in a stable order between runs.
for image_path in sorted(RAW_DATA_PATH.glob('*.png')):
    print(f"Precessing {image_path}")

    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": SIMPLE_EXTRACTION_PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": prepare_image_for_open_ai(image_path),
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
    )

    raw_answer = response.choices[0].message.content
    if raw_answer is None:
        # The message carried no text, so there is nothing to save for this image.
        print(f"No text content returned for {image_path}; skipping")
        continue

    # Drop the Markdown code fence the model may wrap around the JSON.
    result = raw_answer.replace("```json", "").replace("```", "").strip()

    # Explicit UTF-8: the requested keys contain Polish diacritics, which the
    # platform default encoding may not be able to represent.
    output_path = PROCESSED_DATA_PATH / f"{image_path.stem}__simple.json"
    output_path.write_text(result, encoding="utf-8")


In [None]:
# Schema for the structured extraction: it is passed as `response_model` to the
# instructor client in the extraction loop, and each result is saved with
# model_dump_json().
# The English glosses below are a reading of the Polish field names only —
# TODO confirm them (and the units) against the documents being parsed.
class GasBillInfo(BaseModel):
    okres_rozliczeniowy_od: date  # presumably: billing period start
    okres_rozliczeniowy_do: date  # presumably: billing period end
    zuzycie_m3: float  # presumably: consumption in cubic metres
    zuzycie_kWh: float  # presumably: consumption in kWh
    do_zaplaty: float  # presumably: amount due; plotted on the y axis at the end
    termin_platnosci: date  # presumably: payment deadline; used for sorting and the x axis


# Client used for the structured extraction (requests carry a `response_model`).
# The API key lives in `openai_key`, set in the first cell; the previously
# referenced name `api_key` is not defined anywhere in this notebook.
instructor_openai_client = instructor.from_openai(OpenAI(api_key=openai_key))

In [None]:
# The results are written below; make sure the target folder exists first so the
# loop does not fail with FileNotFoundError on a clean checkout.
PROCESSED_DATA_PATH.mkdir(parents=True, exist_ok=True)

for image_path in sorted(RAW_DATA_PATH.glob('*.png')):
    print(f"Precessing {image_path}")

    # One user message holding the instruction text plus the image as a data URL.
    user_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": "Pobierz szczegóły rachunku za gaz",
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": prepare_image_for_open_ai(image_path),
                    "detail": "high",
                },
            },
        ],
    }

    # With `response_model` set, the call returns a GasBillInfo instance.
    gas_bill = instructor_openai_client.chat.completions.create(
        model="gpt-4o-mini",
        response_model=GasBillInfo,
        messages=[user_message],
    )

    # Explicit encoding so the saved JSON does not depend on the platform default.
    output_path = PROCESSED_DATA_PATH / f"{image_path.stem}.json"
    output_path.write_text(gas_bill.model_dump_json(), encoding="utf-8")

In [None]:
# Collect the structured results into one table. Files whose name contains
# "simple" come from the plain-JSON extraction and are left out.
data = [
    json.loads(json_path.read_text())
    for json_path in PROCESSED_DATA_PATH.glob('*.json')
    if "simple" not in json_path.name
]

df = pd.DataFrame(data)
df

In [None]:
# Bar chart of "do_zaplaty" per "termin_platnosci", ordered by the latter.
(
    df
    .sort_values("termin_platnosci")
    .plot.bar(x="termin_platnosci", y="do_zaplaty")
)