In [None]:
import base64
import json
import mimetypes
from typing import List

from IPython.display import Markdown
from openai import OpenAI
from pydantic import BaseModel, ConfigDict, Field

class Transcript(BaseModel):
    """A model representing a transcribed text, including comments and corrections."""

    # Pydantic v2 style configuration. The nested ``class Config`` form is
    # deprecated in v2, and ``description`` is not a config option there, so
    # the schema description is now supplied by the class docstring above.
    model_config = ConfigDict(title="Transcript Model")

    # The transcribed text itself, rendered as markdown.
    text: str = Field(..., description="Markdown version of the text.")
    # Free-form remarks about the transcription.
    comments: str = Field(..., description="Comments about the transcription.")
    # One entry per correction applied to the text.
    corrections: List[str] = Field(
        ..., description="List of corrections made to the text."
    )


# Prompt describing the transcription task. process_image_to_text sends this
# string verbatim as the text part of the user message, alongside the image.
# It is runtime text: any edit here changes what the model is asked to do.
instructions = """
Your role is to take images of text from books, convert them into markdown format enclosed within a markdown code block, and list any corrections made during the process.
Include all relevant text.
Line breaks within paragraphs should be replaced with spaces, combining words split by hyphens. 
Ensure that all tables are recreated accurately in markdown.
Include all photo captions.
Use '*' for italics, '#' and '##' for headers, avoid smart quotes, and use '---' with a space on each side for em dashes.
Combine texts if split across multiple columns or images, especially if it splits a word, sentence, or paragraph.
You should also correct any likely errors in the text, emphasizing accuracy in text recognition and markdown formatting, and being cautious about altering the original language of the text. 
If you encounter any words that are not clear due to the quality of the image, make a best guess and annotate it with a question mark in brackets [?]
When converting headlines in all caps, it should replace them with title case. 
Review your work carefully. This work will be widely reproduced so accuracy is incredibly important. 
You should communicate in a helpful and precise manner, effectively meeting the user's needs for text conversion, error correction, and providing a summary of any corrections made.
"""


def encode_image(image_path: str) -> str:
    """Read a file in binary mode and return its contents as base64 text.

    Args:
        image_path: Path of the file to read.

    Returns:
        The file's bytes, base64-encoded and decoded to a UTF-8 ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def process_image_to_text(image_path: str) -> dict:
    """Transcribe the text in an image to markdown via the OpenAI chat API.

    The image is base64-encoded and sent, together with the module-level
    ``instructions`` prompt, in a single user message to ``gpt-4o``. The model
    is required to reply through the ``image_to_markdown`` function, whose
    parameters are the JSON schema of ``Transcript``.

    Args:
        image_path: Path to the image file to transcribe.

    Returns:
        The JSON-decoded function-call arguments. These are expected to be a
        dict following the ``Transcript`` schema, but are not validated here.

    Raises:
        OSError: If the image file cannot be opened or read.
        ValueError: If the response carries no function call.
        json.JSONDecodeError: If the returned arguments are not valid JSON
            (for example when the reply is cut off by ``max_tokens``).
    """
    base64_image = encode_image(image_path)

    # Advertise the file's real media type in the data URL rather than always
    # claiming JPEG; fall back to the previous hard-coded value when the
    # extension is unknown or does not map to an image type.
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    client = OpenAI()

    response = client.chat.completions.create(
        model="gpt-4o",
        functions=[
            {
                "name": "image_to_markdown",
                "description": "Process image text to Markdown.",
                "parameters": Transcript.model_json_schema(),
            }
        ],
        # Force the function call so the reply is always structured arguments
        # instead of an optional free-text answer.
        function_call={"name": "image_to_markdown"},
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": instructions},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}",
                        },
                    },
                ],
            },
        ],
        max_tokens=3300,
    )

    function_call = response.choices[0].message.function_call
    if function_call is None:
        # Fail with a clear message instead of an AttributeError on ``None``.
        raise ValueError(
            "Model response did not include an 'image_to_markdown' function call."
        )
    return json.loads(function_call.arguments)


# Example usage: transcribe a sample page image. Executing these lines calls
# process_image_to_text, which sends a request to the OpenAI API.
image_path = "test_page_001.png"
result = process_image_to_text(image_path)
# Bare expression; presumably kept so a notebook cell displays the result.
result