In [5]:
from PIL import Image, ImageDraw
import base64
from io import BytesIO

def convert_to_base64(pil_image: "Image.Image", image_format: str = "PNG") -> str:
    """Encode an image as a base64 text string.

    Args:
        pil_image: Image object to encode. Only its ``save(fp, format=...)``
            method is used. The annotation is quoted so it is not evaluated
            at definition time.
        image_format: Format name forwarded to ``pil_image.save``. Defaults
            to ``"PNG"``, which is what this function always used before the
            parameter existed.

    Returns:
        The encoded image bytes as a base64 string (no ``data:`` URL prefix).
    """
    # Serialise into memory rather than to disk; the buffer is released as
    # soon as its contents have been base64-encoded.
    with BytesIO() as buffered:
        pil_image.save(buffered, format=image_format)
        return base64.b64encode(buffered.getvalue()).decode("utf-8")

def load_image(image_path: str) -> str:
    """Read an image file and return its contents as a base64 string.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The base64 string produced by ``convert_to_base64`` for that image.
    """
    # Image.open() keeps the underlying file open; the context manager makes
    # sure the handle is closed once the image has been encoded.
    with Image.open(image_path) as pil_image:
        image_b64 = convert_to_base64(pil_image)
    print("Loaded image successfully!")
    return image_b64

In [6]:
# Screenshot to analyse.
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere without editing it.
image_path = "/Users/kaewsai/Downloads/seller-center-home-en-30.jpeg"

# Instruction sent to the vision models in the cells below. Built from two
# adjacent literals joined by a newline.
prompt = (
    'Detect the location of "Create account/Login" text in this image and provide its normalized bounding box coordinates.\n'
    "No preamble or explanation is needed, just the coordinate."
)

# Base64 payload for the model calls, plus a PIL handle that the later
# cropping cell reads.
image_b64 = load_image(image_path)
pil_image = Image.open(image_path)

Loaded image successfully!


In [14]:
from langchain_community.llms import Ollama

# Query the "llava:13b" model through the Ollama wrapper.
llava = Ollama(model="llava:13b")
# The text prompt is sent together with the screenshot, supplied as a base64
# string in the `images` list.
resp = llava.invoke(prompt, images=[image_b64])
print(resp)

 [0.376,0.492,0.518,0.696]


In [16]:
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage

llm = ChatOpenAI(model="gpt-4o", temperature=0.2)

# Send one human message with two content parts: the text prompt and the
# screenshot embedded as a data URL.
resp = llm.invoke(
    [
        HumanMessage(
            content=[
                {"type": "text", "text": prompt},
                # image_b64 is PNG-encoded (convert_to_base64 saves with
                # format="PNG"), so the data URL must declare image/png
                # rather than image/jpeg.
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
            ]
        )
    ]
)
print(resp.content)

0.55, 0.23, 0.67, 0.47


In [20]:
# Cut a fixed pixel region out of the screenshot; the box values are
# hard-coded as [0, 0, 300, 30] (Pillow's box order is left, upper, right, lower).
crop = pil_image.crop([0,0,300,30])
# Display the cropped region via PIL's show().
crop.show()

In [21]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image

# Load the TrOCR processor and encoder-decoder model from the
# 'microsoft/trocr-large-printed' checkpoint.
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-large-printed')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-large-printed')
# Run OCR on `crop` (created in the cropping cell): convert it to RGB and
# have the processor return PyTorch tensors ("pt").
pixel_values = processor(images=crop.convert("RGB"), return_tensors="pt").pixel_values

# Generate token ids from the pixel tensor, then decode them to text with
# special tokens stripped.
generated_ids = model.generate(pixel_values)
generated_ocr = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(generated_ocr)


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-large-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['NOCNOC']


In [17]:
# Normalized box in (x0, y0, x1, y1) order; the values match the GPT-4o
# reply printed earlier in the notebook and are entered here by hand.
predictions = [0.55, 0.23, 0.67, 0.47]

x0, y0, x1, y1 = predictions

with Image.open(image_path) as im:
    width, height = im.size
    draw = ImageDraw.Draw(im)
    # Scale the normalized corners to pixel coordinates and outline the box.
    draw.rectangle([x0 * width, y0 * height, x1 * width, y1 * height], outline='Red')
    # Alternative box in raw pixel coordinates, kept disabled for comparison:
    # draw.rectangle(((1150, 30),(1350, 90)), outline='Red')
    im.show()