In [1]:
import numpy as np
import cv2
import numpy as np
from PIL import Image
import pytesseract

In [2]:
import ollama
import base64
import os

# IMPORTANT: Replace with the IP address and port of your Ollama instance
OLLAMA_HOST = 'http://192.168.178.39:11434' 
# Example for a network device: 'http://192.168.1.100:11434'

# The vision model you have pulled in Ollama
MODEL = 'qwen2.5vl:7b'

# The custom prompt you want to ask
PROMPT = """You will receive a image of a book front view in a book shelf. 
    Extract the author and title of the book image. 
    Return the output as JSON. Output the JSON as {'author': .. , 'title': .., 'language': ..}
    Do not mistake the genre or publisher as the author. 
    If you cannot extrac the author from the image, output the author that has published the book that you know of.
    """


def ocr(
    ollama_host: str,
    model_name: str,
    prompt: str,
    image_path: str
):
    """
    Sends a prompt and an image to a local Ollama vision model and prints the response.

    Args:
        ollama_host (str): The URL of the Ollama server (e.g., 'http://192.168.1.100:11434').
        model_name (str): The name of the vision model to use (e.g., 'qwen2.5vl:7b').
        prompt (str): The text prompt to send with the image.
        image_path (str): The local path to the image file.
    """
    try:
        # 1. Initialize the Ollama client to connect to your home network instance
        client = ollama.Client(host=ollama_host)
        print(f"Successfully connected to Ollama at {ollama_host}")

        # 2. Check if the image file exists
        if not os.path.exists(image_path):
            print(f"Error: Image file not found at '{image_path}'")
            return

        # 3. Encode the image to base64
        with open(image_path, "rb") as image_file:
            encoded_image = base64.b64encode(image_file.read()).decode("utf-8")
        
        print(f"Image '{image_path}' encoded successfully.")

        # 4. Send the chat request with the image
        print("\nSending prompt to model... (this may take a moment)")
        response = client.chat(
            model=model_name,
            messages=[
                {
                    'role': 'user',
                    'content': prompt,
                    'images': [encoded_image]  # The image is passed here
                }
            ],
            stream=False # Use stream=True for a chatbot-like streaming response
        )

        # 5. Print the model's response
        print("\n--- Model Response ---")
        print(response['message']['content'])
        print("----------------------\n")

    except Exception as e:
        print(f"\nAn error occurred: {e}")
        print("Please check the following:")
        print("- Is the Ollama server running and accessible at the specified host?")
        print(f"- Have you pulled the model '{model_name}'? (ollama pull {model_name})")
        print("- Is the host URL correct? (e.g., 'http://<ip_address>:<port>')")

In [3]:
from ultralytics import YOLO
from ultralytics import SAM
from ultralytics import FastSAM
from ultralytics.data.annotator import auto_annotate# Load a pretrained YOLO11n model
sam_model = SAM("sam2.1_b.pt")

In [4]:
import easyocr
reader = easyocr.Reader(['de', 'en']) # this needs to run only once to load the model into memory

In [5]:
yolo_model = YOLO("yolov10x.pt")  # Replace with your YOLO model
book_class_index = 73  # Replace with the actual class index for "book"

In [21]:
image_path = "images/book2.jpeg"

In [24]:
img = np.array(Image.open(image_path))
gray_img = Image.fromarray(img).convert("RGB")
img = np.array(gray_img)

In [29]:
det_results = yolo_model(image_path)
book_boxes = []
for r in det_results:
    print(r)
    for box, cls in zip(r.boxes.xyxy, r.boxes.cls):
        if int(cls) == book_class_index:
            book_boxes.append(box.tolist())

print(f"found {len(book_boxes)} books")


image 1/1 /Users/michaelzemler/Documents/bookhawk/images/book2.jpeg: 480x640 19 books, 505.2ms
Speed: 7.7ms preprocess, 505.2ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 640)
ultralytics.engine.results.Results object with attributes:

boxes: ultralytics.engine.results.Boxes object
keypoints: None
masks: None
names: {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45:

In [9]:
for i, bbox in enumerate(book_boxes):
    # Run SAM segmentation on the bbox
    results = sam_model(img, bboxes=[bbox])
    result = results[0]
    
    result.show()

    original_image = Image.open(image_path).convert("RGB")

    masks = results[0].masks.data.cpu().numpy()
    mask_raw = masks[0]
    x,y,w,h = cv2.boundingRect(mask_raw.astype(np.uint8))

    cropped_mask = mask_raw[y:y+h, x:x+w]
    cropped_image = np.array(original_image)[y:y+h, x:x+w]

    segmented_image = np.zeros((h,w,4), dtype=np.uint8)
    segmented_image[:,:, :3] = cropped_image

    segmented_image[:,:,3] = cropped_mask * 255

    image_name_to_save = f"test_{i}.png"
    Image.fromarray(segmented_image).save(image_name_to_save)

    ocr(
        ollama_host=OLLAMA_HOST,
        model_name=MODEL,
        prompt=PROMPT,
        image_path= image_name_to_save
    )




0: 1024x1024 1 0, 1838.4ms
Speed: 24.7ms preprocess, 1838.4ms inference, 15.5ms postprocess per image at shape (1, 3, 1024, 1024)
Successfully connected to Ollama at http://192.168.178.39:11434
Image 'test_0.png' encoded successfully.

Sending prompt to model... (this may take a moment)

--- Model Response ---
```json
{
  "author": "Kyle Mills",
  "title": "Das Galdiano Experiment",
  "language": "German"
}
```
----------------------


0: 1024x1024 1 0, 2371.6ms
Speed: 14.7ms preprocess, 2371.6ms inference, 27.6ms postprocess per image at shape (1, 3, 1024, 1024)
Successfully connected to Ollama at http://192.168.178.39:11434
Image 'test_1.png' encoded successfully.

Sending prompt to model... (this may take a moment)

--- Model Response ---
```json
{
  "author": "Eric L. Harry",
  "title": "Invasion",
  "language": "English"
}
```
----------------------


0: 1024x1024 1 0, 2419.4ms
Speed: 17.5ms preprocess, 2419.4ms inference, 19.0ms postprocess per image at shape (1, 3, 1024, 1024)
S

Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File [35m"<string>"[0m, line [35m1[0m, in [35m<module>[0m
    import os, sys, time; [31mtime.sleep[0m[1;31m(20)[0m; os.remove(sys.argv[1])
                          [31m~~~~~~~~~~[0m[1;31m^^^^[0m
  File [35m"<string>"[0m, line [35m1[0m, in [35m<module>[0m
    import os, sys, time; [31mtime.sleep[0m[1;31m(20)[0m; os.remove(sys.argv[1])
                          [31m~~~~~~~~~~[0m[1;31m^^^^[0m
[1;35mKeyboardInterrupt[0m
  File [35m"<string>"[0m, line [35m1[0m, in [35m<module>[0m
    import os, sys, time; [31mtime.sleep[0m[1;31m(20)[0m; os.remove(sys.argv[1])
                          [31m~~~~~~~~~~[0m[1;31m^^^^[0m
[1;35mKeyboardInterrupt[0m
[1;35mKeyboardInterrupt[0m


KeyboardInterrupt: 