#### Installing libraries

In [None]:
!pip install opencv-python
!pip install numpy
!pip install pydantic
!pip install langgraph
!pip install langchain-community
!pip install langchain-anthropic
!pip install langgraph-checkpoint-sqlite
!pip install langchain[openai]

#### Importing libraries

In [None]:
import getpass
import os
import cv2 as cv
import numpy as np
import base64

from pydantic import BaseModel, Field
from langchain_core.messages import HumanMessage
from langchain_core.messages import SystemMessage
from langchain_openai import ChatOpenAI

#### Setting up environment variables

In [None]:
# OpenAI
# Ask for the key only when it is not configured yet, so re-running the cell
# (or launching the notebook with the variable already exported) does not
# prompt again or overwrite the existing value.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI:")

#### Loading and adjusting the image dimensions

In [None]:
image = cv.imread("example.jpg")

# cv.imread does not raise on a missing/unreadable file, it returns None;
# fail here with a clear message instead of an AttributeError on `.shape`.
if image is None:
    raise FileNotFoundError("Could not read image 'example.jpg'")

height, width, _ = image.shape

In [None]:
# Round each dimension UP to the nearest multiple of 32.
# A dimension that already is a multiple of 32 is kept as is (the previous
# formula always added one extra step of 32 in that case).
size_multiple = 32
new_height = ((height + size_multiple - 1) // size_multiple) * size_multiple
new_width = ((width + size_multiple - 1) // size_multiple) * size_multiple

# cv.resize takes the target size as (width, height)
image_resized = cv.resize(image, (new_width, new_height))

#### Detecting text

In [None]:
# The image was loaded with cv.imread, i.e. in BGR channel order, so cv.mean
# returns the per-channel means as (B, G, R, <unused 4th channel>).
mean_b, mean_g, mean_r, _ = cv.mean(image_resized)

# The detector is configured with swapRB=True, in which case the mean is
# expected in (R, G, B) order.
mean = (mean_r, mean_g, mean_b)

# Thresholds handed to the DB text detector (binary map / polygon score)
bin_threshold = 0.3
poly_threshold = 0.5

In [None]:
# Build the DB text detector from the ONNX weights and apply the thresholds
textDetectorDB50 = cv.dnn_TextDetectionModel_DB("DB_TD500_resnet50.onnx")
textDetectorDB50.setBinaryThreshold(bin_threshold)
textDetectorDB50.setPolygonThreshold(poly_threshold)

# Pre-processing: scale pixels by 1/255, feed the resized frame size,
# subtract `mean`, and swap the R and B channels (last argument)
input_scale = 1.0/255
input_size = (new_width, new_height)
textDetectorDB50.setInputParams(input_scale, input_size, mean, True)

# One 4-point polygon and one confidence value per detected text region
detection_result = textDetectorDB50.detect(image_resized)
boxes, confidences = detection_result

#### Preparing mask

In [None]:
# Single-channel 8-bit mask with the same height/width as the resized image,
# initially all zeros
mask_rows, mask_cols = image_resized.shape[:2]
inpaint_mask_db50 = np.zeros((mask_rows, mask_cols), dtype=np.uint8)

#### Merging close boxes

In [None]:
def boxes_distance(A_min, A_max, B_min, B_max):
    """Return the Euclidean gap between two axis-aligned boxes.

    Each box is described by its minimum corner and its maximum corner.
    Along every axis the gap is the positive separation between the boxes
    (0 when they overlap on that axis); the result is the norm of these gaps.

    Args:
        A_min, A_max: min and max corner of the first box (any array-like
            of coordinates: list, tuple or numpy array).
        B_min, B_max: min and max corner of the second box, same layout.

    Returns:
        The distance as a numpy float; 0.0 when the boxes touch or overlap.
    """
    a_min = np.asarray(A_min, dtype=np.float64)
    a_max = np.asarray(A_max, dtype=np.float64)
    b_min = np.asarray(B_min, dtype=np.float64)
    b_max = np.asarray(B_max, dtype=np.float64)

    # A positive component means the boxes are separated along that axis in
    # that direction; negative components (overlap) are clamped to zero.
    gap_a_past_b = np.maximum(a_min - b_max, 0.0)
    gap_b_past_a = np.maximum(b_min - a_max, 0.0)

    return np.linalg.norm(np.concatenate([gap_a_past_b, gap_b_past_a]))

In [None]:
def merge_boxes(A_box, B_box):
    """Combine two 4-point boxes into one box covering both.

    Corner k of the result is built from corner k of both inputs, taking the
    smaller or larger x / y as listed below. With image coordinates
    (y growing downwards) the corners are:
        0: bottom-left  (min x, max y)
        1: top-left     (min x, min y)
        2: top-right    (max x, min y)
        3: bottom-right (max x, max y)

    Args:
        A_box, B_box: sequences of at least four (x, y) points in the
            corner order above.

    Returns:
        A numpy array holding the four merged corner points.
    """
    # (corner index, how to pick x, how to pick y)
    corner_rules = (
        (0, min, max),
        (1, min, min),
        (2, max, min),
        (3, max, max),
    )

    merged_corners = [
        [pick_x(A_box[k][0], B_box[k][0]), pick_y(A_box[k][1], B_box[k][1])]
        for k, pick_x, pick_y in corner_rules
    ]

    return np.array(merged_corners)

In [None]:
# Boxes closer to each other than this many pixels are merged into one
min_distance = 10
boxes_refined = list(boxes)

# Sweep over all pairs until a complete sweep performs no merge.
# Repeating the sweep matters: a box that grew through a merge can end up
# close to a box that was compared (and rejected) before it grew.
# Every merge removes one element, so the loop always terminates.
merged = True
while merged:
    merged = False

    i = 0
    while i < len(boxes_refined):
        j = i + 1

        while j < len(boxes_refined):
            # Corners 1 and 3 are used as the min / max corner of each box.
            # Box i is re-read on every step because it may just have grown.
            A_min = boxes_refined[i][1]
            A_max = boxes_refined[i][3]
            B_min = boxes_refined[j][1]
            B_max = boxes_refined[j][3]

            distance = boxes_distance(A_min, A_max, B_min, B_max)

            if distance < min_distance:
                boxes_refined[i] = merge_boxes(boxes_refined[i], boxes_refined[j])
                # After pop(j) the next candidate slides into position j,
                # so j is intentionally not advanced here.
                boxes_refined.pop(j)
                merged = True
            else:
                j += 1

        i += 1

#### Highlighting boxes on the image

In [None]:
# Draw on a copy so the resized source image stays untouched
image_refined_boxes = image_resized.copy()

color = (255, 0, 0)
thickness = 2

for quad in boxes_refined:
    # Both drawing calls take a list of int32 point arrays
    polygon = [np.array(quad, np.int32)]

    # Mark the box area in the mask (filled with 255) ...
    cv.fillPoly(inpaint_mask_db50, polygon, 255)
    # ... and outline it on the preview image
    cv.polylines(image_refined_boxes, polygon, isClosed=True, color=color, thickness=thickness)

#### Assigning a number to each box

In [None]:
# Write the index of every box next to it, on a copy of the outlined image
image_numbered = image_refined_boxes.copy()

color = (0, 0, 255)
thickness = 2
font = cv.FONT_HERSHEY_SIMPLEX
font_scale = 0.5

# Shift of the label relative to corner 1 of the box
x_offset = -10
y_offset = -5

for i, box in enumerate(boxes_refined):
    label = str(i)
    (label_width, label_height), _baseline = cv.getTextSize(label, font, font_scale, thickness)

    anchor_x, anchor_y = box[1]

    # putText expects plain ints and treats the origin as the bottom-left
    # corner of the text. Clamp it so that labels of boxes close to the top
    # or left border are still drawn inside the image instead of being cut off.
    origin_x = max(int(anchor_x) + x_offset, 0)
    origin_y = max(int(anchor_y) + y_offset, label_height)

    cv.putText(image_numbered, label, (origin_x, origin_y), font, font_scale, color, thickness)

#### Creating the output structure

In [None]:
# Schema of the structured answer requested from the chat model: it is passed
# to `with_structured_output`, and the cropping step reads `boxes_ids` from
# the response.
# NOTE(review): documented with comments instead of a class docstring on
# purpose - a docstring would likely become part of the schema description
# sent to the model and could change its answers; confirm before adding one.
class boxes_list(BaseModel):
    # Numbers of the boxes the model selected; the numbers are the ones drawn
    # on the image, i.e. indices into `boxes_refined`.
    boxes_ids: list[int] = Field(description="The ids of text boxes containing the exercise descriptions")

#### Creating an agent

In [None]:
# Chat model with temperature 0
model = ChatOpenAI(
    model="gpt-4.1",
    temperature=0,
)

# Wrapper around the model whose responses follow the `boxes_list` schema
structured_model = model.with_structured_output(boxes_list)

#### Calling the agent to choose boxes

In [None]:
# cv.imencode reports failure through its first return value instead of
# raising, so check it before using the buffer
encoded_ok, buffer = cv.imencode(".jpg", image_numbered)
if not encoded_ok:
    raise RuntimeError("Failed to encode the numbered image as JPEG")

# JPEG bytes -> base64 text that can be embedded in the chat message
image_bytes = buffer.tobytes()
image_base64 = base64.b64encode(image_bytes).decode("utf-8")

In [None]:
# Adjacent string literals are concatenated by Python. Unlike a backslash
# continuation inside one literal, this does not leak the indentation of the
# following source lines into the prompt text.
system_message = SystemMessage(
    content=(
        "You will be provided with an image of the page from the textbook for learning Luxembourgish. "
        "The text on the image will be highlighted by the boxes and assigned a number. "
        "Your task is to return the numbers of those boxes, which contain the exercise for the student. "
        "Each exercise follow a specific convention - they start from an exercise number, followed by description."
    )
)

In [None]:
# The user turn consists of a single content block: the numbered page image,
# embedded as base64-encoded JPEG data
image_block = {
    "type": "image",
    "source_type": "base64",
    "mime_type": "image/jpeg",
    "data": image_base64,
}

user_message = HumanMessage(content=[image_block])

In [None]:
# NOTE(review): `config` is defined here but is not passed to the invoke
# call below.
config = {"configurable": {"thread_id": "1"}}

# Instructions first, then the image; the result is a `boxes_list` instance
messages = [system_message, user_message]
response = structured_model.invoke(messages)

#### Cropping the image

In [None]:
# Cut the page into full-width horizontal strips: each strip starts slightly
# above the top of a selected box and ends at the top of the next selected
# box (the last strip runs to the bottom of the page).
cropped_images = []

# Extra margin (in pixels) kept above the top edge of each selected box
initial_offset = 10

# The ids come from the model, so do not trust them blindly:
#  - ids outside the list of boxes are dropped (a negative id would silently
#    index from the end, a too large one would raise IndexError),
#  - duplicates are removed,
#  - ids are ordered top-to-bottom, because each strip ends where the next
#    one begins.
valid_ids = {box_id for box_id in response.boxes_ids if 0 <= box_id < len(boxes_refined)}
boxes_ids = sorted(valid_ids, key=lambda box_id: (int(boxes_refined[box_id][1][1]), box_id))

for i, box_id in enumerate(boxes_ids):
    # Corner 1 of a box provides its top y coordinate; never go above row 0
    upper_bound = max(int(boxes_refined[box_id][1][1]) - initial_offset, 0)

    if i == len(boxes_ids) - 1:
        lower_bound = new_height
    else:
        lower_bound = max(int(boxes_refined[boxes_ids[i + 1]][1][1]), 0)

    # Skip degenerate strips that would contain no rows
    if lower_bound <= upper_bound:
        continue

    cropped_image = image_resized[upper_bound:lower_bound, 0:new_width]

    cropped_images.append(cropped_image)

#### Adjusting the cropped images

In [None]:
# Number of pixels trimmed from each side of every crop
left_offset = 15
right_offset = 40
top_offset = 0
bottom_offset = 10

# NOTE: the crops are replaced inside `cropped_images` itself, so running
# this cell again trims the already trimmed crops once more.
for idx, crop in enumerate(cropped_images):
    cropped_height, cropped_width, _ = crop.shape

    kept_rows = slice(0 + top_offset, cropped_height - bottom_offset)
    kept_cols = slice(0 + left_offset, cropped_width - right_offset)

    cropped_images[idx] = crop[kept_rows, kept_cols]

#### Saving the results

In [None]:
# cv.imwrite does not create missing directories and only returns False when
# it cannot write the file, so create the folder up front and check the
# return value.
output_dir = "cropped_images"
os.makedirs(output_dir, exist_ok=True)

for i in range(len(cropped_images)):
    output_path = f"{output_dir}/cropped_image_{i}.jpg"
    if not cv.imwrite(output_path, cropped_images[i]):
        raise IOError(f"Failed to write {output_path}")