In [3]:
!unzip icdar2003.zip

Archive:  icdar2003.zip
   creating: SceneTrialTrain/apanar_06.08.2002/
  inflating: SceneTrialTrain/apanar_06.08.2002/IMG_1247.JPG  
  inflating: SceneTrialTrain/apanar_06.08.2002/IMG_1252.JPG  
  inflating: SceneTrialTrain/apanar_06.08.2002/IMG_1253.JPG  
  inflating: SceneTrialTrain/apanar_06.08.2002/IMG_1255.JPG  
  inflating: SceneTrialTrain/apanar_06.08.2002/IMG_1259.JPG  
  inflating: SceneTrialTrain/apanar_06.08.2002/IMG_1261.JPG  
  inflating: SceneTrialTrain/apanar_06.08.2002/IMG_1263.JPG  
  inflating: SceneTrialTrain/apanar_06.08.2002/IMG_1265.JPG  
  inflating: SceneTrialTrain/apanar_06.08.2002/IMG_1269.JPG  
  inflating: SceneTrialTrain/apanar_06.08.2002/IMG_1281.JPG  
  inflating: SceneTrialTrain/apanar_06.08.2002/IMG_1282.JPG  
  inflating: SceneTrialTrain/apanar_06.08.2002/IMG_1283.JPG  
  inflating: SceneTrialTrain/apanar_06.08.2002/IMG_1284.JPG  
  inflating: SceneTrialTrain/apanar_06.08.2002/IMG_1285.JPG  
  inflating: SceneTrialTrain/apanar_06.08.2002/IMG_1286.JPG 

In [4]:
import os
import random
import time
import xml.etree.ElementTree as ET
import shutil
import yaml

import cv2
import matplotlib.pyplot as plt
import numpy as np
import timm
import torch
import torch.nn as nn
import torchvision
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import ultralytics
from ultralytics import YOLO

  from .autonotebook import tqdm as notebook_tqdm


Creating new Ultralytics Settings v0.0.6 file ‚úÖ 
View Ultralytics Settings with 'yolo settings' or at '/home/nguyen-ngoc-dat/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [5]:
ultralytics.checks()

Ultralytics 8.3.235 üöÄ Python-3.10.19 torch-2.9.1+cu128 CUDA:0 (NVIDIA RTX 5000 Ada Generation, 32220MiB)
Setup complete ‚úÖ (32 CPUs, 125.6 GB RAM, 842.7/3753.4 GB disk)


In [6]:
def extract_data_from_xml(root_dir):
    xml_path = os.path.join(root_dir, "words.xml")
    tree = ET.parse(xml_path)
    root = tree.getroot()
    
    img_paths, img_sizes, img_labels, bboxes = [], [], [], []

    for img in root:
        bbs_of_img = []
        labels_of_img = []

        for bbs in img.findall("taggedRectangles"):
            for bb in bbs:
                if not bb[0].text.isalnum():
                    continue
                bbs_of_img.append([float(bb.attrib["x"]),
                                   float(bb.attrib["y"]),
                                   float(bb.attrib["width"]),
                                   float(bb.attrib["height"])])
                labels_of_img.append(bb[0].text.lower())
        
        img_path = os.path.join(root_dir, img[0].text)
        img_paths.append(img_path)
        img_sizes.append((int(img[1].attrib["x"]), int(img[1].attrib["y"])))
        bboxes.append(bbs_of_img)
        img_labels.append(labels_of_img)

    return img_paths, img_sizes, img_labels, bboxes

datset_dir = "SceneTrialTrain"
img_paths, img_sizes, img_labels, bboxes = extract_data_from_xml(datset_dir)

In [7]:
def convert_to_yolo_format(image_paths, image_sizes, bounding_boxes):
    yolo_data = []

    for image_path, image_size, bboxes in zip(image_paths, image_sizes, bounding_boxes):
        image_width, image_height = image_size
        yolo_labels = []

        for bbox in bboxes:
            x, y, width, height = bbox

            center_x = (x + width/2) / image_width
            center_y = (y + height/2) / image_height
            w = width / image_width
            h = height / image_height

            class_id = 0
            yolo_label = f"{class_id} {center_x} {center_y} {w} {h}"
            yolo_labels.append(yolo_label)
        yolo_data.append((image_path, yolo_labels))
    return yolo_data

class_label = ["text"]
yolo_data = convert_to_yolo_format(img_paths, img_sizes, bboxes)

In [8]:
def save_data(data, src_img_dir, save_dir):
    os.makedirs(save_dir, exist_ok=True)

    os.makedirs(os.path.join(save_dir, "images"), exist_ok=True)
    os.makedirs(os.path.join(save_dir, "labels"), exist_ok=True)

    for image_path, yolo_labels in data:
        shutil.copy(
            os.path.join(src_img_dir, image_path), os.path.join(save_dir, "images")
        )
        image_name = os.path.basename(image_path)
        image_name = os.path.splitext(image_name)[0]

        with open(os.path.join(save_dir, "labels", f"{image_name}.txt"), "w") as f:
            for label in yolo_labels:
                f.write(f"{label}\n")

In [9]:
val_size = 0.2
test_size = 0.125

train_data, test_data = train_test_split(
    yolo_data,
    test_size=val_size,
    shuffle=True
)

test_data, val_data = train_test_split(
    test_data,
    test_size=test_size,
    shuffle=True
)

In [12]:
len(train_data), len(test_data), len(val_data)

(200, 43, 7)

In [10]:
save_yolo_data_dir = "yolo_data"
os.makedirs(save_yolo_data_dir, exist_ok=True)

save_train_dir = os.path.join(save_yolo_data_dir, "train")
save_val_dir = os.path.join(save_yolo_data_dir, "val")
save_test_dir = os.path.join(save_yolo_data_dir, "test")

save_data(train_data, "", save_train_dir)
save_data(val_data, "", save_val_dir)
save_data(test_data, "", save_test_dir)

In [11]:
data_yaml = {
    "path": "yolo_data",
    "train": "train/images",
    "val": "val/images",
    "test": "test/images",
    "nc": 1,
    "names": class_label,
}

yolo_yaml_path = os.path.join(save_yolo_data_dir, "data.yaml")
with open(yolo_yaml_path, "w") as f:
    yaml.dump(data_yaml, f, default_flow_style=False)

In [13]:
model = YOLO("yolo11m.pt")

results = model.train(data="yolo_data/data.yaml", epochs=100, 
                      imgsz=640, cache=True,
                      patience=20,
                      plots=True, batch=32)

[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11m.pt to 'yolo11m.pt': 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 38.8MB 36.7MB/s 1.1s1.0s<0.1s
Ultralytics 8.3.235 üöÄ Python-3.10.19 torch-2.9.1+cu128 CUDA:0 (NVIDIA RTX 5000 Ada Generation, 32220MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=32, bgr=0.0, box=7.5, cache=True, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=yolo_data/data.yaml, degrees=0.0, deterministic=True, device=None, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=100, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, m

In [16]:
model_path = "runs/detect/train/weights/best.pt"
model = YOLO(model_path)
metrics = model.val()

Ultralytics 8.3.235 üöÄ Python-3.10.19 torch-2.9.1+cu128 CUDA:0 (NVIDIA RTX 5000 Ada Generation, 32220MiB)
YOLO11m summary (fused): 125 layers, 20,030,803 parameters, 0 gradients, 67.6 GFLOPs
[34m[1mval: [0mFast image access ‚úÖ (ping: 0.0¬±0.0 ms, read: 3247.8¬±1522.8 MB/s, size: 120.5 KB)
[K[34m[1mval: [0mScanning /home/nguyen-ngoc-dat/Desktop/scene_text_recognition/yolo_data/val/labels.cache... 7 images, 1 backgrounds, 0 corrupt: 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 7/7 15.1Kit/s 0.0ss
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 1/1 7.5it/s 0.1s
                   all          7         25      0.686      0.611       0.64      0.401
Speed: 0.4ms preprocess, 7.2ms inference, 0.0ms loss, 0.6ms postprocess per image
Results saved to [1m/home/nguyen-ngoc-dat/Desktop/scene_text_recognition/runs/detect/val[0m


In [18]:
metrics

ultralytics.utils.metrics.DetMetrics object with attributes:

ap_class_index: array([0])
box: ultralytics.utils.metrics.Metric object
confusion_matrix: <ultralytics.utils.metrics.ConfusionMatrix object at 0x7cc5dd446b60>
curves: ['Precision-Recall(B)', 'F1-Confidence(B)', 'Precision-Confidence(B)', 'Recall-Confidence(B)']
curves_results: [[array([          0,    0.001001,    0.002002,    0.003003,    0.004004,    0.005005,    0.006006,    0.007007,    0.008008,    0.009009,     0.01001,    0.011011,    0.012012,    0.013013,    0.014014,    0.015015,    0.016016,    0.017017,    0.018018,    0.019019,     0.02002,    0.021021,    0.022022,    0.023023,
          0.024024,    0.025025,    0.026026,    0.027027,    0.028028,    0.029029,     0.03003,    0.031031,    0.032032,    0.033033,    0.034034,    0.035035,    0.036036,    0.037037,    0.038038,    0.039039,     0.04004,    0.041041,    0.042042,    0.043043,    0.044044,    0.045045,    0.046046,    0.047047,
          0.048048, 