# Importações

In [13]:
!pip install -q numpy==2.0.1
!pip install -q torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1

In [14]:
!pip install -q pytorch-lightning==2.5.0 lmdb fire hydra-core imgaug==0.4.0
!pip install -q pyclipper pyyaml rapidfuzz gdown einops timm

In [15]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pathlib
import PIL
import sys
import torch
import torchvision
import tqdm
import os

print("NumPy:", np.__version__)
print("Torch:", torch.__version__)

NumPy: 2.0.2
Torch: 2.4.1+cu121


# Obtendo dataset

### Clonando dataset

In [16]:
# Repository URL and local checkout directory of the dataset
dataset_url = "https://github.com/lucas-azdias/Artistic-Text-Recognition-Dataset.git"
dataset_path = pathlib.Path("/content/WordArt-V1.5")

# Delete any previous checkout first so the clone also works when the cell is re-run
!rm -rf "{dataset_path}"
!git clone "{dataset_url}" "{dataset_path}"

Cloning into '/content/WordArt-V1.5'...
remote: Enumerating objects: 12087, done.[K
remote: Counting objects: 100% (77/77), done.[K
remote: Compressing objects: 100% (77/77), done.[K
remote: Total 12087 (delta 24), reused 11 (delta 0), pack-reused 12010 (from 1)[K
Receiving objects: 100% (12087/12087), 1.29 GiB | 41.86 MiB/s, done.
Resolving deltas: 100% (24/24), done.
Updating files: 100% (12011/12011), done.


### Carregando dataset

In [17]:
def load_data(base_path: pathlib.Path, name: str) -> tuple[list[pathlib.Path], list[str]]:
    """Read the ``labels.txt`` index of one dataset split.

    Every non-empty line is expected to look like ``train_image\\320.png Ford``:
    a relative image path (Windows separators are accepted), a single space and
    then the label, which may itself contain spaces.

    Args:
        base_path: Root directory of the dataset.
        name: Sub-directory of the split (e.g. ``"train"``) holding ``labels.txt``.

    Returns:
        ``(image_paths, classes)``: two parallel lists with, for each entry,
        the image path located under ``base_path / name`` and its label.
    """
    split_dir = pathlib.Path(base_path, name)

    image_paths: list[pathlib.Path] = []
    classes: list[str] = []

    # Explicit encoding so parsing does not depend on the platform locale.
    with open(split_dir / "labels.txt", "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing newline at the end of the file)
                # would otherwise add an entry pointing at the split directory.
                continue

            # Only the first space separates path from label.
            rel_path, _, label = line.partition(" ")

            # Normalise Windows separators and anchor the path in the split directory.
            image_paths.append(split_dir / rel_path.replace("\\", "/"))
            classes.append(label)

    return image_paths, classes

# Parse the label index of the train and testA splits of the cloned dataset
train_images, train_labels = load_data(dataset_path, "train")
testA_images, testA_labels = load_data(dataset_path, "testA")

## Baixando o PARSeq para usar as tools

In [18]:
# URL do repositório
parseq_url = "https://github.com/baudm/parseq.git"
parseq_path = pathlib.Path("/content/parseq")

# Clonar o repositório
!rm -rf "{parseq_path}"
!git clone "{parseq_url}" "{parseq_path}"

if not str(parseq_url) in sys.path:
    sys.path.append(str(parseq_path.absolute()))

Cloning into '/content/parseq'...
remote: Enumerating objects: 612, done.[K
remote: Counting objects: 100% (311/311), done.[K
remote: Compressing objects: 100% (118/118), done.[K
remote: Total 612 (delta 243), reused 193 (delta 193), pack-reused 301 (from 2)[K
Receiving objects: 100% (612/612), 1.34 MiB | 19.07 MiB/s, done.
Resolving deltas: 100% (344/344), done.


# Resultados do modelo pré-treinado

In [19]:
# URL do repositório
svtrv2_url = "https://github.com/Topdu/OpenOCR.git"
svtrv2_path = pathlib.Path("/content/svtrv2")

# Clonar o repositório
!rm -rf "{svtrv2_path}"
!git clone "{svtrv2_url}" "{svtrv2_path}"

if not str(svtrv2_path.absolute()) in sys.path:
    sys.path.append(str(svtrv2_path.absolute()))

Cloning into '/content/svtrv2'...
remote: Enumerating objects: 2587, done.[K
remote: Counting objects: 100% (796/796), done.[K
remote: Compressing objects: 100% (111/111), done.[K
remote: Total 2587 (delta 711), reused 685 (delta 685), pack-reused 1791 (from 1)[K
Receiving objects: 100% (2587/2587), 11.46 MiB | 40.46 MiB/s, done.
Resolving deltas: 100% (1685/1685), done.


In [20]:
!pip install -q pyclipper pyyaml rapidfuzz imgaug

In [21]:
# Download the Google Drive folder into svtrv2_model
# (presumably the pretrained SVTRv2 weights — TODO confirm the folder contents)
!gdown -q --folder 11u11ptDzQ4BF9RRsOYdZnXl6ell2h4jN -O "{svtrv2_path}/svtrv2_model"
# Recreate the output directory from scratch and copy the downloaded files into it;
# the inference and training cells read best.pth from this directory
!rm -rf "{svtrv2_path}/output/rec/u14m_filter/svtrv2_smtr_gtc_rctc"
!mkdir -p "{svtrv2_path}/output/rec/u14m_filter/svtrv2_smtr_gtc_rctc"
!cp -r "{svtrv2_path}/svtrv2_model/." "{svtrv2_path}/output/rec/u14m_filter/svtrv2_smtr_gtc_rctc"

In [22]:
# OpenOCR post-processing modules that are patched in place
target_files = [
    svtrv2_path / "openrec" / "postprocess" / "ctc_postprocess.py",
    svtrv2_path / "openrec" / "postprocess" / "smtr_postprocess.py",
]

# Statement that reports the mean confidence, and its replacement that keeps
# the whole confidence list instead
mean_conf_stmt = "result_list.append((text, np.mean(conf_list).tolist()))"
full_conf_stmt = "result_list.append((text, conf_list))"

for target_file in target_files:
    text = target_file.read_text(encoding="utf-8")

    if mean_conf_stmt in text:
        # Rewrite the file only when the patch actually changes something
        new_text = text.replace(mean_conf_stmt, full_conf_stmt)
        target_file.write_text(new_text, encoding="utf-8")
    elif full_conf_stmt not in text:
        # Neither the original nor the patched statement is present: the
        # upstream code changed, so report it instead of silently doing nothing
        print(f"WARNING: statement to patch not found in {target_file}")

In [25]:
# Directory with the testA images to run recognition on
infer_imgs = f"{dataset_path}/testA/images"

# Run OpenOCR's tools/infer_rec.py on those images, loading best.pth from the svtrv2_smtr_gtc_rctc output directory
!cd "{svtrv2_path}" && python "{svtrv2_path}/tools/infer_rec.py" --c "{svtrv2_path}/configs/rec/svtrv2/svtrv2_smtr_gtc_rctc_infer.yml" --o Global.infer_img="{infer_imgs}" Global.output_dir="./output/rec/u14m_filter/svtrv2_smtr_gtc_rctc" Global.pretrained_model="./output/rec/u14m_filter/svtrv2_smtr_gtc_rctc/best.pth" Global.checkpoints="null"

  checkpoint = torch.load(pretrained_model, map_location=torch.device("cpu"))
[2025/11/09 22:29:51] openrec INFO: finetune from checkpoint ./output/rec/u14m_filter/svtrv2_smtr_gtc_rctc/best.pth
  padded_batch[i, :, :h, :w] = img
[2025/11/09 22:29:51] openrec INFO: 0 /content/WordArt-V1.5/testA/images/new0.png	 result: RANCD	[0.99460405 0.9887272  0.97649175 0.5115175  0.8936286 ], time cost: 0.3056752681732178
[2025/11/09 22:29:51] openrec INFO: 1 /content/WordArt-V1.5/testA/images/new1.png	 result: GORiLLaZ	[0.9969067  0.8789303  0.9930025  0.69282544 0.9552821  0.9633114
 0.6523409  0.9159382 ], time cost: 0.016592025756835938
[2025/11/09 22:29:51] openrec INFO: 2 /content/WordArt-V1.5/testA/images/new10.png	 result: Republic	[0.99312407 0.9993826  0.9990816  0.99881315 0.997638   0.9975267
 0.99671173 0.9977538 ], time cost: 0.027545690536499023
[2025/11/09 22:29:51] openrec INFO: 3 /content/WordArt-V1.5/testA/images/new100.png	 result: Because	[0.9753176  0.9855516  0.9939866  0.97

In [26]:
# Load the raw recognition results written by the inference run
with open(f"{svtrv2_path}/rec_results/rec_results.txt") as file:
    # A record can span several lines (long confidence arrays are wrapped), so
    # records are split only at a newline followed by "/"; this assumes every
    # record starts with an absolute image path
    results = file.read().strip().replace("\n/", "\n//").split("\n/")

# Each record has three tab-separated fields: image path, prediction, confidence array
results = [line.strip().split("\t") for line in results]
results = [(path, pred, [float(c) for c in confidence.replace("\n", "").replace("[", "").replace("]", "").strip().split()]) for path, pred, confidence in results]

# Index the predictions by file stem once, instead of rescanning `results` for
# every labelled image; setdefault keeps the first prediction seen for a stem
pred_by_stem = {}
for path, pred, _ in results:
    pred_by_stem.setdefault(pathlib.Path(path).stem, pred)

# A labelled image without any prediction yields None and never counts as correct
correct = sum(
    pred_by_stem.get(pathlib.Path(img_path).stem) == label
    for img_path, label in zip(testA_images, testA_labels)
)

print(f"{correct}/{len(results)} ({correct / len(results) * 100:.2f}%)")

2391/3000 (79.70%)


# Treinamento

In [27]:
# Build an LMDB dataset for the train split with PARSeq's create_lmdb_dataset.py
# (the arguments appear to be: image root, labels file, output directory)
!python /content/parseq/tools/create_lmdb_dataset.py \
    "/content/WordArt-V1.5/train" \
    "/content/WordArt-V1.5/train/labels.txt" \
    "/content/WordArt-V1.5/train/lmdb_dataset" \
    --checkValid=True

Written 1000 / 6000
Written 2000 / 6000
Written 3000 / 6000
Written 4000 / 6000
Written 5000 / 6000
Written 6000 / 6000
Created dataset with 6000 samples


In [28]:
# Build an LMDB dataset for the testA split with PARSeq's create_lmdb_dataset.py
# (the arguments appear to be: image root, labels file, output directory)
!python /content/parseq/tools/create_lmdb_dataset.py \
    "/content/WordArt-V1.5/testA" \
    "/content/WordArt-V1.5/testA/labels.txt" \
    "/content/WordArt-V1.5/testA/lmdb_dataset" \
    --checkValid=True

Written 1000 / 3000
Written 2000 / 3000
Written 3000 / 3000
Created dataset with 3000 samples


In [29]:
# NOTE(review): this downgrades NumPy below 2.0, overriding the numpy==2.0.1 pin installed at the top of the notebook.
# Presumably required by the training script run afterwards — TODO confirm and pin an exact version.
!pip install -q "numpy<2.0"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m98.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jax 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
opencv-python 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.26.4 which is incompatible.
opencv-python-headless 4.12.0.88 requires numpy<2.3.0,>=2; python_version >= "3.9", but you have numpy 1.26.4 which is incompatible.
jaxlib 0.7.2 requires numpy>=2.0, but you have numpy 1.26.4 which is incompatible.
pytensor 2.35.1 requires numpy>=2.0, but you have numpy 1.26.4 w

In [30]:
# Full path of the training config file inside the OpenOCR checkout
config_path = f"{svtrv2_path}/configs/rec/svtrv2/svtrv2_smtr_gtc_rctc.yml"

# conteúdo novo do arquivo
new_content = """Global:
  device: gpu
  epoch_num: 50
  log_smooth_window: 20
  print_batch_step: 10
  output_dir: /content/svtrv2/output/rec/u14m_filter/svtrv2_smtr_gtc_rctc
  save_epoch_step: [10, 1]
  eval_batch_step: [0, 500]
  eval_epoch_step: [0, 1]
  cal_metric_during_train: True
  pretrained_model: /content/svtrv2/output/rec/u14m_filter/svtrv2_smtr_gtc_rctc/best.pth
  checkpoints: null
  use_tensorboard: false
  infer_img: /content/WordArt-V1.5/testA/images
  character_dict_path: &character_dict_path /content/svtrv2/tools/utils/EN_symbol_dict.txt
  max_text_length: &max_text_length 25
  use_space_char: &use_space_char False
  save_res_path: /content/svtrv2/output/rec/u14m_filter/predicts_svtrv2_smtr_gtc_rctc.txt
  use_amp: True
  early_stop:
    patience: 10
    metric: acc
    mode: max

Optimizer:
  name: AdamW
  lr: 0.00005
  weight_decay: 0.1
  filter_bias_and_bn: True

LRScheduler:
  name: CosineAnnealingLR
  T_max: 50
  eta_min: 0.00001
  warmup_epoch: 2
  warmup_start_lr: 0.000005
  cycle_momentum: False

Architecture:
  model_type: rec
  algorithm: SVTRv2
  in_channels: 3
  Transform:
  Encoder:
    name: SVTRv2LNConvTwo33
    use_pos_embed: False
    dims: [128, 256, 384]
    depths: [6, 6, 6]
    num_heads: [4, 8, 12]
    mixer: [['Conv', 'Conv', 'Conv', 'Conv', 'Conv', 'Conv'],
            ['Conv', 'Conv', 'FGlobal', 'Global', 'Global', 'Global'],
            ['Global', 'Global', 'Global', 'Global', 'Global', 'Global']]
    local_k: [[5, 5], [5, 5], [-1, -1]]
    sub_k: [[1, 1], [2, 1], [-1, -1]]
    last_stage: false
    feat2d: True
    dropout: 0.4
  Decoder:
    name: GTCDecoder
    infer_gtc: True
    detach: False
    dropout: 0.4
    gtc_decoder:
      name: SMTRDecoder
      num_layer: 1
      ds: True
      max_len: *max_text_length
      next_mode: &next True
      sub_str_len: &subsl 5
    ctc_decoder:
      name: RCTCDecoder

Loss:
  name: GTCLoss
  ctc_weight: 0.25
  gtc_loss:
    name: SMTRLoss

PostProcess:
  name: GTCLabelDecode
  gtc_label_decode:
    name: SMTRLabelDecode
    next_mode: *next
  character_dict_path: *character_dict_path
  use_space_char: *use_space_char

Metric:
  name: RecGTCMetric
  main_indicator: acc
  is_filter: True

Train:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: false
    data_dir_list:
      - /content/WordArt-V1.5/train/lmdb_dataset
    transforms:
      - DecodeImagePIL:
          img_mode: RGB
      - PARSeqAugPIL:
          degrees: 10
          translate: 0.15
          scale: [0.85, 1.15]
          shear: 10
          blur: True
          brightness: 0.3
          contrast: 0.3
          saturation: 0.3
          noise: True
      - GTCLabelEncode:
          gtc_label_encode:
            name: SMTRLabelEncode
            sub_str_len: *subsl
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'label_subs', 'label_next',
                      'length_subs', 'label_subs_pre', 'label_next_pre',
                      'length_subs_pre', 'length', 'ctc_label', 'ctc_length']

  sampler:
    name: RatioSampler
    scales: [[128, 32]]
    first_bs: &bs 128
    fix_bs: false
    divided_factor: [4, 16]
    is_training: True

  loader:
    shuffle: True
    batch_size_per_card: *bs
    drop_last: True
    max_ratio: &max_ratio 12
    num_workers: 4

Eval:
  dataset:
    name: RatioDataSetTVResize
    ds_width: True
    padding: False
    data_dir_list:
      - /content/WordArt-V1.5/testA/lmdb_dataset
    transforms:
      - DecodeImagePIL:
          img_mode: RGB
      - GTCLabelEncode:
          gtc_label_encode:
            name: ARLabelEncode
          character_dict_path: *character_dict_path
          use_space_char: *use_space_char
          max_text_length: *max_text_length
      - KeepKeys:
          keep_keys: ['image', 'label', 'length', 'ctc_label', 'ctc_length']

  sampler:
    name: RatioSampler
    scales: [[128, 32]]
    first_bs: *bs
    fix_bs: false
    divided_factor: [4, 16]
    is_training: False

  loader:
    shuffle: False
    drop_last: False
    batch_size_per_card: *bs
    max_ratio: *max_ratio
    num_workers: 4
"""

# Make sure the target directory exists, then overwrite the config file
config_file = pathlib.Path(config_path)
config_file.parent.mkdir(parents=True, exist_ok=True)
config_file.write_text(new_content)

print(f"Arquivo de configuração salvo em: {config_path}")

Arquivo de configuração salvo em: /content/svtrv2/configs/rec/svtrv2/svtrv2_smtr_gtc_rctc.yml


In [31]:
# Fine-tune with OpenOCR's tools/train_rec.py using the svtrv2_smtr_gtc_rctc.yml config, starting from best.pth
!cd "{svtrv2_path}" && python tools/train_rec.py \
  --config "{svtrv2_path}/configs/rec/svtrv2/svtrv2_smtr_gtc_rctc.yml" \
  --o Global.pretrained_model="{svtrv2_path}/output/rec/u14m_filter/svtrv2_smtr_gtc_rctc/best.pth" \
  Global.checkpoints="null"

[2025/11/09 22:35:04] openrec INFO: ----------- Config -----------
[2025/11/09 22:35:04] openrec INFO: Architecture : 
[2025/11/09 22:35:04] openrec INFO:     Decoder : 
[2025/11/09 22:35:04] openrec INFO:         ctc_decoder : 
[2025/11/09 22:35:04] openrec INFO:             name : RCTCDecoder
[2025/11/09 22:35:04] openrec INFO:         detach : False
[2025/11/09 22:35:04] openrec INFO:         dropout : 0.4
[2025/11/09 22:35:04] openrec INFO:         gtc_decoder : 
[2025/11/09 22:35:04] openrec INFO:             ds : True
[2025/11/09 22:35:04] openrec INFO:             max_len : 25
[2025/11/09 22:35:04] openrec INFO:             name : SMTRDecoder
[2025/11/09 22:35:04] openrec INFO:             next_mode : True
[2025/11/09 22:35:04] openrec INFO:             num_layer : 1
[2025/11/09 22:35:04] openrec INFO:             sub_str_len : 5
[2025/11/09 22:35:04] openrec INFO:         infer_gtc : True
[2025/11/09 22:35:04] openrec INFO:         name : GTCDecoder
[2025/11/09 22:35:04] openre

# Resultados com fine-tuning

In [36]:
# Directory with the testA images to run recognition on
infer_imgs = f"{dataset_path}/testA/images"

# Run OpenOCR's tools/infer_rec.py again, loading best.pth from the directory the training config uses as output_dir
# (assumes training saved a new best.pth there — TODO confirm)
!cd "{svtrv2_path}" && python "{svtrv2_path}/tools/infer_rec.py" --c "{svtrv2_path}/configs/rec/svtrv2/svtrv2_smtr_gtc_rctc_infer.yml" --o Global.infer_img="{infer_imgs}" Global.pretrained_model="/content/svtrv2/output/rec/u14m_filter/svtrv2_smtr_gtc_rctc/best.pth" Global.checkpoints="null"

  checkpoint = torch.load(pretrained_model, map_location=torch.device("cpu"))
[2025/11/09 23:53:29] openrec INFO: finetune from checkpoint /content/svtrv2/output/rec/u14m_filter/svtrv2_smtr_gtc_rctc/best.pth
[2025/11/09 23:53:29] openrec INFO: 0 /content/WordArt-V1.5/testA/images/new0.png	 result: RANCD	[0.9910755  0.99320084 0.9675328  0.5495025  0.92298436], time cost: 0.3156449794769287
[2025/11/09 23:53:29] openrec INFO: 1 /content/WordArt-V1.5/testA/images/new1.png	 result: GORiLLaZ	[0.99731004 0.9315637  0.9930587  0.6887319  0.9642558  0.9707897
 0.60621655 0.8841508 ], time cost: 0.01439046859741211
[2025/11/09 23:53:29] openrec INFO: 2 /content/WordArt-V1.5/testA/images/new10.png	 result: Republic	[0.9882112  0.9997155  0.9990069  0.99892443 0.995194   0.99764365
 0.9981664  0.99722266], time cost: 0.027187824249267578
[2025/11/09 23:53:29] openrec INFO: 3 /content/WordArt-V1.5/testA/images/new100.png	 result: Because	[0.9799702  0.9969796  0.99612516 0.99330515 0.99316436 0.9

In [37]:
# Load the raw recognition results written by the inference run
with open(f"{svtrv2_path}/rec_results/rec_results.txt") as file:
    # A record can span several lines (long confidence arrays are wrapped), so
    # records are split only at a newline followed by "/"; this assumes every
    # record starts with an absolute image path
    results = file.read().strip().replace("\n/", "\n//").split("\n/")

# Each record has three tab-separated fields: image path, prediction, confidence array
results = [line.strip().split("\t") for line in results]
results = [(path, pred, [float(c) for c in confidence.replace("\n", "").replace("[", "").replace("]", "").strip().split()]) for path, pred, confidence in results]

# Index the predictions by file stem once, instead of rescanning `results` for
# every labelled image; setdefault keeps the first prediction seen for a stem
pred_by_stem = {}
for path, pred, _ in results:
    pred_by_stem.setdefault(pathlib.Path(path).stem, pred)

# A labelled image without any prediction yields None and never counts as correct
correct = sum(
    pred_by_stem.get(pathlib.Path(img_path).stem) == label
    for img_path, label in zip(testA_images, testA_labels)
)

print(f"{correct}/{len(results)} ({correct / len(results) * 100:.2f}%)")

2413/3000 (80.43%)
