<a href="https://colab.research.google.com/github/lucas-azdias/Artistic-Text-Recognition/blob/main/artefatos/modelos/PARSeq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importações

In [None]:
!pip install -q numpy==2.0.1
!pip install -q torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1

In [None]:
!pip install -q pytorch-lightning==2.5.0 lmdb fire hydra-core imgaug==0.4.0
!pip install -q pyclipper pyyaml rapidfuzz gdown einops timm

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np
import pathlib
import PIL
import sys
import torch
import torchvision
import tqdm
import os

# Print the NumPy and PyTorch versions actually loaded in this kernel.
print("NumPy:", np.__version__)
print("Torch:", torch.__version__)

NumPy: 2.0.1
Torch: 2.4.1+cu121


## Obtendo dataset

### Clonando dataset

In [None]:
# Repository URL and local checkout location
dataset_url = "https://github.com/lucas-azdias/Artistic-Text-Recognition-Dataset.git"
dataset_path = pathlib.Path("/content/WordArt-V1.5")

# Clone the repository; any previous checkout is deleted first so the cell can be re-run
!rm -rf "{dataset_path}"
!git clone "{dataset_url}" "{dataset_path}"

Cloning into '/content/WordArt-V1.5'...
remote: Enumerating objects: 12063, done.[K
remote: Counting objects: 100% (53/53), done.[K
remote: Compressing objects: 100% (53/53), done.[K
remote: Total 12063 (delta 12), reused 14 (delta 0), pack-reused 12010 (from 1)[K
Receiving objects: 100% (12063/12063), 1.29 GiB | 17.34 MiB/s, done.
Resolving deltas: 100% (12/12), done.
Updating files: 100% (12005/12005), done.


### Carregando dataset

In [None]:
def load_data(base_path: pathlib.Path, name: str) -> tuple[list[pathlib.Path], list[str]]:
    """Read the image paths and labels of one dataset split.

    The index file is ``<base_path>/<name>/labels.txt``. Each non-blank line
    holds a relative image path, one space, and the label; everything after
    the first space is the label, so labels may contain spaces.

    Args:
        base_path: Root directory of the dataset.
        name: Name of the split sub-directory that contains ``labels.txt``.

    Returns:
        A pair ``(image_paths, labels)`` of equally long lists, in file order.
        Image paths are resolved against ``<base_path>/<name>``.
    """
    split_dir = pathlib.Path(base_path, name)

    image_paths: list[pathlib.Path] = []
    classes: list[str] = []

    # Explicit encoding so the result does not depend on the machine's locale.
    with open(split_dir / "labels.txt", "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank (e.g. trailing) lines used to yield a bogus sample made of
                # the split directory itself and an empty label.
                continue

            # Ex: "train_image\320.png Ford" -> backslashes become forward slashes
            rel_path, _, label = line.partition(" ")

            # Resolve the path against the split directory
            image_paths.append(split_dir / rel_path.replace("\\", "/"))
            classes.append(label)

    return image_paths, classes

# Load the image-path and label lists of every split; "train", "testA" and
# "testB" are the split sub-directories inside the dataset checkout.
train_images, train_labels = load_data(dataset_path, "train")
testA_images, testA_labels = load_data(dataset_path, "testA")
testB_images, testB_labels = load_data(dataset_path, "testB")

In [None]:
# URL do repositório
parseq_url = "https://github.com/baudm/parseq.git"
parseq_path = pathlib.Path("/content/parseq")

# Clonar o repositório
!rm -rf "{parseq_path}"
!git clone "{parseq_url}" "{parseq_path}"

if not str(parseq_path.absolute()) in sys.path:
    sys.path.append(str(parseq_path.absolute()))

Cloning into '/content/parseq'...
remote: Enumerating objects: 612, done.[K
remote: Counting objects: 100% (311/311), done.[K
remote: Compressing objects: 100% (118/118), done.[K
remote: Total 612 (delta 243), reused 193 (delta 193), pack-reused 301 (from 2)[K
Receiving objects: 100% (612/612), 1.34 MiB | 38.13 MiB/s, done.
Resolving deltas: 100% (344/344), done.


In [None]:
# Download the Miniconda installer, run it non-interactively into ./miniconda
# (installer output discarded) and delete the installer afterwards.
# NOTE(review): the install prefix is relative while the symlink below points to
# /content/miniconda, so this assumes the working directory is /content.
!wget -q -O Miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
!bash Miniconda.sh -b -f -p miniconda > /dev/null
!rm Miniconda.sh

# Replace /usr/local/bin/conda with a symlink to the freshly installed conda.
!sudo rm -f /usr/local/bin/conda
!ln -s /content/miniconda/bin/conda /usr/local/bin/conda

# Accept the Terms of Service of the pkgs/main and pkgs/r channels.
!conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main
!conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r

accepted Terms of Service for [4;94mhttps://repo.anaconda.com/pkgs/main[0m
accepted Terms of Service for [4;94mhttps://repo.anaconda.com/pkgs/r[0m


## Dataset

In [None]:
# Build the training LMDB dataset at /content/data/train/real from the "train"
# split and its labels.txt, using PARSeq's create_lmdb_dataset.py tool.
!python /content/parseq/tools/create_lmdb_dataset.py \
    "/content/WordArt-V1.5/train" \
    "/content/WordArt-V1.5/train/labels.txt" \
    "/content/data/train/real" \
    --checkValid=True

Written 1000 / 6000
Written 2000 / 6000
Written 3000 / 6000
Written 4000 / 6000
Written 5000 / 6000
Written 6000 / 6000
Created dataset with 6000 samples


In [None]:
# Build the validation LMDB dataset at /content/data/val/real; the "testA"
# split of the dataset is what gets written there.
!python /content/parseq/tools/create_lmdb_dataset.py \
    "/content/WordArt-V1.5/testA" \
    "/content/WordArt-V1.5/testA/labels.txt" \
    "/content/data/val/real" \
    --checkValid=True

Written 1000 / 3000
Written 2000 / 3000
Written 3000 / 3000
Created dataset with 3000 samples


In [None]:
# Build the test LMDB dataset at /content/data/test/real; the "testB" split of
# the dataset is what gets written there.
!python /content/parseq/tools/create_lmdb_dataset.py \
    "/content/WordArt-V1.5/testB" \
    "/content/WordArt-V1.5/testB/labels.txt" \
    "/content/data/test/real" \
    --checkValid=True

Written 1000 / 3000
Written 2000 / 3000
Written 3000 / 3000
Created dataset with 3000 samples


## Ambiente

In [None]:
# Recreate the "parseq" conda environment from scratch with Python 3.10
# (all output of both commands is discarded).
!conda env remove -n parseq -y -q > /dev/null 2>&1
!conda create -n parseq python=3.10 -y -q > /dev/null 2>&1

# Inside that environment: install pip-tools, let the PARSeq Makefile generate
# requirements/core.cu121.txt, then install those requirements together with
# PARSeq itself in editable mode with the bench, train, test and tune extras.
!conda run -n parseq pip install -q pip-tools
!conda run -n parseq make torch-cu121 -C "{parseq_path}"
!conda run -n parseq pip install -q -r "{parseq_path}/requirements/core.cu121.txt" -e {parseq_path}/.[bench,train,test,tune] --use-pep517

make: Entering directory '/content/parseq'
Generating requirements/core.cu121.txt
make: Leaving directory '/content/parseq'



## Pesos

In [None]:
# Download the PARSeq v1.0.0 release checkpoint and save it as ./parseq.pt.
# NOTE(review): the tuning command later refers to /content/parseq.pt, so this
# assumes the working directory is /content.
!wget -O parseq.pt https://github.com/baudm/parseq/releases/download/v1.0.0/parseq-bb5792a6.pt

--2025-10-29 19:54:25--  https://github.com/baudm/parseq/releases/download/v1.0.0/parseq-bb5792a6.pt
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://release-assets.githubusercontent.com/github-production-release-asset/431325804/4f08baf8-bcc6-4f36-ab42-316b87e77ab5?sp=r&sv=2018-11-09&sr=b&spr=https&se=2025-10-29T20%3A43%3A50Z&rscd=attachment%3B+filename%3Dparseq-bb5792a6.pt&rsct=application%2Foctet-stream&skoid=96c2d410-5711-43a1-aedd-ab1947aa7ab0&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skt=2025-10-29T19%3A42%3A59Z&ske=2025-10-29T20%3A43%3A50Z&sks=b&skv=2018-11-09&sig=rTUTW%2FqA9AXHx9jSaxT%2BKkipnSQQAzYfxNOXhXdnycc%3D&jwt=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJpc3MiOiJnaXRodWIuY29tIiwiYXVkIjoicmVsZWFzZS1hc3NldHMuZ2l0aHVidXNlcmNvbnRlbnQuY29tIiwia2V5Ijoia2V5MSIsImV4cCI6MTc2MTc2OTQ2NSwibmJmIjoxNzYxNzY3NjY1LCJwYXRoIjoicmVsZWFzZWFzc2V0cHJvZHVjdGl

# Fine Tuning


In [None]:
# Show the Hydra configuration groups and the default config that tune.py accepts.
!conda run -n parseq python "{parseq_path}/tune.py" --help

tune is powered by Hydra.

== Configuration groups ==
Compose your configuration from those groups (group=option)

charset: 36_lowercase, 62_mixed-case, 94_full
dataset: real, synth
experiment: abinet, abinet-sv, crnn, parseq, parseq-patch16-224, parseq-tiny, trba, trbc, tune_abinet-lm, vitstr
model: abinet, crnn, parseq, trba, vitstr


== Config ==
Override anything in the config (foo.bar=value)

model:
  _convert_: all
  img_size:
  - 32
  - 128
  max_label_length: 25
  charset_train: 0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
  charset_test: 0123456789abcdefghijklmnopqrstuvwxyz
  batch_size: 384
  weight_decay: 0.0
  warmup_pct: 0.075
  name: parseq
  _target_: strhub.models.parseq.system.PARSeq
  patch_size:
  - 4
  - 8
  embed_dim: 384
  enc_num_heads: 6
  enc_mlp_ratio: 4
  enc_depth: 12
  dec_num_heads: 12
  dec_mlp_ratio: 4
  dec_depth: 1
  lr: 0.0007
  perm_num: 6
  perm_forward: true
  perm_mirrored: true
  dropout: 0.1
  de

In [None]:
# Set HYDRA_FULL_ERROR=1 in the kernel's environment.
# NOTE(review): presumably so that Hydra reports complete stack traces in the runs below - confirm.
%set_env HYDRA_FULL_ERROR=1

env: HYDRA_FULL_ERROR=1


In [None]:
# Helper that rewrites snippets of the target files in place
def modify_target_files(target_files: list[pathlib.Path], old_texts: list[str], new_texts: list[str]) -> None:
    """Apply literal text replacements to every target file, in place.

    For each file, every ``old_texts[i]`` is replaced by ``new_texts[i]`` (all
    occurrences), in list order, so a later replacement sees the result of the
    earlier ones. Files are read and written as UTF-8.

    Args:
        target_files: Files to patch.
        old_texts: Snippets to search for.
        new_texts: Replacement for each snippet, in the same order.

    Raises:
        ValueError: If ``old_texts`` and ``new_texts`` differ in length
            (``zip`` would otherwise silently drop the extra entries).

    Warns:
        UserWarning: When a snippet is not present in a file, e.g. because the
            upstream source changed or the patch was already applied.
    """
    import warnings

    if len(old_texts) != len(new_texts):
        raise ValueError(
            f"old_texts and new_texts must have the same length "
            f"({len(old_texts)} != {len(new_texts)})"
        )

    for target_file in target_files:
        original = target_file.read_text(encoding="utf-8")
        patched = original

        for old_text, new_text in zip(old_texts, new_texts):
            if old_text not in patched:
                # A patch that does not apply would otherwise go unnoticed.
                warnings.warn(
                    f"{target_file}: snippet not found, nothing replaced: {old_text!r}",
                    stacklevel=2,
                )
                continue
            patched = patched.replace(old_text, new_text)

        # Leave the file untouched when no replacement changed anything.
        if patched != original:
            target_file.write_text(patched, encoding="utf-8")

In [None]:
# Patch tune.py so that pretrained weights can be loaded before tuning starts:
#   1. add `from torch import load` between the two existing strhub imports;
#   2. right after the model is instantiated, when `config.pretrained` is not
#      None, load that state dict into `model.model` if the configured
#      `_target_` ends with 'PARSeq', otherwise into `model` itself.
modify_target_files(
    [
        parseq_path / "tune.py",
    ],
    # Snippets to look for
    [
        "from strhub.data.module import SceneTextDataModule\nfrom strhub.models.base import BaseSystem",
        "model: BaseSystem = hydra.utils.instantiate(config.model)\n    datamodule: SceneTextDataModule = hydra.utils.instantiate(config.data)",
    ],
    # Their replacements, in the same order
    [
        "from strhub.data.module import SceneTextDataModule\nfrom torch import load\nfrom strhub.models.base import BaseSystem",
        "model: BaseSystem = hydra.utils.instantiate(config.model)\n    if config.pretrained is not None:\n        m = model.model if config.model._target_.endswith('PARSeq') else model\n        m.load_state_dict(load(config.pretrained))\n    datamodule: SceneTextDataModule = hydra.utils.instantiate(config.data)",
    ],
)

In [None]:
# Run the patched tune.py inside the "parseq" environment, starting from the
# downloaded checkpoint: at most 5 epochs, learning-rate bounds 1e-5 to 1e-4,
# dropout set to 0.
# NOTE(review): val_check_interval=16 is presumably counted in training batches - confirm.
!conda run -n parseq python "{parseq_path}/tune.py" \
trainer.max_epochs=5 \
trainer.val_check_interval=16 \
pretrained="/content/parseq.pt" \
tune.lr.max=0.0001 \
tune.lr.min=0.00001 \
model.dropout=0

[2025-10-29 20:08:49,367][strhub.data.dataset][INFO] - dataset root:	/content/data/train/real
[2025-10-29 20:08:49,381][strhub.data.dataset][INFO] - 	lmdb:	.	num samples: 6000
╭────────────────────────────────────────────────────────╮
│ Configuration for experiment     2025-10-29_20-08-49   │
├────────────────────────────────────────────────────────┤
│ Search algorithm                 SearchGenerator       │
│ Scheduler                        MedianStoppingRule    │
│ Number of trials                 10                    │
╰────────────────────────────────────────────────────────╯

View detailed results here: /content/ray_results/parseq/2025-10-29_20-08-49
To visualize your results with TensorBoard, run: `tensorboard --logdir /content/ray_results/parseq/2025-10-29_20-08-49`

Trial status: 1 PENDING
Current time: 2025-10-29 20:08:52. Total running time: 0s
Logical resource usage: 0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:L4)
╭────────────────────────────────────────╮
│ Trial name  

# Execução

In [None]:
# Patch read.py so it can process a whole directory and save its predictions:
#   1. additionally import `Path` from pathlib;
#   2. remove the `nargs='+', ` text, so `args.images` is handled as one path;
#   3. iterate over everything found recursively under that path when it is a
#      directory, or over the path itself otherwise;
#   4. open each image through its absolute path;
#   5. instead of printing one line per image, collect tab-separated lines
#      "<file>\t<prediction>\t<values of p[0] without the last entry>", then
#      print them all and write them to outputs.txt in the working directory.
modify_target_files(
    [
        parseq_path / "read.py",
    ],
    # Snippets to look for
    [
        "import argparse",
        "nargs='+', ",
        "for fname in args.images:",
        "Image.open(fname)",
        "print(f'{fname}: {pred[0]}')",
    ],
    # Their replacements, in the same order
    [
        "import argparse\nfrom pathlib import Path",
        "",
        "text = \"\"\n    imgs = Path(args.images)\n    for fname in (imgs.rglob(\"*\") if imgs.is_dir() else [imgs]):",
        "Image.open(fname.absolute())",
        "text += f\"{fname}\t{pred[0]}\t{p[0].tolist()[:-1]}\\n\"\n    print(text)\n    open(\"outputs.txt\", \"w\").write(text)",
    ],
)

In [None]:
import re

normalize = lambda s: re.sub(r'[^a-z0-9]', '', s.lower())
compare = lambda a, b: normalize(a) == normalize(b)

for best in sorted([f for f in pathlib.Path(f"/content/ray_results/parseq").iterdir() if f.is_dir()])[-1].rglob("checkpoint"):
    infer_imgs = f"/content/WordArt-V1.5/testB/images"

    !cd {parseq_path} && conda run -n parseq python "{parseq_path}/read.py" \
    "{best}" \
    --images="{infer_imgs}" \
    --device=cuda \
    > /dev/null 2>&1

    with open(f"{parseq_path}/outputs.txt") as file:
                raw_results = file.read().strip().split("\n")

    raw_results = [line.strip().split("\t") for line in raw_results]
    raw_results = [
        tuple(
            [
                pathlib.Path(path),
                pred,
                tuple([
                    float(c)
                    for c in confidence.replace("[", "").replace("]", "").strip().split(", ")
                ])
            ]
        )
        for path, pred, confidence in raw_results
    ]

    correct = 0
    for img_path, label in zip(testB_images, testB_labels):
        for path, pred, _ in raw_results:
            if pathlib.Path(img_path).stem == pathlib.Path(path).stem:
                correct += 1 if compare(pred, label) else 0
                break

    print(f"{(best / '..' / '..').resolve().name}/{(best / '..').resolve().name}: {correct}/{len(raw_results)} ({correct / len(raw_results) * 100:.2f}%)")

trainable_4639b809_1_lr=0.0001_2025-10-29_20-08-52/checkpoint_000000: 2625/3000 (87.50%)
trainable_4639b809_1_lr=0.0001_2025-10-29_20-08-52/checkpoint_000003: 2625/3000 (87.50%)
trainable_4639b809_1_lr=0.0001_2025-10-29_20-08-52/checkpoint_000002: 2625/3000 (87.50%)
trainable_4639b809_1_lr=0.0001_2025-10-29_20-08-52/checkpoint_000001: 2625/3000 (87.50%)
trainable_a6b0145c_6_lr=0.0000_2025-10-29_20-11-50/checkpoint_000000: 2619/3000 (87.30%)
trainable_bdae9971_4_lr=0.0000_2025-10-29_20-10-23/checkpoint_000000: 2619/3000 (87.30%)
trainable_09e90a3e_7_lr=0.0000_2025-10-29_20-13-15/checkpoint_000000: 2618/3000 (87.27%)
trainable_09e90a3e_7_lr=0.0000_2025-10-29_20-13-15/checkpoint_000003: 2624/3000 (87.47%)
trainable_09e90a3e_7_lr=0.0000_2025-10-29_20-13-15/checkpoint_000002: 2623/3000 (87.43%)
trainable_09e90a3e_7_lr=0.0000_2025-10-29_20-13-15/checkpoint_000001: 2623/3000 (87.43%)
trainable_96375eb2_5_lr=0.0001_2025-10-29_20-11-08/checkpoint_000000: 2623/3000 (87.43%)
trainable_96375eb2_5_

In [None]:
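# Results recorded from an earlier tuning run on 2025-10-29 (trial timestamps 18:34 to 18:44), kept for comparison:
# trainable_3d5f6386_7_lr=0.0007_2025-10-29_18-39-26/checkpoint_000000: 2613/3000 (87.10%)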
# trainable_3d5f6386_7_lr=0.0007_2025-10-29_18-39-26/checkpoint_000002: 2600/3000 (86.67%)
# trainable_3d5f6386_7_lr=0.0007_2025-10-29_18-39-26/checkpoint_000001: 2600/3000 (86.67%)
# trainable_fb6c0618_5_lr=0.0002_2025-10-29_18-37-13/checkpoint_000000: 2626/3000 (87.53%)
# trainable_fb6c0618_5_lr=0.0002_2025-10-29_18-37-13/checkpoint_000002: 2626/3000 (87.53%)
# trainable_fb6c0618_5_lr=0.0002_2025-10-29_18-37-13/checkpoint_000001: 2626/3000 (87.53%)
# trainable_9418a129_6_lr=0.0001_2025-10-29_18-37-59/checkpoint_000000: 2628/3000 (87.60%)
# trainable_9418a129_6_lr=0.0001_2025-10-29_18-37-59/checkpoint_000003: 2623/3000 (87.43%)
# trainable_9418a129_6_lr=0.0001_2025-10-29_18-37-59/checkpoint_000002: 2623/3000 (87.43%)
# trainable_9418a129_6_lr=0.0001_2025-10-29_18-37-59/checkpoint_000001: 2623/3000 (87.43%)
# trainable_d5923f7d_10_lr=0.0001_2025-10-29_18-44-45/checkpoint_000000: 2629/3000 (87.63%)
# trainable_d5923f7d_10_lr=0.0001_2025-10-29_18-44-45/checkpoint_000003: 2620/3000 (87.33%)
# trainable_d5923f7d_10_lr=0.0001_2025-10-29_18-44-45/checkpoint_000002: 2619/3000 (87.30%)
# trainable_d5923f7d_10_lr=0.0001_2025-10-29_18-44-45/checkpoint_000001: 2618/3000 (87.27%)
# trainable_8be2ee24_2_lr=0.0004_2025-10-29_18-35-00/checkpoint_000000: 2621/3000 (87.37%)
# trainable_a06efcf3_4_lr=0.0004_2025-10-29_18-36-30/checkpoint_000000: 2598/3000 (86.60%)
# trainable_145f6c4f_1_lr=0.0020_2025-10-29_18-34-56/checkpoint_000000: 2492/3000 (83.07%)
# trainable_145f6c4f_1_lr=0.0020_2025-10-29_18-34-56/checkpoint_000001: 2492/3000 (83.07%)
# trainable_b56d9625_8_lr=0.0012_2025-10-29_18-40-53/checkpoint_000000: 2545/3000 (84.83%)
# trainable_421fb136_3_lr=0.0001_2025-10-29_18-35-48/checkpoint_000000: 2632/3000 (87.73%)
# trainable_421fb136_3_lr=0.0001_2025-10-29_18-35-48/checkpoint_000003: 2630/3000 (87.67%)
# trainable_421fb136_3_lr=0.0001_2025-10-29_18-35-48/checkpoint_000002: 2630/3000 (87.67%)
# trainable_421fb136_3_lr=0.0001_2025-10-29_18-35-48/checkpoint_000001: 2632/3000 (87.73%)