In [1]:
# importação dos dados
import os
import tarfile
from tqdm import tqdm
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

import torch
import torch.optim as optim
from torchsummaryX import summary

from mltu.torch.model import Model
from mltu.torch.losses import CTCLoss
from mltu.torch.dataProvider import DataProvider
from mltu.torch.metrics import CERMetric, WERMetric
from mltu.torch.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, Model2onnx, ReduceLROnPlateau

from mltu.preprocessors import ImageReader
from mltu.transformers import ImageResizer, LabelIndexer, LabelPadding, ImageShowCV2
from mltu.augmentors import RandomBrightness, RandomRotate, RandomErodeDilate, RandomSharpen
from mltu.annotations.images import CVImage

from model import Network
from configs import ModelConfigs

- Pré processamento dos dados

In [2]:
dataset, vocab, max_len = [], set(), 0
dataset_path = "../data/iam_data/"
# Processando o dataset pelo formato especifico do dataset IAM_Words
words = open(f"{dataset_path}words.txt", "r").readlines()
for line in tqdm(words):
    if line.startswith("#"):
        continue

    line_split = line.split(" ")
    if line_split[1] == "err":
        continue

    folder1 = line_split[0][:3]
    folder2 = "-".join(line_split[0].split("-")[:2])
    file_name = line_split[0] + ".png"
    label = line_split[-1].rstrip('\n')
    rel_path = f"{dataset_path}words/{folder1}/{folder2}/{file_name}"
    if not os.path.exists(rel_path):
        print(f"File not found: {rel_path}")
        continue

    dataset.append([rel_path, label])
    vocab.update(list(label))
    max_len = max(max_len, len(label))

configs = ModelConfigs()

# Save vocab and maximum text length to configs
configs.vocab = "".join(sorted(vocab))
configs.max_text_length = max_len
configs.save()

100%|██████████| 115320/115320 [00:08<00:00, 13576.64it/s]


In [3]:
dataset

[['../data/iam_data/words/a01/a01-000u/a01-000u-00-00.png', 'A'],
 ['../data/iam_data/words/a01/a01-000u/a01-000u-00-01.png', 'MOVE'],
 ['../data/iam_data/words/a01/a01-000u/a01-000u-00-02.png', 'to'],
 ['../data/iam_data/words/a01/a01-000u/a01-000u-00-03.png', 'stop'],
 ['../data/iam_data/words/a01/a01-000u/a01-000u-00-04.png', 'Mr.'],
 ['../data/iam_data/words/a01/a01-000u/a01-000u-00-05.png', 'Gaitskell'],
 ['../data/iam_data/words/a01/a01-000u/a01-000u-00-06.png', 'from'],
 ['../data/iam_data/words/a01/a01-000u/a01-000u-01-00.png', 'nominating'],
 ['../data/iam_data/words/a01/a01-000u/a01-000u-01-01.png', 'any'],
 ['../data/iam_data/words/a01/a01-000u/a01-000u-01-02.png', 'more'],
 ['../data/iam_data/words/a01/a01-000u/a01-000u-01-03.png', 'Labour'],
 ['../data/iam_data/words/a01/a01-000u/a01-000u-01-04.png', 'life'],
 ['../data/iam_data/words/a01/a01-000u/a01-000u-01-05.png', 'Peers'],
 ['../data/iam_data/words/a01/a01-000u/a01-000u-02-00.png', 'is'],
 ['../data/iam_data/words/a01

In [10]:
# Criação do modelo e definição dos hiperparâmetros
data_provider = DataProvider(
    dataset=dataset,
    skip_validation=True,
    batch_size=configs.batch_size,
    data_preprocessors=[ImageReader(CVImage)],
    transformers=[
        ImageShowCV2(),  # uncomment to show images when iterating over the data provider
        ImageResizer(configs.width, configs.height, keep_aspect_ratio=False),
        LabelIndexer(configs.vocab),
        LabelPadding(max_word_length=configs.max_text_length,
                     padding_value=len(configs.vocab))
    ],
    use_cache=True,
)

2023-06-05 15:32:53,692 INFO DataProvider: Skipping Dataset validation...


In [11]:
# nao funciona
for _ in data_provider:
    print(_)



(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))




(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))




(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))




(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))
(array([], dtype=float64), array([], dtype=float64))




In [117]:
# Separação dos dados em treino e teste (90% e 10%)
train_dataProvider, test_dataProvider = data_provider.split(split = 0.9)

In [118]:
# Augment training data with random brightness, rotation and erode/dilate
train_dataProvider.augmentors = [
    RandomBrightness(), 
    RandomErodeDilate(),
    RandomSharpen(),
    RandomRotate(angle=10), 
    ]

In [119]:
network = Network(len(configs.vocab), activation='leaky_relu', dropout=0.3)

In [120]:
loss = CTCLoss(blank=len(configs.vocab))
optimizer = optim.Adam(network.parameters(), lr=configs.learning_rate)

In [None]:
# uncomment to print network summary, torchsummaryX package is required
summary(network, torch.zeros((1, configs.height, configs.width, 3)))

In [121]:
# put on cuda device if available
if torch.cuda.is_available():
    network = network.cuda()

In [122]:
# create callbacks
earlyStopping = EarlyStopping(monitor="val_CER", patience=20, mode="min", verbose=1)
modelCheckpoint = ModelCheckpoint(configs.model_path + "/model.pt", monitor="val_CER", mode="min", save_best_only=True, verbose=1)
tb_callback = TensorBoard(configs.model_path + "/logs")
reduce_lr = ReduceLROnPlateau(monitor="val_CER", factor=0.9, patience=10, verbose=1, mode="min", min_lr=1e-6)
model2onnx = Model2onnx(
    saved_model_path=configs.model_path + "/model.pt",
    input_shape=(1, configs.height, configs.width, 3), 
    verbose=1,
    metadata={"vocab": configs.vocab}
    )

In [123]:
# create model object that will handle training and testing of the network
model = Model(network, optimizer, loss, metrics=[CERMetric(configs.vocab), WERMetric(configs.vocab)])
model.fit(
    train_dataProvider, 
    test_dataProvider, 
    epochs=1000, 
    callbacks=[earlyStopping, modelCheckpoint, tb_callback, reduce_lr, model2onnx]
    )

  0%|          | 0/1357 [00:00<?, ?it/s]


RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 1 is not equal to len(dims) = 4

In [None]:
# Save training and validation datasets as csv files
train_dataProvider.to_csv(os.path.join(configs.model_path, "train.csv"))
test_dataProvider.to_csv(os.path.join(configs.model_path, "val.csv"))