In [1]:
import os
import argparse
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import cv2
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import string

2024-10-04 04:22:51.619803: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-04 04:22:51.636717: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-04 04:22:51.641922: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-04 04:22:51.654257: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Параметры
IMG_HEIGHT = 64         # Высота изображения
IMG_WIDTH = 256         # Ширина изображения
BATCH_SIZE = 4
MAX_LABEL_LENGTH = 30   # Максимальная длина текста
MARGIN = 0            # Отступ вокруг текста
BG_COLOR = 'white'      # Цвет фона
TEXT_COLOR = 'black'    # Цвет текста

# Пути к директориям
FONT_DIR = 'text/fonts/'
TEXT_FILE = 'text/sah.wordlist'
OUTPUT_IMAGE_DIR = 'test_images/images'
OUTPUT_LABEL_DIR = 'test_images/labels'
FONTS_LIST_FILE = 'text/okfonts_test.txt'

In [3]:
# Подготовка Словаря Символов
characters = string.ascii_letters + string.digits + 'абвгдеёжзийклмнопрстуфхцчшщъыьэюяҕҥөһүАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯҔҤӨҺҮ '

# Создание маппинга символов в числа и обратно
char_to_num = {char: idx + 1 for idx, char in enumerate(characters)}  # 0 для CTC blank
num_to_char = {idx + 1: char for idx, char in enumerate(characters)}

def text_to_labels(text):
    return [char_to_num[char] for char in text if char in char_to_num]

def labels_to_text(labels):
    return ''.join([num_to_char[label] for label in labels if label in num_to_char])

In [4]:
import glob
from sklearn.model_selection import train_test_split

In [5]:
# Создание списков путей к изображениям и меткам
def get_image_label_pairs(image_dir, label_dir):
    image_files = sorted(glob.glob(os.path.join(image_dir, '*.png')))
    label_files = sorted(glob.glob(os.path.join(label_dir, '*.txt')))
    image_label_pairs = []
    for img_path, lbl_path in zip(image_files, label_files):
        with open(lbl_path, 'r', encoding='utf-8') as f:
            label = f.read().strip()
        # Преобразуем текст в числовые метки
        label_seq = text_to_labels(label)
        if len(label_seq) > 0:  # Фильтруем метки с длиной > 0
            image_label_pairs.append((img_path, label))
    return image_label_pairs

In [6]:
image_label_pairs = get_image_label_pairs(OUTPUT_IMAGE_DIR, OUTPUT_LABEL_DIR)
print(f"Всего пар изображение-метка: {len(image_label_pairs)}")
# Ожидаемый вывод: Всего пар изображение-метка: 1000 * количество шрифтов

# Разделение данных на обучающую, валидационную и тестовую выборки
train_pairs, temp_pairs = train_test_split(image_label_pairs, test_size=0.2, random_state=42)
val_pairs, test_pairs = train_test_split(temp_pairs, test_size=0.5, random_state=42)

print(f"Train: {len(train_pairs)}, Validation: {len(val_pairs)}, Test: {len(test_pairs)}")

Всего пар изображение-метка: 75000
Train: 60000, Validation: 7500, Test: 7500


In [7]:
def preprocess_image(image_path, img_height, img_width):
    img = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if img is None:
        raise ValueError(f"Не удалось загрузить изображение: {image_path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (img_width, img_height))
    img = img / 255.0  # Нормализация
    return img.astype(np.float32)

def encode_sample(image_path, label):
    # Декодируем image_path и label из байтов в строки
    image_path = image_path.numpy().decode('utf-8')
    label = label.numpy().decode('utf-8')
    
    # Обрабатываем изображение
    img = preprocess_image(image_path, IMG_HEIGHT, IMG_WIDTH)
    
    # Преобразуем текст в числовые метки
    label = text_to_labels(label)
    
    # Обрезаем и дополняем метку до MAX_LABEL_LENGTH
    if len(label) > MAX_LABEL_LENGTH:
        label = label[:MAX_LABEL_LENGTH]
    else:
        label += [0] * (MAX_LABEL_LENGTH - len(label))
    
    # Преобразуем в numpy массив и обеспечиваем правильную форму
    label = np.array(label, dtype=np.int32).reshape(MAX_LABEL_LENGTH,)
    
    # Проверка длины метки
    assert len(label) == MAX_LABEL_LENGTH, f"Label length {len(label)} != MAX_LABEL_LENGTH {MAX_LABEL_LENGTH}"
    
    return img, label

def tf_encode(image_path, label):
    img, label = tf.py_function(
        func=encode_sample, 
        inp=[image_path, label], 
        Tout=[tf.float32, tf.int32]
    )
    img.set_shape([IMG_HEIGHT, IMG_WIDTH, 3])
    label.set_shape([MAX_LABEL_LENGTH])
    return img, label


def prepare_dataset(pairs):
    image_paths, labels = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(image_paths), list(labels)))
    dataset = dataset.map(tf_encode, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset

In [8]:
train_dataset = prepare_dataset(train_pairs)
val_dataset = prepare_dataset(val_pairs)
test_dataset = prepare_dataset(test_pairs)

2024-10-04 04:22:56.601129: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 18313 MB memory:  -> device: 0, name: NVIDIA RTX A5000, pci bus id: 0000:01:00.0, compute capability: 8.6
2024-10-04 04:22:56.601926: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 5383 MB memory:  -> device: 1, name: NVIDIA RTX A5000, pci bus id: 0000:24:00.0, compute capability: 8.6
2024-10-04 04:22:56.602530: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 16821 MB memory:  -> device: 2, name: NVIDIA RTX A5000, pci bus id: 0000:41:00.0, compute capability: 8.6
2024-10-04 04:22:56.603152: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 22296 MB memory:  -> device: 3, name: NVIDIA RTX A5000, pci bus id: 0000:61:00.0, c

In [9]:
def prepare_batch(images, labels):
    # Считаем количество ненулевых элементов в каждой метке
    label_length = tf.math.count_nonzero(labels, axis=1, dtype=tf.int32)

    # Преобразуем метки в нужную форму с использованием expand_dims
    labels = tf.expand_dims(labels, axis=-1)

    # Устанавливаем input_length
    input_length = tf.ones(shape=(tf.shape(images)[0],), dtype=tf.int32) * (IMG_WIDTH // 8)

    return {
        "input_image": images,
        "labels": labels,
        "input_length": input_length,
        "label_length": label_length
    }, None


In [10]:
# Применение batch, map и prefetch
train_dataset = train_dataset.batch(BATCH_SIZE).map(prepare_batch, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(BATCH_SIZE).map(prepare_batch, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).map(prepare_batch, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

In [11]:
for batch in train_dataset.take(1):
    inputs, _ = batch
    print("input_image shape:", inputs['input_image'].shape)      # (batch_size, 64, 256, 3)
    print("labels shape:", inputs['labels'].shape)                # (batch_size, 15)
    print("input_length shape:", inputs['input_length'].shape)    # (batch_size, 1)
    print("label_length shape:", inputs['label_length'].shape)    # (batch_size, 1)
    print("input_length dtype:", inputs['input_length'].dtype)    # int32
    print("label_length dtype:", inputs['label_length'].dtype)    # int32
    break

input_image shape: (4, 64, 256, 3)
labels shape: (4, 30, 1)
input_length shape: (4,)
label_length shape: (4,)
input_length dtype: <dtype: 'int32'>
label_length dtype: <dtype: 'int32'>


In [12]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Reshape, Dense, Bidirectional, LSTM, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.applications import ResNet50

In [13]:
def build_ocr_model(img_height, img_width, num_classes):
    # Входной слой для изображения
    input_img = Input(shape=(img_height, img_width, 3), name='input_image')

    # ResNet50 в качестве извлекателя признаков
    base_model = ResNet50(weights='imagenet', include_top=False, input_tensor=input_img)

    # Примерно уменьшаем размерность по высоте и ширине изображения
    x = base_model.output

    # Преобразование для LSTM: Разворачиваем карты признаков в последовательность
    x = Reshape(target_shape=(-1, x.shape[-1]))(x)  # (None, W // 32 * H // 32, 2048)

    # RNN часть (рекуррентные слои для распознавания последовательности)
    x = Bidirectional(LSTM(256, return_sequences=True))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.25)(x)

    x = Bidirectional(LSTM(256, return_sequences=True))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.25)(x)

    # Выходной слой для предсказания вероятностей символов
    output = Dense(num_classes, activation='softmax', name='output')(x)

    # Модель
    model = Model(inputs=input_img, outputs=output)
    return model

# Параметры модели
num_classes = len(characters) + 1  # Количество символов + 1 для CTC blank
ocr_model = build_ocr_model(IMG_HEIGHT, IMG_WIDTH, num_classes)
ocr_model.summary()

In [14]:
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args

    # Проверка формы перед вызовом CTC loss
    print("y_pred shape:", tf.shape(y_pred))           # Должно быть (batch_size, time_steps, num_classes)
    print("labels shape:", tf.shape(labels))           # Должно быть (batch_size, max_label_length)
    print("input_length shape:", tf.shape(input_length))  # Должно быть (batch_size,)
    print("label_length shape:", tf.shape(label_length))  # Должно быть (batch_size,)

    # Преобразование input_length и label_length в одномерные массивы
    input_length = tf.squeeze(input_length, axis=-1)
    label_length = tf.squeeze(label_length, axis=-1)

    # Проверка после преобразования
    print("input_length after squeeze shape:", tf.shape(input_length))  # Должно быть (batch_size,)
    print("label_length after squeeze shape:", tf.shape(label_length))  # Должно быть (batch_size,)

    # Вычисляем CTC loss
    return tf.keras.backend.ctc_batch_cost(labels, y_pred, input_length, label_length)

def build_ctc_model(base_model):
    labels = Input(name='labels', shape=(MAX_LABEL_LENGTH,), dtype='int64')  # уберите квадратные скобки
    input_length = Input(name='input_length', shape=(), dtype='int64')  # Убедитесь, что это одномерный массив
    label_length = Input(name='label_length', shape=(), dtype='int64')  # Одномерный массив

    # Используем CTC loss
    loss_out = tf.keras.layers.Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(
        [base_model.output, labels, input_length, label_length])

    model = Model(inputs=[base_model.input, labels, input_length, label_length], outputs=loss_out)
    model.compile(optimizer='adam', loss={'ctc': lambda y_true, y_pred: y_pred})
    return model

ctc_model = build_ctc_model(ocr_model)
ctc_model.summary()

In [15]:
# Обучение
history = ctc_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,  # Выберите количество эпох
    batch_size=BATCH_SIZE,
    verbose=1
)

Epoch 1/10


ValueError: Exception encountered when calling Functional.call().

[1mInvalid input shape for input Tensor("functional_1_1/Cast:0", shape=(None,), dtype=int64). Expected shape (None, 30), but input has incompatible shape (None,)[0m

Arguments received by Functional.call():
  • inputs={'input_image': 'tf.Tensor(shape=(None, 64, 256, 3), dtype=float32)', 'labels': 'tf.Tensor(shape=(None, 30, 1), dtype=int32)', 'input_length': 'tf.Tensor(shape=(None,), dtype=int32)', 'label_length': 'tf.Tensor(shape=(None,), dtype=int32)'}
  • training=True
  • mask={'input_image': 'None', 'labels': 'None', 'input_length': 'None', 'label_length': 'None'}