In [7]:
!pip install utils pandas natsort nltk wandb tensorboard

Collecting tensorboard
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting absl-py>=0.4 (from tensorboard)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting grpcio>=1.48.2 (from tensorboard)
  Downloading grpcio-1.67.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting markdown>=2.6.8 (from tensorboard)
  Downloading Markdown-3.7-py3-none-any.whl.metadata (7.0 kB)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)
  Downloading tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl.metadata (1.1 kB)
Collecting werkzeug>=1.0.1 (from tensorboard)
  Downloading werkzeug-3.0.4-py3-none-any.whl.metadata (3.7 kB)
Downloading tensorboard-2.18.0-py3-none-any.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hDownloading absl_py-2.1.0-py3-none-any.whl (133 kB)
[2K   [90m━━━━━━━

In [2]:
import os
import torch.backends.cudnn as cudnn
import yaml
from train import train
from utils import AttrDict
import pandas as pd

In [3]:
# cuDNN backend flags for this run: deterministic mode is switched off and
# the benchmark flag is switched on. The two settings are independent.
cudnn.deterministic = False
cudnn.benchmark = True

In [5]:
import unicodedata
def clean_text(text):
    """Normalize a label string and strip characters unwanted in the charset.

    The text is first normalized to NFC. From the normalized text, a
    character is dropped when any of the following holds:

    * its Unicode general category starts with ``C`` (the "Other" group);
    * it has a non-zero canonical combining class and is not U+0301
      (combining acute accent), the only combining mark that is allowed;
    * it is U+200E (LRM) or U+200F (RLM), which are listed explicitly.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, with the surviving characters in their
        original order.
    """
    # Only the combining acute accent is allowed to survive.
    keep_marks = frozenset({'\u0301'})
    # Explicitly unwanted characters: LRM and RLM.
    banned = frozenset({'\u200e', '\u200f'})

    def _is_kept(ch):
        if ch in banned:
            return False
        if unicodedata.category(ch).startswith('C'):
            return False
        return ch in keep_marks or not unicodedata.combining(ch)

    normalized = unicodedata.normalize('NFC', text)
    return ''.join(filter(_is_kept, normalized))

In [6]:
def _read_label_words(csv_path):
    """Return the ``words`` column of a ``labels.csv`` file as strings.

    ``dtype`` pins the column to ``str`` so that a file whose labels are all
    numeric is not parsed as integers, and ``keep_default_na=False`` keeps
    labels such as ``"NA"`` or empty fields as plain strings.
    """
    df = pd.read_csv(
        csv_path,
        sep=',',
        engine='python',
        usecols=['filename', 'words'],
        dtype={'words': str},
        keep_default_na=False,
    )
    return df['words']


def get_config(file_path):
    """Load a YAML training config and fill in ``opt.character``.

    When ``opt.lang_char`` is the string ``'None'``, the character set is
    derived from the data: every ``labels.csv`` under
    ``opt.train_data/<name>`` (``<name>`` taken from the ``-``-separated
    ``opt.select_data``) is read, its labels are passed through
    ``clean_text``, labels that are empty after cleaning are ignored, and the
    sorted set of remaining characters becomes ``opt.character``. Otherwise
    ``opt.character`` is ``opt.number + opt.symbol + opt.lang_char``.

    Afterwards all cleaned labels are scanned for characters missing from
    ``opt.character`` and the result is printed. As a side effect the
    directory ``./saved_models/<opt.experiment_name>`` is created.

    Args:
        file_path: Path to the YAML configuration file.

    Returns:
        The ``AttrDict`` built from the YAML file, with ``character`` set.
    """
    with open(file_path, 'r', encoding="utf8") as stream:
        opt = AttrDict(yaml.safe_load(stream))

    csv_paths = [
        os.path.join(opt['train_data'], data, 'labels.csv')
        for data in opt['select_data'].split('-')
    ]
    # Cleaned labels per file, so the validation pass below does not have to
    # read and clean the same CSV a second time.
    cleaned_by_path = {}

    if opt.lang_char == 'None':
        charset = set()
        for csv_path in csv_paths:
            raw_words = _read_label_words(csv_path)
            print(f"Прочитано {len(raw_words)} примеров из {csv_path}")
            # Show the sample *before* cleaning, as the message says; skip it
            # for an empty file instead of failing on .iloc[0].
            if len(raw_words):
                print(f"Пример до очистки: {raw_words.iloc[0]}")
            cleaned = raw_words.apply(clean_text)
            cleaned_by_path[csv_path] = cleaned
            # Labels that are empty after cleaning do not contribute characters.
            kept = cleaned[cleaned.str.strip() != '']
            print(f"Оставлено {len(kept)} примеров после очистки из {csv_path}")
            for word in kept:
                charset.update(word)
        opt.character = ''.join(sorted(charset))
    else:
        opt.character = opt.number + opt.symbol + opt.lang_char

    print(f"Общее количество символов в opt.character: {len(opt.character)}")

    # Extra check: report any character in the cleaned labels that is not
    # part of opt.character.
    supported = set(opt.character)
    unsupported_chars = set()
    for csv_path in csv_paths:
        cleaned = cleaned_by_path.get(csv_path)
        if cleaned is None:
            cleaned = _read_label_words(csv_path).apply(clean_text)
        for word in cleaned:
            unsupported_chars.update(set(word) - supported)
    if unsupported_chars:
        print(f"Найдено неподдерживаемых символов: {unsupported_chars}")
    else:
        print("Неподдерживаемых символов не найдено.")

    os.makedirs(f'./saved_models/{opt.experiment_name}', exist_ok=True)
    return opt

In [None]:
def initialize_wandb(opt):
    """Start a Weights & Biases run for the experiment described by ``opt``.

    Calls ``wandb.init`` with the project name ``"EasyOCR_Yakut"``, ``opt`` as
    the run config, ``opt.experiment_name`` as the run name and
    ``sync_tensorboard=True``, then updates ``wandb.config`` with ``opt``.

    Args:
        opt: The configuration object; must expose ``experiment_name``.
    """
    # Imported here because the notebook's import cell does not import wandb;
    # without this the function fails with NameError when called.
    import wandb

    wandb.init(
        project="EasyOCR_Yakut",  # wandb project name
        config=opt,               # pass the configuration as hyperparameters
        name=opt.experiment_name, # run name
        sync_tensorboard=True     # for runs that also log to TensorBoard
    )
    # Log additional parameters if needed.
    # NOTE(review): this passes the same `opt` already given to init() above,
    # so it looks redundant — confirm before removing.
    wandb.config.update(opt)

In [None]:
# Build the run options from the YAML config; get_config also fills in
# opt.character and creates ./saved_models/<experiment_name>.
opt = get_config("config_files/sah_filtered_config.yaml")
# Launch training with those options. `amp=False` is forwarded to train();
# presumably it disables automatic mixed precision — confirm against train.py.
train(opt, amp=False)

Общее количество символов в opt.character: 57
Неподдерживаемых символов не найдено.
Filtering the images containing characters which are not in opt.character
Filtering the images whose label is longer than opt.batch_max_length
--------------------------------------------------------------------------------
dataset_root: all_data/
opt.select_data: ['sah_filtered']
opt.batch_ratio: ['1']
--------------------------------------------------------------------------------
dataset_root:    all_data/	 dataset: sah_filtered
all_data//sah_filtered
sub-directory:	/sah_filtered	 num samples: 31983
num total samples of sah_filtered: 31983 x 1.0 (total_data_usage_ratio) = 31983
num samples of sah_filtered per batch: 64 x 1.0 (batch_ratio) = 64
--------------------------------------------------------------------------------
Total_batch_size: 64 = 64
--------------------------------------------------------------------------------
dataset_root:    all_data/val	 dataset: /
all_data/val/
sub-directory:	/.