# The begin

In [1]:
import cv2
import numpy as np
import pandas as pd
import scipy.stats as ss
import matplotlib.pyplot as plt


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torch.utils.data as data

import scipy.stats as ss

import torchvision.transforms as transforms
import torchvision.datasets as datasets

from sklearn import metrics
from sklearn import decomposition
from sklearn import manifold
from tqdm.notebook import trange, tqdm

from PIL import Image, ImageDraw, ImageFont
import PIL

import os

import copy
import random
import time

import tracemalloc

%matplotlib inline

In [None]:
bert_path = "cointegrated/rubert-tiny2"
vit_path = "google/vit-base-patch16-224-in21k"

In [None]:
import gc


def cleanup():
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
!pip install sentencepiece
!pip install levenshtein
!pip install transformers

In [None]:
from transformers import (
    AutoModel,
    AutoTokenizer,
    BertConfig,
    DeiTConfig,
    DeiTModel,
    DeiTFeatureExtractor,
    TrOCRConfig,
    TrOCRProcessor,
    TrOCRForCausalLM,
    ViTConfig,
    ViTModel,
    VisionEncoderDecoderModel,
    VisionEncoderDecoderConfig,
    ViTFeatureExtractor,
)

In [None]:
from google.colab import drive
drive.mount('./data')

In [None]:
cd data/MyDrive/ColabNotebooks/Doc_recognition/Passports

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [None]:
font_size = 15
# passport_font = "fonts/arial_0.ttf"
passport_font = "fonts/ocrb.ttf"

symbols = " .-1234567890АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
# symbols = " .-1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ"

length = 1 + ss.poisson(1).rvs(size=1)
text = ''.join(np.random.choice(list(symbols), size=length))

image = PIL.Image.open("photo_data/backgrounds/white.jpg").resize((224, 32))
w, h = image.size

draw = ImageDraw.Draw(image)
a, b = draw.textsize(text, font=ImageFont.truetype(passport_font, font_size))
print(a, b, w, h)

draw.text((0, 0), text, font=ImageFont.truetype(passport_font, font_size), fill=(50, 50, 50))
image = image.crop((0, 0, a + 1, b + 1)).resize((224, 32))

print(text)
plt.imshow(image)

## Датасет и даталоадер
Первая часть модели отвечает за нахождение региончиков с текстом.  
Происходит патчевая классификация. То есть таргет для картинки - это 2d тензор размера *\[H/P, W/P\]*, где для каждого патча выставлен класс, к которому от относится.  
Классов всего 16: 14 видов текстовых полей + *'photo'* + *'blank'*

In [None]:
class PassportDataset(Dataset):
    def __init__(self, size, length, processor, w=224, h=32):
        """
        pd_table - path to the dataset
        img_dir - directory with passport images
        w, h - the desirable size of the image (to resize)
        p - patch size
        from_, to_ - set them to take a part of a dataset, not the whole
        """
        self.w, self.h = w, h

        self.processor = processor
        self.size = size
        self.length = length
        self.img = PIL.Image.open("photo_data/backgrounds/cloth.jpg").resize((w, h))
        # self.passport_font = "fonts/arial_0.ttf"
        self.passport_font = "fonts/ocrb.ttf"

        self.transforms = transforms.Compose([
                transforms.Resize((h, w)),
                transforms.ColorJitter(brightness=(0.9, 1.1), contrast=(0.8, 1.25)),
                transforms.ToTensor()
            ])
        self.shape = (w, h)  # shape: x, y - for PIL image

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        font_size = np.random.randint(10, 16)
        symbols = " .-1234567890АБВГДЕЁЖЗИЙКЛМНОПРСТФХЦЧШЩЪЫЬЭЮЯ"
        # symbols = " .-1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        length = self.length + ss.poisson(1).rvs(size=1)
        text = ''.join(np.random.choice(list(symbols), size=length))

        image = self.img.copy()
        w, h = image.size

        draw = ImageDraw.Draw(image)
        a, b = draw.textsize(text, font=ImageFont.truetype(self.passport_font, font_size))
        
        draw.text((0, 0), text, font=ImageFont.truetype(self.passport_font, font_size), fill=(50, 50, 50))
        image = image.crop((0, 0, a + 1, b + 1))

        # tensor = self.transforms(image)
        # del image

        text = torch.tensor(self.processor.tokenizer(text, padding="max_length", max_length=32).input_ids)

        # return self.processor(tensor, return_tensors="pt").pixel_values[0], text
        return self.processor(images=image, return_tensors="pt").pixel_values[0], text

In [None]:
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-small-handwritten')

In [None]:
train_data = PassportDataset(100, 2, processor)
test_data = PassportDataset(25, 2, processor)

In [None]:
x, t = train_data[0]
print(t.shape)

In [None]:
train_dataloader = DataLoader(train_data, batch_size=2, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=2, shuffle=True)

In [None]:
for x, t in train_dataloader:
    print(t.shape)
    break

## Обучение itself

In [None]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

## Подготовка к распознаванию

# Свои train и eval

In [None]:
import Levenshtein as pylev

In [None]:
def train_full(model, train_data, optimizer, device):
    model.train()
    epoch_loss = 0
    normal_crop_loss = 0.04

    for X, texts in tqdm(train_data, desc="Training"):
        model.zero_grad()
        loss = model(pixel_values=X.to(device), labels=texts.to(device)).loss

        cleanup()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        del X, texts
        cleanup()

    return epoch_loss / len(train_data)


def evaluate_full(model, val_data, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for X, texts in tqdm(val_data, desc="Evaluating"):
            loss = model(pixel_values=X.to(device), labels=texts.to(device)).loss

            cleanup()

            epoch_loss += loss.item()
            del X, texts
            cleanup()

    return epoch_loss / len(val_data)


def count_levenshtein(model, processor, val_data, device):
    model.eval()
    epoch_lev_dist = []
    epoch_cer = []
    with torch.no_grad():
        for X, texts in tqdm(val_data, desc="Evaluating levenshtein"):        
            outputs = model.generate(X.to(device), max_length=32)
            cleanup()

            targets = processor.batch_decode(texts, skip_special_tokens=True)
            preds = processor.batch_decode(outputs, skip_special_tokens=True)

            for pred, target in zip(preds, targets):
                if np.random.randint(100) < 4:
                    print(f"[{pred}] vs [{target}]")
                epoch_lev_dist.append(pylev.distance(pred, target))
                if pred or target:
                    epoch_cer.append(epoch_lev_dist[-1] / max(len(pred), len(target)))

            del X, texts
            cleanup()

    return np.mean(epoch_lev_dist), np.mean(epoch_cer)

# Final train

In [19]:
bert_path = "cointegrated/rubert-tiny2"
vit_path = "google/vit-base-patch16-224-in21k"

In [20]:
tokenizer = AutoTokenizer.from_pretrained(bert_path)
feature_extractor = DeiTFeatureExtractor.from_pretrained(vit_path)

processor = TrOCRProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.image_processor.size = {'height': 32, 'width': 224}



In [21]:
processor

TrOCRProcessor:
- image_processor: DeiTFeatureExtractor {
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "DeiTFeatureExtractor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 32,
    "width": 224
  }
}

- tokenizer: BertTokenizerFast(name_or_path='cointegrated/rubert-tiny2', vocab_size=83828, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [22]:
config_encoder = ViTConfig.from_pretrained(vit_path)
config_decoder = BertConfig.from_pretrained(bert_path)
config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

recognize_model = VisionEncoderDecoderModel(config)
# set special tokens used for creating the decoder_input_ids from the labels
recognize_model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
recognize_model.config.pad_token_id = processor.tokenizer.pad_token_id
# make sure vocab size is set correctly
recognize_model.config.vocab_size = recognize_model.config.decoder.vocab_size

# set beam search parameters
recognize_model.config.eos_token_id = processor.tokenizer.sep_token_id
recognize_model.config.max_length = 64
recognize_model.config.early_stopping = True
recognize_model.config.no_repeat_ngram_size = 3
recognize_model.config.length_penalty = 2.0
recognize_model.config.num_beams = 3

In [23]:
# recognize_model

# What if

In [24]:
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-small-handwritten')
# processor.image_processor.size = {'height': 32, 'width': 224}

recognize_model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-small-handwritten')

recognize_model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
recognize_model.config.pad_token_id = processor.tokenizer.pad_token_id
# make sure vocab size is set correctly
recognize_model.config.vocab_size = recognize_model.config.decoder.vocab_size

# set beam search parameters
recognize_model.config.eos_token_id = processor.tokenizer.sep_token_id
recognize_model.config.max_length = 64
recognize_model.config.early_stopping = True
recognize_model.config.no_repeat_ngram_size = 3
recognize_model.config.length_penalty = 1.0
recognize_model.config.num_beams = 3

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-small-handwritten and are newly initialized: ['encoder.pooler.dense.weight', 'encoder.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
processor

TrOCRProcessor:
- image_processor: DeiTImageProcessor {
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": false,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "DeiTImageProcessor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 384,
    "width": 384
  }
}

- tokenizer: XLMRobertaTokenizerFast(name_or_path='microsoft/trocr-small-handwritten', vocab_size=64002, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)

In [26]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
n_classes = 16
iteration = 0

if iteration > 0:
    if device == 'cuda':
        recognize_model.load_state_dict(torch.load(f'models_backups/rand_just_recognize_latest_{iteration - 1}.pt'))
    else:
        recognize_model.load_state_dict(torch.load(f'models_backups/rand_just_recognize_latest_{iteration - 1}.pt', map_location=torch.device('cpu')))

# crop_model.to(device)
# recognize_model.to(device)
recognize_model.to(device)

optimizer = optim.Adam(recognize_model.parameters(), lr=1e-5)

warmup_scheduler = optim.lr_scheduler.LinearLR(optimizer, start_factor=1e-3, total_iters=5)
gamma_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.25)
# cosine_scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 15)
# main_scheduler = optim.lr_scheduler.ChainedScheduler([cosine_scheduler, gamma_scheduler])
# lr_scheduler = optim.lr_scheduler.SequentialLR(optimizer, schedulers=[warmup_scheduler, main_scheduler], milestones=[10])
lr_scheduler = optim.lr_scheduler.SequentialLR(optimizer, schedulers=[warmup_scheduler, gamma_scheduler], milestones=[10])

lengths = [4, 4, 5, 7, 10]

f"Working on {device}"

'Working on cuda'

In [27]:
X, texts = None, None
for data in train_dataloader:
    X, texts = data

outputs = recognize_model.generate(X.to(device), max_length=5)

targets = processor.batch_decode(texts, skip_special_tokens=True)
preds = processor.batch_decode(outputs, skip_special_tokens=True)

print(targets)
print(preds)



['6', 'Ж73']
[".'' '", '.']


In [None]:
for i, length in enumerate(lengths):
    train_data = PassportDataset(500, length, processor)
    test_data = PassportDataset(100, length, processor)
    train_dataloader = DataLoader(train_data, batch_size=16, shuffle=False)
    test_dataloader = DataLoader(test_data, batch_size=16, shuffle=False)

    EPOCHS = 20
    train_crop_loss_storage, valid_crop_loss_storage = [], []
    train_recognize_loss_storage, valid_recognize_loss_storage = [], []

    valid_recognize_loss = evaluate_full(recognize_model, test_dataloader, device)
    valid_lev_dist, valid_cer = count_levenshtein(recognize_model, processor, test_dataloader, device)
    print('Before training')
    print(f'\t Val. recognizing loss: {valid_recognize_loss:.3f}')
    print(f'\t Val. Levenshein distance: {valid_lev_dist:.3f}')
    print(f'\t Val. Cer: {valid_cer:.3f}')

    best_valid_loss = valid_recognize_loss

    for epoch in range(EPOCHS):
        print(f"Epoch #{epoch + 1}")
        start_time = time.monotonic()

        train_recognize_loss = train_full(recognize_model, train_dataloader, optimizer, device)
        train_recognize_loss_storage.append(train_recognize_loss)

        valid_recognize_loss = evaluate_full(recognize_model, test_dataloader, device)
        valid_recognize_loss_storage.append(valid_recognize_loss)

        if valid_recognize_loss < best_valid_loss:
            best_valid_loss = valid_recognize_loss
            torch.save(recognize_model.state_dict(), f'models_backups/rand_just_recognize_best_{iteration}.pt')
        torch.save(recognize_model.state_dict(), f'models_backups/rand_just_recognize_latest_{iteration}.pt')

        valid_vel_dist, valid_cer = count_levenshtein(recognize_model, processor, test_dataloader, device)

        lr_scheduler.step()
        end_time = time.monotonic()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        print(f'Epoch: {i * 20 + epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\t Val. recognize loss: {valid_recognize_loss:.3f}')
        print(f'\t Val. Levenshein distance: {valid_lev_dist:.3f}')
        print(f'\t Val. Cer: {valid_cer:.3f}')

        cleanup()

        lr_scheduler.step()

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[. 'fully.'you'. you'll know you'!. you you you know you.] vs [ЗМ3Ц]
[in] vs [ЭЦШ0]
[. " you-] vs [ВЧХ-]
[not for the work for the workshops that write written that written that the work with the work written that workshops that written workshops that that write workshops that the workshops] vs [5ОГЯ64]
Before training
	 Val. recognizing loss: 20.922
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.956
Epoch #1


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[.] vs [-З0]
[in] vs [.ЕЖП]
[in] vs [.ЕШД]
Epoch: 01 | Epoch Time: 1m 14s
	 Val. recognize loss: 20.678
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.991
Epoch #2


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[white] vs [ЕНМФ]
[100 000] vs [.-8РББЗ]
[100 1] vs [Ж 5]
[100 1 100 1] vs [5ЛН-М]
[] vs [-Я4А0.]
Epoch: 02 | Epoch Time: 1m 8s
	 Val. recognize loss: 7.954
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.995
Epoch #3


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[] vs [Е9ДМ]
[whitewash] vs [М-ХЕ]
[whitewash Kardashians unfaithful  81 82 weirdness Marshawn__ selflessness] vs [2Л1И8]
[reassess($ Syriza Sarfraz __ jackassPRESS weirdness footfall] vs [НЕЕНТЗЗ]
[overpass] vs [ХР ЯЭ]
Epoch: 03 | Epoch Time: 1m 10s
	 Val. recognize loss: 2.713
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.996
Epoch #4


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[Kardashians ] vs [В.]
[] vs [Е80Х]
[Kardashians] vs [Н83ПЛ]
[] vs [КЭ9Т.]
[] vs [Б6.Н]
[] vs [-РРЕ]
Epoch: 04 | Epoch Time: 1m 11s
	 Val. recognize loss: 1.961
	 Val. Levenshein distance: 23.400
	 Val. Cer: 1.000
Epoch #5


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[] vs [ЛБ ]
[MeToo  Kardashians Wahhabi Manziel symbolise Syriza Firefox SaaS] vs [М4Б]
[MeToo Firefox  SiriusXM Kardashians] vs [-Е5 6]
[Shabab ] vs [Х2РЯ]
Epoch: 05 | Epoch Time: 1m 13s
	 Val. recognize loss: 1.530
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.998
Epoch #6




Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[ Shabab Firefox Kardashians Syriza SiriusXMК Manziel] vs [-А ]
[  Kardashians Kardashians friggin] vs [ФОШ]
[ ] vs [ТКШ]
[ 0] vs [МИЛЕ6З]
Epoch: 06 | Epoch Time: 1m 13s
	 Val. recognize loss: 1.310
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.982
Epoch #7


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[  friggin К Wahhabi Kardashians Firefox] vs [3З0КГД]
Epoch: 07 | Epoch Time: 1m 13s
	 Val. recognize loss: 1.204
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.990
Epoch #8


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[   2.2%-6)  KardashiansК] vs [-4ЧС]
[ ] vs [Ч9Д-]
Epoch: 08 | Epoch Time: 1m 15s
	 Val. recognize loss: 1.112
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.989
Epoch #9


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[] vs [3П2Ч]
Epoch: 09 | Epoch Time: 1m 12s
	 Val. recognize loss: 0.973
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.985
Epoch #10


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[ ] vs [АЭЗ3ТШЧ]
[] vs [.3Р49]
[  МКМ К Н] vs [ОИО4]
Epoch: 10 | Epoch Time: 1m 15s
	 Val. recognize loss: 0.910
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.982
Epoch #11


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[  ЯМ МН] vs [НДКЭН]
[М Ц KardashiansМННН Kardashians] vs [ШЗ2СП]
[  М КХ Kardashians] vs [Ш2КДРФ]
[ Ц KardashiansХ SyrizaКР Kardashians] vs [ГО2ДЦ 52]
[  М ЦКНН] vs [ЖФИХ]
[АА] vs [0Л115]
Epoch: 11 | Epoch Time: 1m 13s
	 Val. recognize loss: 0.884
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.973
Epoch #12


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[М ЯХ dodgy  dodgyХ] vs [Ц5ББК]
[ ЯМ МННН] vs [ТПШ]
[ Я. ХК Syriza] vs [ЕСК]
Epoch: 12 | Epoch Time: 1m 13s
	 Val. recognize loss: 0.811
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.967
Epoch #13


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[ ЦА] vs [ДИМГ П]
[А] vs [-4ВФ]
[ А] vs [ЧХВ..ГФ]
Epoch: 13 | Epoch Time: 1m 8s
	 Val. recognize loss: 0.752
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.972
Epoch #14


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[А] vs [Б26Ш]
[ФАА] vs [Ф60И5]
[А] vs [9926]
Epoch: 14 | Epoch Time: 1m 13s
	 Val. recognize loss: 0.704
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.964
Epoch #15


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[Ф] vs [7СРЯК]
[А] vs [9-А0]
Epoch: 15 | Epoch Time: 1m 6s
	 Val. recognize loss: 0.697
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.961
Epoch #16


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[] vs [84]
[А] vs [7Ш2Б]
[П] vs [СЧ8П]
[ПА] vs [46ННЗ]
Epoch: 16 | Epoch Time: 1m 5s
	 Val. recognize loss: 0.705
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.955
Epoch #17


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[А] vs [З81]
[А] vs [ЗРБЭ ]
[ФП] vs [ЯОПАИ]
Epoch: 17 | Epoch Time: 1m 6s
	 Val. recognize loss: 0.693
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.978
Epoch #18


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[А] vs [ОЛГР]
[Ф] vs [ХШБ1ЧЖ]
[А] vs [7В ]
[ФП] vs [5ШЧО]
[А] vs [6Ж]
[А] vs [ХАЖ5В]
[ПА] vs [5Ш4Ф1К]
Epoch: 18 | Epoch Time: 1m 6s
	 Val. recognize loss: 0.705
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.973
Epoch #19


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[Ф] vs [ЦЕО]
[А] vs [ЭФ8В]
[АА] vs [13Г]
[А] vs [ЛЭЗХ64]
[А] vs [М2 ]
[Ф] vs [ЧХТК]
Epoch: 19 | Epoch Time: 1m 6s
	 Val. recognize loss: 0.665
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.973
Epoch #20


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[АА] vs [ОАКО]
[ФЯ] vs [МШК.Ф]
[АА] vs [БЭДБВ]
[ФА] vs [К9ХБДС]
[А] vs [3ЗП-]
[Ф] vs [41Ш ]
[А] vs [85Н]
[НН] vs [Ф7Е]
Epoch: 20 | Epoch Time: 1m 5s
	 Val. recognize loss: 0.683
	 Val. Levenshein distance: 23.400
	 Val. Cer: 0.973


Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[Ф] vs [9ИК]
[ПА] vs [0НЦБ0]
[РА] vs [36Е]
Before training
	 Val. recognizing loss: 0.668
	 Val. Levenshein distance: 4.140
	 Val. Cer: 0.977
Epoch #1


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[3Ц] vs [9133]
[Н] vs [О1Ш8Ц]
[Ч] vs [КЦ4]
[А] vs [Ч2БЗ7]
[А] vs [ЭТ]
Epoch: 21 | Epoch Time: 1m 3s
	 Val. recognize loss: 0.670
	 Val. Levenshein distance: 4.140
	 Val. Cer: 0.955
Epoch #2


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[А] vs [8П8Т]
[А55] vs [ЦФЗИО5]
[Я] vs [Ж43Ц]
[] vs [ПЧ9]
[Ж] vs [Л8-М]
Epoch: 22 | Epoch Time: 1m 6s
	 Val. recognize loss: 0.643
	 Val. Levenshein distance: 4.140
	 Val. Cer: 0.948
Epoch #3


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[ФП] vs [ЯХЗЧОС]
[5] vs [5-Э4]
[Ф] vs [ФФВ]
[55] vs [1З5]
Epoch: 23 | Epoch Time: 1m 5s
	 Val. recognize loss: 0.673
	 Val. Levenshein distance: 4.140
	 Val. Cer: 0.953
Epoch #4


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[НЯ] vs [Ж8З]
[АА] vs [27ВКЭ]
Epoch: 24 | Epoch Time: 1m 5s
	 Val. recognize loss: 0.664
	 Val. Levenshein distance: 4.140
	 Val. Cer: 0.963
Epoch #5


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[Н] vs [7ПП]
[Ф] vs [5М]
[Ф] vs [И.НМ]
[ФП] vs [Н0Я0Ц]
[Ф] vs [МКШ4]
[Г] vs [8ДГЦ]
[8] vs [Х766З8]
[А] vs [Я8К-1.]
Epoch: 25 | Epoch Time: 1m 4s
	 Val. recognize loss: 0.670
	 Val. Levenshein distance: 4.140
	 Val. Cer: 0.965
Epoch #6


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[ФП] vs [ЛЗ6КЕ]
[Ф] vs [НЯ56]
[ПА] vs [.ВВ18]
[А] vs [ФЦ6К.]
[А] vs [9М-Л]
Epoch: 26 | Epoch Time: 1m 5s
	 Val. recognize loss: 0.673
	 Val. Levenshein distance: 4.140
	 Val. Cer: 0.947
Epoch #7


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[Ф] vs [Г6ШЧ]
[А] vs [СТХБ]
[А] vs [ИШ56ХП]
[ФФ] vs [5ГРВТ]
[] vs [ГАЕ]
Epoch: 27 | Epoch Time: 1m 5s
	 Val. recognize loss: 0.630
	 Val. Levenshein distance: 4.140
	 Val. Cer: 0.967
Epoch #8


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

Evaluating levenshtein:   0%|          | 0/7 [00:00<?, ?it/s]

[Ч] vs [Г.]
[ФЯ] vs [ГХШ Р]
[АА] vs [ПАГТА]
[Ф] vs [ЭГЭ49В]
Epoch: 28 | Epoch Time: 1m 6s
	 Val. recognize loss: 0.664
	 Val. Levenshein distance: 4.140
	 Val. Cer: 0.950
Epoch #9


Training:   0%|          | 0/32 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/7 [00:00<?, ?it/s]

# THE END

In [None]:
def reqognize(crop):
    pixel_values = processor(crop, return_tensors="pt").pixel_values.to(recognize_model.device)
    generated_ids = recognize_model.generate(pixel_values)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    #cer = cer_metric.compute(predictions=pred_str, references=label_str)
    cer = pylev.distance(pred_str, label_str)

    return {"cer": cer}

In [None]:
batch_size = 32  # сколько примеров показывем модели за один шаг
report_steps = 200  # раз в сколько шагов печатаем результат
epochs = 200  # сколько раз мы покажем данные модели
early_stop_patience = -1
lr = 5e-05 / 10
lr_decay = 0.95  # 0.8
device = 'cuda'

In [None]:
pip install --upgrade accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.19.0


In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True, 
    output_dir="./",
    logging_steps=2,
    save_steps=10000,
    eval_steps=200,
    learning_rate=lr,
    num_train_epochs=epochs,
    max_steps=-1,
)

NameError: ignored

In [None]:
cleanup()
set_random_seed()

recognize_model.to(device)

In [None]:
from transformers import default_data_collator

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=recognize_model,
    tokenizer=processor.feature_extractor,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=default_data_collator,
)