In [2]:
import os
import torch
import logging
import numpy as np
from zipfile import ZipFile
from BudaOCR.Config import N_CHARSET
from huggingface_hub import snapshot_download
from BudaOCR.Modules import CRNNNetwork, OCRTrainer, WylieEncoder, StackEncoder
from BudaOCR.Utils import shuffle_data, create_dir, build_data_paths, build_distribution_from_file, read_stack_file
# Raise the root logger to INFO so that informational log records emitted
# through the standard logging module are not filtered out.
root_logger = logging.getLogger()
root_logger.setLevel(logging.INFO)

# Ask PyTorch to release cached, unoccupied CUDA memory before any model is
# built, then record which torch build this notebook ran against.
torch.cuda.empty_cache()
print(torch.__version__)

2.3.1+cu118


In [None]:
# Build the two label encoders used by the training cells:
#   * wylie_encoder - constructed from the N_CHARSET character set
#   * stack_encoder - constructed from the entries read out of the stack file
charset = N_CHARSET
wylie_encoder = WylieEncoder(charset)

# Plain string literal (the original f-string had no placeholders).
# The path is relative, so it is resolved against the current working directory.
stack_file = "tib-stacks.txt"
stacks = read_stack_file(stack_file)
stack_encoder = StackEncoder(stacks)

# Print the class count of each encoder; the network's num_classes must match
# the count of whichever encoder is handed to the trainer.
print(stack_encoder.num_classes())
print(wylie_encoder.num_classes())

In [None]:
# Fetch a snapshot of the BDRC/Karmapa8 dataset repository from the Hugging
# Face Hub, using the local "Datasets" directory as the cache, and keep the
# path of the downloaded snapshot.
data_path = snapshot_download(repo_id="BDRC/Karmapa8", repo_type="dataset",  cache_dir="Datasets")
dataset_path = f"{data_path}/Dataset"

# Unpack the bundled archive into <snapshot>/Dataset.
# The handle is deliberately NOT called `zip`: in a notebook the name would
# stay bound at global scope and shadow the built-in zip() in every later cell.
with ZipFile(f"{data_path}/data.zip", 'r') as archive:
    archive.extractall(dataset_path)

# Collect the image/label file lists and shuffle them together.
image_paths, label_paths = build_data_paths(dataset_path)
image_paths, label_paths = shuffle_data(image_paths, label_paths)

print(f"Images: {len(image_paths)}, Labels: {len(label_paths)}")

# NOTE(review): the trainer-setup cell reassigns output_dir to
# <dataset_path>/Output before it is used, so this working-directory "Output"
# folder looks unused - confirm whether it can be dropped.
output_dir = os.path.join("Output")
create_dir(output_dir)

In [None]:
# Checkpoints and other training artefacts are written below the dataset folder.
output_dir = os.path.join(dataset_path, "Output")
create_dir(output_dir)

# Input geometry handed to both the network and the trainer.
image_width = 3200
image_height = 80

# A single encoder drives both the size of the network's output layer and the
# label encoding used by the trainer, so the two can never disagree.
encoder = stack_encoder
num_classes = encoder.num_classes()

network = CRNNNetwork(image_width=image_width, image_height=image_height, num_classes=num_classes)
workers = 4

ocr_trainer = OCRTrainer(
    network=network,
    # Fix: this was hard-wired to wylie_encoder although num_classes above comes
    # from stack_encoder. Labels must be encoded with the same encoder that
    # sized the network (the fixed-distribution cell already does this).
    label_encoder=encoder,
    workers=workers,
    image_width=image_width,
    image_height=image_height,
    batch_size=32,
    output_dir=output_dir,
    preload_labels=True
    )

# Hand the shuffled image/label path lists to the trainer.
ocr_trainer.init(image_paths, label_paths)

In [None]:
# Run training with the trainer configured in the previous cell.
# NOTE(review): scheduler_start=32 presumably is the epoch at which the
# learning-rate scheduler kicks in, and check_cer presumably enables CER
# reporting during training - confirm against OCRTrainer.train.
ocr_trainer.train(
    epochs=40,
    scheduler_start=32,
    check_cer=True,
)

In [None]:
# Evaluate the trained model and summarise the returned CER scores.
cer_scores = ocr_trainer.evaluate()

# Each statistic is computed and printed in turn (mean, then max, then min).
mean_cer = np.mean(cer_scores)
print(f"Mean CER: {mean_cer}")

max_cer = np.max(cer_scores)
print(f"Max CER: {max_cer}")

min_cer = np.min(cer_scores)
print(f"Min CER: {min_cer}")

#### Train from a fixed distribution

In [None]:
# ---- Paths -----------------------------------------------------------------
# NOTE(review): absolute, machine-specific location - adjust when running on
# another machine.
dataset_path = "E:/Datasets/OCR/DbuMed/NEW/Drutsa-Complete/batch27"
latest_chkpt_dir = f"{dataset_path}/Output/2024_9_24_15_19"
distr_file = f"{latest_chkpt_dir}/data.distribution"
output_dir = os.path.join(dataset_path, "Output")

# ---- Settings --------------------------------------------------------------
image_width = 3200
image_height = 100
workers = 4

# ---- Data distribution and output folder -----------------------------------
# Rebuild the data distribution from the file stored with the earlier run.
distribution = build_distribution_from_file(distr_file, dataset_path)
create_dir(output_dir)

# ---- Network ---------------------------------------------------------------
# The Wylie encoder sizes the network output and encodes the labels.
encoder = wylie_encoder
num_classes = encoder.num_classes()

network = CRNNNetwork(
    image_width=image_width,
    image_height=image_height,
    num_classes=num_classes,
)

# Optional warm start from an existing checkpoint (currently disabled):
#chkpt_path = "E:/Datasets/OCR/DbuMed/NEW/Drutsa-Batch31/batch31/Output/2024_9_23_21_49/OCRModel.pth"
#network.load_checkpoint(chkpt_path)

# ---- Trainer ---------------------------------------------------------------
ocr_trainer = OCRTrainer(
    network=network,
    label_encoder=encoder,
    image_width=image_width,
    image_height=image_height,
    batch_size=32,
    workers=workers,
    output_dir=output_dir,
    preload_labels=True,
)

# Initialise the trainer from the distribution loaded above.
ocr_trainer.init_from_distribution(distribution)

In [None]:
# Train on the fixed distribution.
# NOTE(review): export_onnx=True presumably also writes an ONNX export of the
# model - confirm against OCRTrainer.train.
ocr_trainer.train(
    epochs=40,
    check_cer=True,
    export_onnx=True,
)

#### Evaluate on the test set

In [8]:
# Evaluate the model trained on the fixed distribution and summarise the
# returned CER scores.
cer_scores = ocr_trainer.evaluate()

# Each statistic is computed and printed in turn (mean, then max, then min).
mean_cer = np.mean(cer_scores)
print(f"Mean CER: {mean_cer}")

max_cer = np.max(cer_scores)
print(f"Max CER: {max_cer}")

min_cer = np.min(cer_scores)
print(f"Min CER: {min_cer}")