In [2]:
import os
from glob import glob

def load_images(root_dir, max_per_folder=50):
    """Collect file paths and labels from a directory of per-label folders.

    Every immediate sub-directory of ``root_dir`` is treated as one label
    group. The label is the folder name with each ``_`` replaced by ``/``.
    Entries of ``root_dir`` that are not directories are skipped.

    Args:
        root_dir: Directory whose sub-directories contain the files.
        max_per_folder: Maximum number of files taken from each folder.
            ``None`` disables the limit. Defaults to 50.

    Returns:
        A list of ``{"img_path": ..., "label": ...}`` dicts, ordered by
        folder name and then by file path.
    """
    image_list = []

    for folder_name in sorted(os.listdir(root_dir)):
        folder_path = os.path.join(root_dir, folder_name)
        if not os.path.isdir(folder_path):
            continue

        label = folder_name.replace('_', '/')

        # glob() yields paths in arbitrary (filesystem) order; sort before
        # truncating so the same subset is selected on every run.
        image_paths = sorted(glob(os.path.join(folder_path, '*.*')))
        if max_per_folder is not None:
            image_paths = image_paths[:max_per_folder]

        image_list.extend(
            {"img_path": img_path, "label": label} for img_path in image_paths
        )

    return image_list


# Build the dataset path from its components: "\o" inside a normal string
# literal is an invalid escape sequence (Python emits a SyntaxWarning for it),
# and a hard-coded backslash only works as a separator on Windows.
root_dir = os.path.join("dataset", "output_frames")
image_label_list = load_images(root_dir)

print(len(image_label_list))

3250


  root_dir = "dataset\output_frames"


In [16]:
import torch
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# 1. Prepare the model and processor
trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
# Run on the GPU when CUDA is available, otherwise stay on the CPU.
trocr_model = trocr_model.to('cuda' if torch.cuda.is_available() else 'cpu')

def predict_trocr(image_path):
    """Run TrOCR on one image file and return the decoded text.

    The image is opened with PIL, converted to RGB, preprocessed with
    ``trocr_processor``, moved to the device ``trocr_model`` lives on, and
    decoded with special tokens skipped. The first decoded string is
    returned with surrounding whitespace stripped.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    encoded = trocr_processor(images=rgb_image, return_tensors="pt")
    model_input = encoded.pixel_values.to(trocr_model.device)
    token_ids = trocr_model.generate(model_input)
    decoded_texts = trocr_processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded_texts[0].strip()


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_mod

In [None]:
# NOTE(review): environment-maintenance command, not part of the OCR evaluation.
# Without -y/--yes, `pip uninstall` asks for confirmation before removing the
# package. The torch AttributeError recorded further down is presumably the
# result of a partially removed install -- confirm before re-running this cell.
!pip uninstall torch

In [None]:
# NOTE(review): prints the constant 1; presumably a quick check that the
# kernel responds -- confirm whether this cell is still needed.
print(1)

In [None]:
import torch
# Show the installed torch version and its `backends` submodule, one per line.
print(torch.__version__, torch.backends, sep="\n")


AttributeError: module 'torch' has no attribute '__version__'

In [1]:
import easyocr
import torch  # imported here so this cell does not rely on another cell having imported torch

# English-only EasyOCR reader; the GPU is used only when CUDA is available.
easyocr_reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())


def predict_easyocr(image_path):
    """Run EasyOCR on one image file and return the first recognised text.

    ``readtext`` is called with ``detail=0``. Only the first entry of its
    result is used, stripped of surrounding whitespace; an empty string is
    returned when the result is empty.
    """
    detected_texts = easyocr_reader.readtext(image_path, detail=0)
    return detected_texts[0].strip() if detected_texts else ''

ModuleNotFoundError: No module named 'torch.backends'

In [12]:
import numpy as np
import editdistance

def cer(s1, s2):
    """Character error rate of ``s1`` measured against ``s2``.

    Computes the character-level Levenshtein distance between the two
    strings and divides it by ``len(s2)``; the divisor is floored at 1 so an
    empty ``s2`` does not cause a division by zero.
    """
    char_distance = editdistance.eval(s1, s2)
    denominator = max(len(s2), 1)
    return char_distance / denominator

def wer(ref, hyp):
    """Word error rate of ``hyp`` measured against the reference ``ref``.

    Both strings are split on whitespace. The word-level Levenshtein
    distance is divided by the number of words in ``ref`` (the FIRST
    argument); the divisor is floored at 1 so an empty reference does not
    cause a division by zero.

    Args:
        ref: Reference (ground-truth) text.
        hyp: Hypothesis (predicted) text.

    Returns:
        The word-level edit distance divided by the reference word count.
    """
    ref_words = ref.split()
    hyp_words = hyp.split()
    # Relies on the module-level `import editdistance`; the function-scope
    # re-import was redundant.
    return editdistance.eval(hyp_words, ref_words) / max(len(ref_words), 1)

In [28]:
# Evaluate TrOCR on every (image, label) pair and report the mean CER / WER.
all_results = []

for item in image_label_list:
    img_path = item["img_path"]
    gt_label = item["label"]

    # Inference
    pred_trocr = predict_trocr(img_path)

    # Evaluation
    # cer(s1, s2) divides by len(s2), so the ground truth goes second.
    cer_t = cer(pred_trocr, gt_label)
    # wer(ref, hyp) divides by the word count of its FIRST argument, so the
    # ground truth must go first; passing the prediction first normalised
    # the WER by the prediction's length instead of the reference's.
    wer_t = wer(gt_label, pred_trocr)

    all_results.append({
        "img": img_path,
        "gt": gt_label,
        "trocr_pred": pred_trocr,
        "cer_trocr": cer_t,
        "wer_trocr": wer_t,
    })


# Mean CER / WER over all images
trocr_cer = np.mean([r['cer_trocr'] for r in all_results])
trocr_wer = np.mean([r['wer_trocr'] for r in all_results])

print(f"TROCR CER: {trocr_cer:.4f} / WER: {trocr_wer:.4f}")

TROCR CER: 0.0000 / WER: 0.0000


In [None]:
# Evaluate EasyOCR on every (image, label) pair and report the mean CER / WER.
# NOTE: this rebinds `all_results`, so per-image results stored under that
# name by an earlier evaluation are replaced.
all_results = []

for item in image_label_list:
    img_path = item["img_path"]
    gt_label = item["label"]

    # Inference
    pred_easyocr = predict_easyocr(img_path)

    # Evaluation
    # cer(s1, s2) divides by len(s2), so the ground truth goes second.
    cer_e = cer(pred_easyocr, gt_label)
    # wer(ref, hyp) divides by the word count of its FIRST argument, so the
    # ground truth must go first; passing the prediction first normalised
    # the WER by the prediction's length instead of the reference's.
    wer_e = wer(gt_label, pred_easyocr)

    all_results.append({
        "img": img_path,
        "gt": gt_label,
        "easyocr_pred": pred_easyocr,
        "cer_easyocr": cer_e,
        "wer_easyocr": wer_e
    })

# Mean CER / WER over all images
easyocr_cer = np.mean([r['cer_easyocr'] for r in all_results])
easyocr_wer = np.mean([r['wer_easyocr'] for r in all_results])

print(f"EASYOCR CER: {easyocr_cer:.4f} / WER: {easyocr_wer:.4f}")