# OCR Model Comparison: Base vs Finetuned

This notebook evaluates and compares the performance of the base **PaddleOCR-VL** model against the **Tachiwin-BF16** finetuned version on a subset of the `tachiwin/multilingual_ocr_llm` dataset.

In [None]:
!pip install -q transformers torch Pillow datasets tqdm

In [None]:
import json
import torch
from PIL import Image
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoProcessor
from tqdm.auto import tqdm
import gc

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

## 1. Metric Calculations
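We use the Levenshtein (edit) distance to calculate the Character Error Rate (CER) and Word Error Rate (WER). Each metric is the edit distance divided by the length of the reference (characters for CER, whitespace-separated words for WER), so a value can exceed 100% when the hypothesis is much longer than the reference.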

In [None]:
def levenshtein_distance(s1, s2):
    """Return the edit distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions that turn one sequence into the other.
    Any two sized, iterable sequences whose elements can be compared with
    ``!=`` are accepted: strings give a character-level distance, lists of
    words give a word-level distance.

    Args:
        s1: First sequence.
        s2: Second sequence.

    Returns:
        int: The edit distance (symmetric in its arguments).
    """
    # Put the shorter sequence on the inner axis so each DP row has
    # len(shorter) + 1 entries.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1

    if len(shorter) == 0:
        return len(longer)

    prev = list(range(len(shorter) + 1))
    for row, a in enumerate(longer, start=1):
        curr = [row]
        for col, b in enumerate(shorter, start=1):
            cost_up = prev[col] + 1          # consume `a` only
            cost_left = curr[col - 1] + 1    # consume `b` only
            cost_diag = prev[col - 1] + (1 if a != b else 0)  # match / substitute
            curr.append(min(cost_up, cost_left, cost_diag))
        prev = curr
    return prev[-1]

def calculate_cer(reference, hypothesis):
    """Character Error Rate of ``hypothesis`` measured against ``reference``.

    Args:
        reference: Ground-truth string.
        hypothesis: Predicted string.

    Returns:
        float: Character-level edit distance divided by ``len(reference)``.
        When the reference is empty the result is 0.0 for an empty
        hypothesis and 1.0 otherwise.
    """
    if reference:
        edits = levenshtein_distance(reference, hypothesis)
        return edits / len(reference)
    # Nothing to normalise by: any predicted text counts as a full error.
    return 0.0 if not hypothesis else 1.0

def calculate_wer(reference, hypothesis):
    """Word Error Rate of ``hypothesis`` measured against ``reference``.

    Both strings are tokenised with ``str.split()`` (runs of whitespace are
    the separators) before the edit distance is taken over the word lists.

    Args:
        reference: Ground-truth string.
        hypothesis: Predicted string.

    Returns:
        float: Word-level edit distance divided by the number of reference
        words. When the reference has no words the result is 0.0 if the
        hypothesis has none either and 1.0 otherwise.
    """
    ref_tokens, hyp_tokens = reference.split(), hypothesis.split()
    if ref_tokens:
        edits = levenshtein_distance(ref_tokens, hyp_tokens)
        return edits / len(ref_tokens)
    # Nothing to normalise by: any predicted word counts as a full error.
    return 0.0 if not hyp_tokens else 1.0

## 2. Dataset Loading
Loading the first 2000 samples from the test split of `tachiwin/multilingual_ocr_llm`.

In [None]:
# Number of test samples to evaluate; lower this for a quick smoke run.
N_SAMPLES = 2000

print("Loading dataset...")
# streaming=True iterates over the split lazily instead of materialising it first.
ds = load_dataset("tachiwin/multilingual_ocr_llm", split="test", streaming=True)

# `take` ends the stream after exactly N_SAMPLES items, so we never pull an
# extra sample just to find out we should stop, and the progress bar ends on
# N_SAMPLES rather than overshooting.
subset = list(tqdm(ds.take(N_SAMPLES), total=N_SAMPLES, desc="Fetching samples"))

print(f"Loaded {len(subset)} samples.")

## 3. Inference Logic

In [None]:
def run_inference(model, processor, image):
    """Run one OCR generation pass and return only the model's transcription.

    Args:
        model: Model exposing ``generate``; expected to already live on
            ``DEVICE``.
        processor: Matching processor exposing ``apply_chat_template`` and
            ``batch_decode``.
        image: Image object with a ``convert`` method (a PIL image in this
            notebook); it is converted to RGB before being processed.

    Returns:
        str: Decoded text of the newly generated tokens for the single input,
        with special tokens removed and without the echoed chat prompt.
    """
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image.convert("RGB")},
                {"type": "text", "text": "OCR:"},
            ],
        }
    ]
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(DEVICE)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=1024)

    # For decoder-only models `generate` returns prompt + continuation.
    # Decoding the whole sequence would prepend the chat prompt to every
    # hypothesis and inflate CER/WER, so keep only the tokens produced after
    # the prompt.
    prompt_len = inputs["input_ids"].shape[1]
    generated = outputs[:, prompt_len:]

    return processor.batch_decode(generated, skip_special_tokens=True)[0]

## 4. Base Model Inference
Model: `PaddlePaddle/PaddleOCR-VL`

In [None]:
model_id_base = "PaddlePaddle/PaddleOCR-VL"
print(f"Loading base model: {model_id_base}")

# Load the weights in bfloat16, then move them to DEVICE and switch to eval mode.
model = AutoModelForCausalLM.from_pretrained(
    model_id_base,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model = model.to(DEVICE).eval()
processor = AutoProcessor.from_pretrained(model_id_base, trust_remote_code=True)

# One prediction per entry of `subset`, in the same order.
base_results = []
for i, sample in enumerate(tqdm(subset, total=len(subset), desc="Base Model Inference")):
    image = sample["image"]
    output = run_inference(model, processor, image)
    base_results.append(output)
    # Show a short preview every 100 samples as a sanity check.
    if i % 100 == 0:
        print(f"Sample {i} | Output: {output[:100]}...")

# Drop the model and processor references and release cached GPU memory
# before the next model is loaded.
del model, processor
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

## 5. Finetuned Model Inference
Model: `tachiwin/PaddleOCR-VL-Tachiwin-BF16`

In [None]:
model_id_ft = "tachiwin/PaddleOCR-VL-Tachiwin-BF16"
print(f"Loading finetuned model: {model_id_ft}")

# Load the weights in bfloat16, then move them to DEVICE and switch to eval mode.
model = AutoModelForCausalLM.from_pretrained(
    model_id_ft,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
)
model = model.to(DEVICE).eval()
processor = AutoProcessor.from_pretrained(model_id_ft, trust_remote_code=True)

# One prediction per entry of `subset`, in the same order.
ft_results = []
for i, sample in enumerate(tqdm(subset, total=len(subset), desc="Finetuned Model Inference")):
    image = sample["image"]
    output = run_inference(model, processor, image)
    ft_results.append(output)
    # Show a short preview every 100 samples as a sanity check.
    if i % 100 == 0:
        print(f"Sample {i} | Output: {output[:100]}...")

# Drop the model and processor references and release cached GPU memory.
del model, processor
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

## 6. Evaluation and Results Comparison

We compare the results and print metrics similar to `ocr_evaluator.py`.

In [None]:
count = len(subset)

# Fail early with a clear message instead of hitting a ZeroDivisionError when
# the averages are computed.
if count == 0:
    raise ValueError("No samples to evaluate: `subset` is empty.")

# Predictions are matched to samples by position, so all three lists must have
# the same length. A mismatch usually means a cell was re-run out of order.
if not (len(base_results) == len(ft_results) == count):
    raise ValueError(
        f"Result/sample length mismatch: subset={count}, "
        f"base_results={len(base_results)}, ft_results={len(ft_results)}. "
        "Re-run both inference cells on the current subset."
    )

comparison_data = []
total_raw_cer = 0
total_ft_cer = 0
total_raw_wer = 0
total_ft_wer = 0

# For large runs, print roughly 20 evenly spaced rows plus the last one.
print_stride = max(count // 20, 1)

print(f"{'ID':<15} | {'Raw CER':<9} | {'FT CER':<9} | {'Raw WER':<9} | {'FT WER':<9} | {'Improvement'}")
print("-" * 85)

for i, sample in enumerate(subset):
    gt = sample["text"]
    raw = base_results[i]
    ft = ft_results[i]
    lang = sample.get("language", "unk")

    raw_cer = calculate_cer(gt, raw)
    ft_cer = calculate_cer(gt, ft)
    raw_wer = calculate_wer(gt, raw)
    ft_wer = calculate_wer(gt, ft)

    # Raw texts are stored so the exported file can be re-scored later.
    comparison_data.append({
        "id": sample["id"],
        "language": lang,
        "ground_truth": gt,
        "raw": raw,
        "finetuned": ft
    })

    total_raw_cer += raw_cer
    total_ft_cer += ft_cer
    total_raw_wer += raw_wer
    total_ft_wer += ft_wer

    # Positive improvement means the finetuned model has the lower CER.
    improvement = raw_cer - ft_cer
    # Print sampled results to avoid cluttering the notebook
    if count <= 50 or i % print_stride == 0 or i == count - 1:
        print(f"{sample['id']:<15} | {raw_cer:>8.2%} | {ft_cer:>8.2%} | {raw_wer:>8.2%} | {ft_wer:>8.2%} | {improvement:>+10.2%}")

# Macro averages: every sample contributes equally regardless of its length.
avg_raw_cer = total_raw_cer / count
avg_ft_cer = total_ft_cer / count
avg_raw_wer = total_raw_wer / count
avg_ft_wer = total_ft_wer / count

print("-" * 85)
print(f"{'AVERAGE':<15} | {avg_raw_cer:>8.2%} | {avg_ft_cer:>8.2%} | {avg_raw_wer:>8.2%} | {avg_ft_wer:>8.2%} | {avg_raw_cer - avg_ft_cer:>+10.2%}")

# Relative reduction of the average CER (share of the base model's error that
# the finetuned model removes); defined as 0 when the base CER is already 0.
relative_cer_reduction = (avg_raw_cer - avg_ft_cer) / avg_raw_cer if avg_raw_cer > 0 else 0

print("\n--- Summary ---")
print(f"Overall Raw Accuracy (1-CER): {1 - avg_raw_cer:.2%}")
print(f"Overall Finetuned Accuracy (1-CER): {1 - avg_ft_cer:.2%}")
print(f"Standard Error Reduction: {relative_cer_reduction:.2%}")

## 7. Saving and Exporting Results

In [None]:
output_file = "comparison_results.json"

# Write UTF-8 JSON; ensure_ascii=False keeps non-ASCII text readable in the file.
with open(output_file, "w", encoding="utf-8") as out_fh:
    json.dump(comparison_data, out_fh, ensure_ascii=False, indent=2)

print(f"Results saved to {output_file}")

# In Colab, trigger a browser download of the results file; in other
# environments the import fails and the user is told to fetch the file manually.
try:
    from google.colab import files
    files.download(output_file)
except ImportError:
    print("Manual download required if not in Colab.")