In [2]:
import pytesseract
from PIL import Image
from nltk.translate.bleu_score import sentence_bleu
from jiwer import wer
from transformers import MarianMTModel, MarianTokenizer
import string

# Function to calculate Word Error Rate (WER)
def calculate_wer(reference, hypothesis):
    return wer(reference, hypothesis)

# Function to calculate BLEU Score
def calculate_bleu(reference, hypothesis):
    reference = [reference.split()]  # BLEU expects a list of tokenized reference sentences
    hypothesis = hypothesis.split()  # Tokenize the hypothesis
    return sentence_bleu(reference, hypothesis)

# Function to normalize text (lowercase, remove spaces, and remove punctuation)
def normalize_text(text):
    """
    Normalize the text by converting to lowercase, removing punctuation,
    and collapsing extra spaces.
    """
    # Convert to lowercase
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove extra spaces (normalize spaces to a single space)
    text = ' '.join(text.split())

    return text

# OCR extraction function
def ocr_extraction(image_path):
    """
    Perform OCR on the input image and return the extracted text.
    """
    img = Image.open(image_path)
    text = pytesseract.image_to_string(img)
    return text

# Load translation model and tokenizer (English to French)
def load_translation_model():
    model_name = "Helsinki-NLP/opus-mt-en-fr"
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    return model, tokenizer

# Function to translate text using the MarianMT model
def translate_text(model, tokenizer, text):
    """
    Translate text from English to French using the MarianMT model.
    """
    # Prepare text for translation
    translated = model.generate(**tokenizer.prepare_seq2seq_batch([text], return_tensors="pt"))
    translation = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translation

# OCR evaluation (WER only)
def evaluate_ocr(ocr_output, ground_truth_ocr):
    """
    Evaluate OCR output using WER (Word Error Rate).
    """
    ocr_output = normalize_text(ocr_output)
    ground_truth_ocr = normalize_text(ground_truth_ocr)
    wer_score = calculate_wer(ground_truth_ocr, ocr_output)
    return wer_score

# Translation evaluation (BLEU only)
def evaluate_translation(ocr_output, ground_truth_translation, model, tokenizer):
    """
    Evaluate translation output using BLEU score.
    """
    # Normalize the OCR output and translation ground truth
    normalized_ocr = normalize_text(ocr_output)
    normalized_translation = normalize_text(ground_truth_translation)

    # Translate the OCR text
    translated_text = translate_text(model, tokenizer, normalized_ocr)
    print(f"Translated Text: {translated_text}")

    # Tokenize and calculate BLEU score
    translated_text = ' '.join(translated_text.split())  # Normalize spacing
    ground_truth_translation = ' '.join(normalized_translation.split())  # Normalize spacing
    bleu_score = calculate_bleu(ground_truth_translation, translated_text)
    return bleu_score


def process_image_for_evaluation(image_path, ground_truth_ocr, ground_truth_translation):
    # Extract text using OCR
    ocr_text = ocr_extraction(image_path)
    print(f"Extracted OCR Text: {ocr_text}")

    # Evaluate OCR accuracy using WER (Word Error Rate)
    wer_score = evaluate_ocr(ocr_text, ground_truth_ocr)
    print(f"OCR WER: {wer_score:.4f}")

    # Load translation model
    model, tokenizer = load_translation_model()

    # Evaluate translation accuracy using BLEU score
    bleu_score = evaluate_translation(ocr_text, ground_truth_translation, model, tokenizer)
    print(f"Translation BLEU: {bleu_score:.4f}")


image_path = "ocr.png"  # Replace with your  image path
ground_truth_ocr ="This text is easy to extract." #add groundtruth text
ground_truth_french ="Ce texte est facile à extraire." #add groundtruth french translation
process_image_for_evaluation(image_path, ground_truth_ocr, ground_truth_french)


Extracted OCR Text: This text is
easy to extract.

OCR WER: 0.0000


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Translated Text: ce texte est facile à extraire
Translation BLEU: 1.0000
