In [8]:
import os
import numpy as np
from PIL import Image, ImageOps
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

processor = TrOCRProcessor.from_pretrained('microsoft/trocr-large-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-large-handwritten')

def image_to_text(image):
    pixel_values = processor(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return generated_text

def find_ink_bounding_box(image, threshold=200):
    gray_image = ImageOps.grayscale(image)
    binary_image = gray_image.point(lambda p: p < threshold and 255)  # Black for ink, white for background

    ink_pixels = np.where(np.array(binary_image) == 0)
    min_x, max_x = np.min(ink_pixels[1]), np.max(ink_pixels[1])
    min_y, max_y = np.min(ink_pixels[0]), np.max(ink_pixels[0])

    return (min_x, min_y, max_x, max_y)

data_folder = "data"
allowed_extensions = ['.jpg', '.jpeg', '.png']

for file in os.listdir(data_folder):
    file_path = os.path.join(data_folder, file)
    file_name, file_extension = os.path.splitext(file)

    if file_extension.lower() in allowed_extensions:
        image = Image.open(file_path).convert("RGB")
        ink_bounding_box = find_ink_bounding_box(image)
        cropped_image = image.crop(ink_bounding_box)

        text = image_to_text(cropped_image)
        print(f"Digital text for {file}: {text}")

        output_file = f"{file_name}_output.txt"
        with open(output_file, "w") as f:
            f.write(text)
        print(f"Text saved to {output_file}\n")


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-large-handwritten and are newly initialized: ['encoder.pooler.dense.weight', 'encoder.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Digital text for testphoto2.jpeg: 1961 62
Text saved to testphoto2_output.txt



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Digital text for testphoto4.jpeg: What is your name?
Text saved to testphoto4_output.txt



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Digital text for testphoto.jpeg: 1961 62
Text saved to testphoto_output.txt



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Digital text for testphoto3.jpeg: 0 0
Text saved to testphoto3_output.txt

Digital text for testphoto5.jpeg: that is your name?
Text saved to testphoto5_output.txt

