<a href="https://colab.research.google.com/github/moksha-hub/RenAIssance_OCR/blob/main/RenAIssance_by_mokshagna.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!sudo apt-get -y install tree


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tree is already the newest version (2.0.2-1).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.


In [3]:

import shutil

dataset_dir = "/content/dataset"

# Recursively delete the dataset folder
shutil.rmtree(dataset_dir, ignore_errors=True)

print(f"Removed '{dataset_dir}' and all its contents.")


Removed '/content/dataset' and all its contents.


In [4]:
#############################################
# 0. (Optional) Mount Google Drive
#############################################
# If you have already mounted your Drive, you can comment out these lines:
from google.colab import drive
drive.mount('/content/drive')

#############################################
# 1. Install Dependencies
#############################################
!sudo apt-get -y install poppler-utils
!pip install pdf2image python-docx PyPDF2 opencv-python

#############################################
# 2. Imports and Utility Functions
#############################################
import os
import re
import gc
import cv2
import docx
import numpy as np
from PyPDF2 import PdfReader
from pdf2image import convert_from_path

def read_transcriptions_from_docx(docx_path):
    """
    Reads a .docx file and returns a string containing all paragraphs joined.
    """
    doc = docx.Document(docx_path)
    paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
    return "\n".join(paragraphs)

def convert_pdf_to_images_single_page(pdf_path, output_folder, dpi=150):
    """
    Converts a large PDF to PNG images one page at a time, reducing memory usage.
    Default dpi=150 to further reduce memory usage. Adjust as needed.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder, exist_ok=True)

    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    print(f"[Single-Page Mode] Processing '{os.path.basename(pdf_path)}' with {total_pages} pages...")

    for page_num in range(1, total_pages + 1):
        pages = convert_from_path(
            pdf_path,
            dpi=dpi,
            first_page=page_num,
            last_page=page_num
        )
        page_image = pages[0]
        out_path = os.path.join(output_folder, f"page_{page_num}.png")
        page_image.save(out_path, "PNG")

        # Free memory for this page
        del page_image
        del pages
        gc.collect()

    print(f"Converted '{os.path.basename(pdf_path)}' into {total_pages} images in '{output_folder}'")

def convert_pdf_to_images_all_at_once(pdf_path, output_folder, dpi=300):
    """
    Converts all pages of a PDF in one pass. Higher DPI=300 for good quality.
    Use this for smaller PDFs to speed up processing.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder, exist_ok=True)

    pages = convert_from_path(pdf_path, dpi=dpi)
    for idx, page in enumerate(pages):
        out_path = os.path.join(output_folder, f"page_{idx+1}.png")
        page.save(out_path, "PNG")

    print(f"Converted '{os.path.basename(pdf_path)}' into {len(pages)} images in '{output_folder}'")

def segment_into_lines(page_image, threshold=10):
    """
    Splits a page image into multiple line images using a simple horizontal projection approach.
    Returns a list of cropped line images.
    threshold=10 means a row must have at least 10 black pixels to be considered text.
    Adjust as needed for your documents.
    """
    gray = cv2.cvtColor(page_image, cv2.COLOR_BGR2GRAY)
    # Otsu binarization
    _, bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # Sum horizontally. If text is white on black, do: bw = 255 - bw
    horizontal_sum = np.sum(bw == 0, axis=1)

    in_line = False
    start_idx = 0
    line_indices = []

    for i, val in enumerate(horizontal_sum):
        if val > threshold and not in_line:
            in_line = True
            start_idx = i
        elif val <= threshold and in_line:
            in_line = False
            end_idx = i
            line_indices.append((start_idx, end_idx))

    line_images = []
    for (start, end) in line_indices:
        line_img = page_image[start:end, :]  # entire width
        line_images.append(line_img)
    return line_images

#############################################
# 3. Splitting Logic: PDF -> Page Images -> Line Images
#############################################
def split_page_images_into_lines(page_images_folder):
    """
    For each page image in page_images_folder, segment it into lines and store them in a 'lines/' subfolder.
    e.g. page_1.png -> lines/page_1_line_0.png, page_1_line_1.png, ...
    """
    lines_folder = os.path.join(page_images_folder, "lines")
    os.makedirs(lines_folder, exist_ok=True)

    for file_name in sorted(os.listdir(page_images_folder)):
        if file_name.lower().endswith((".png", ".jpg", ".jpeg")) and not file_name.startswith("line_"):
            page_path = os.path.join(page_images_folder, file_name)
            page_img = cv2.imread(page_path, cv2.IMREAD_COLOR)
            if page_img is None:
                continue

            lines = segment_into_lines(page_img, threshold=10)
            base_name = os.path.splitext(file_name)[0]
            for idx, line_img in enumerate(lines):
                line_path = os.path.join(lines_folder, f"{base_name}_line_{idx}.png")
                cv2.imwrite(line_path, line_img)
            print(f"Split '{file_name}' into {len(lines)} lines in '{lines_folder}'")

#############################################
# 4. Main Function to Create a Dataset with Further Splitting
#############################################
def create_dataset_with_line_splitting(
    sources_folder,
    transcriptions_folder,
    local_dataset_dir,
    size_threshold_mb=20,
    single_page_dpi=150,
    all_at_once_dpi=300
):
    """
    1) For each PDF in sources_folder:
       - Create folder in local_dataset_dir with same base name
       - Convert PDF to page images
       - Attempt to match .docx by base name overlap
       - Read docx, save to transcription.txt
       - Then further split each page image into lines in a lines/ subfolder
    """
    import docx

    if not os.path.exists(local_dataset_dir):
        os.makedirs(local_dataset_dir, exist_ok=True)

    docx_files = [f for f in os.listdir(transcriptions_folder) if f.lower().endswith(".docx")]

    def find_best_match_docx(base_name, docx_files):
        base_words = re.findall(r"\w+", base_name.lower())
        best_file = None
        best_score = 0
        for docx_file in docx_files:
            docx_stem = os.path.splitext(docx_file)[0].lower()
            docx_words = re.findall(r"\w+", docx_stem)
            overlap = len(set(base_words).intersection(docx_words))
            if overlap > best_score:
                best_score = overlap
                best_file = docx_file
        return best_file if best_score > 0 else None

    for file_name in os.listdir(sources_folder):
        if file_name.lower().endswith(".pdf"):
            base_name = os.path.splitext(file_name)[0]
            pdf_path = os.path.join(sources_folder, file_name)

            source_folder = os.path.join(local_dataset_dir, base_name)
            images_folder = os.path.join(source_folder, "images")
            os.makedirs(images_folder, exist_ok=True)

            pdf_size = os.path.getsize(pdf_path)
            if pdf_size > size_threshold_mb * 1024 * 1024:
                convert_pdf_to_images_single_page(pdf_path, images_folder, dpi=single_page_dpi)
            else:
                convert_pdf_to_images_all_at_once(pdf_path, images_folder, dpi=all_at_once_dpi)

            # find best matching docx
            matched_docx = find_best_match_docx(base_name, docx_files)
            if matched_docx:
                docx_path = os.path.join(transcriptions_folder, matched_docx)
                doc = docx.Document(docx_path)
                paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
                transcription_text = "\n".join(paragraphs)

                transcription_out_path = os.path.join(source_folder, "transcription.txt")
                with open(transcription_out_path, "w", encoding="utf-8") as txt_file:
                    txt_file.write(transcription_text)
                print(f"Transcription from '{matched_docx}' saved to '{transcription_out_path}'\n")
            else:
                print(f"No matching Word document found for '{file_name}'.\n")

            # Now further split each page image into lines
            split_page_images_into_lines(images_folder)

    print("\nFinal dataset structure in", local_dataset_dir)
    get_ipython().system(f"tree -L 3 {local_dataset_dir}")

#############################################
# 5. Example Usage
#############################################
sources_folder = "/content/drive/MyDrive/RenAI Printed img/Test sources"
transcriptions_folder = "/content/drive/MyDrive/RenAI Printed img/Test transcriptions"
local_dataset_dir = "/content/dataset_with_lines"

create_dataset_with_line_splitting(
    sources_folder,
    transcriptions_folder,
    local_dataset_dir,
    size_threshold_mb=20,
    single_page_dpi=150,
    all_at_once_dpi=300
)


Mounted at /content/drive
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 29 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.6 [186 kB]
Fetched 186 kB in 2s (118 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 78, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package poppler-utils.
(Re

In [5]:
!pip install paddlepaddle-gpu -f https://www.paddlepaddle.org.cn/whl/mkl/avx/stable.html


Looking in links: https://www.paddlepaddle.org.cn/whl/mkl/avx/stable.html
Collecting paddlepaddle-gpu
  Downloading paddlepaddle_gpu-2.6.2-cp311-cp311-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting astor (from paddlepaddle-gpu)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting opt-einsum==3.3.0 (from paddlepaddle-gpu)
  Downloading opt_einsum-3.3.0-py3-none-any.whl.metadata (6.5 kB)
Downloading paddlepaddle_gpu-2.6.2-cp311-cp311-manylinux1_x86_64.whl (759.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m759.0/759.0 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading opt_einsum-3.3.0-py3-none-any.whl (65 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading astor-0.8.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: opt-einsum, astor, paddlepaddle-gpu
  Attempting uninstall: opt-einsum
    Found existing installation: opt_einsum 3

In [6]:
#############################################
# 0. (Optional) Mount Google Drive
#############################################
from google.colab import drive
drive.mount('/content/drive')

#############################################
# 1. Install Dependencies
#############################################
!sudo apt-get -y install poppler-utils
!pip install pdf2image python-docx PyPDF2 opencv-python rapidfuzz paddleocr

#############################################
# 2. Imports and Utility Functions
#############################################
import os
import re
import gc
import cv2
import docx
import numpy as np
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
from paddleocr import PaddleOCR
from rapidfuzz import fuzz

def read_transcriptions_from_txt(txt_path):
    with open(txt_path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]

def partial_ocr_line(line_image_path, ocr_engine):
    """
    Run PaddleOCR on a single line image, return recognized text (best guess).
    """
    result = ocr_engine.ocr(line_image_path, rec=True)
    if result and len(result[0]) > 0:
        # result[0][0] => bounding box, result[0][1] => (text, confidence)
        # But for a single line, there's typically 1 bounding box.
        recognized_text = result[0][0][1][0]
    else:
        recognized_text = ""
    return recognized_text

def find_best_line_match(recognized_text, doc_lines):
    """
    Fuzzy match recognized_text to each line in doc_lines, return best line and best score.
    """
    best_score = -1
    best_line = None
    for candidate_line in doc_lines:
        score = fuzz.ratio(recognized_text.lower(), candidate_line.lower())
        if score > best_score:
            best_score = score
            best_line = candidate_line
    return best_line, best_score

def generate_line_level_texts(
    dataset_root,
    paddle_ocr_lang="en"
):
    """
    1) For each <PDF_Name>/images/lines folder in dataset_root,
       read line images (page_1_line_0.png, etc.).
    2) Load the entire transcription.txt from <PDF_Name>/transcription.txt as doc_lines (split by lines).
    3) For each line image, run partial OCR, fuzzy match to doc_lines, and create .txt file.
    """
    ocr_engine = PaddleOCR(lang=paddle_ocr_lang, rec=True, det=False, use_angle_cls=False)

    for folder_name in sorted(os.listdir(dataset_root)):
        folder_path = os.path.join(dataset_root, folder_name)
        if not os.path.isdir(folder_path):
            continue

        # e.g. /content/dataset_with_lines/Mendo - Principe perfecto
        transcription_file = os.path.join(folder_path, "transcription.txt")
        images_folder = os.path.join(folder_path, "images")
        lines_folder = os.path.join(images_folder, "lines")

        if not os.path.exists(transcription_file):
            print(f"No transcription.txt in {folder_name}, skipping.")
            continue
        if not os.path.exists(lines_folder):
            print(f"No lines folder in {folder_name}, skipping.")
            continue

        doc_lines = read_transcriptions_from_txt(transcription_file)
        print(f"\n[{folder_name}] Found {len(doc_lines)} lines in transcription.txt")

        # For each line image in lines_folder, do partial OCR + fuzzy match
        for file_name in sorted(os.listdir(lines_folder)):
            if file_name.lower().endswith((".png", ".jpg", ".jpeg")) and not file_name.endswith(".txt"):
                line_img_path = os.path.join(lines_folder, file_name)
                recognized_text = partial_ocr_line(line_img_path, ocr_engine)

                if recognized_text.strip():
                    best_line, best_score = find_best_line_match(recognized_text, doc_lines)
                else:
                    best_line, best_score = "[NO RECOGNIZED TEXT]", 0

                # Save best_line to .txt
                txt_name = file_name.rsplit(".", 1)[0] + ".txt"
                txt_path = os.path.join(lines_folder, txt_name)
                with open(txt_path, "w", encoding="utf-8") as f:
                    f.write(best_line)

                print(f"  -> {file_name} recognized='{recognized_text[:30]}...' matched='{best_line[:30]}...' score={best_score}")
    print("\nLine-level .txt generation complete.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.6).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
Collecting rapidfuzz
  Downloading rapidfuzz-3.12.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting paddleocr
  Downloading paddleocr-2.10.0-py3-none-any.whl.metadata (12 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Downloading lmdb-1.6.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting fire>=0.3.0 (from paddleocr)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [7]:
def partial_ocr_line(line_image_path, ocr_engine):
    """
    Run PaddleOCR on a single line image, return recognized text (best guess).
    Safely handles None or empty results.
    """
    result = ocr_engine.ocr(line_image_path, rec=True)

    # If result is None or empty, return empty string
    if not result or len(result) == 0:
        return ""

    # result is typically a list of lists, e.g. [ [ [box], [text, confidence] ], ... ]
    # So we check if result[0] is valid
    if not result[0] or len(result[0]) == 0:
        return ""

    # Finally, parse recognized text from result
    # Usually: result[0][0] => [ [box coords], [text, confidence] ]
    recognized_text = result[0][0][1][0]
    return recognized_text


In [8]:
ocr_engine = PaddleOCR(lang="es", rec=True, det=False, use_angle_cls=False)
line_img_path = "/content/dataset_with_lines/Mendo - Principe perfecto/images/lines/page_2_line_0.png"
recognized_text = partial_ocr_line(line_img_path, ocr_engine)
print("Recognized text:", recognized_text)


download https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar to /root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer/en_PP-OCRv3_det_infer.tar


100%|██████████| 3910/3910 [00:00<00:00, 4995.99it/s]


download https://paddleocr.bj.bcebos.com/PP-OCRv3/multilingual/latin_PP-OCRv3_rec_infer.tar to /root/.paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer/latin_PP-OCRv3_rec_infer.tar


100%|██████████| 9930/9930 [00:00<00:00, 10647.21it/s]


download https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar to /root/.paddleocr/whl/cls/ch_ppocr_mobile_v2.0_cls_infer/ch_ppocr_mobile_v2.0_cls_infer.tar


100%|██████████| 2138/2138 [00:00<00:00, 3040.87it/s]

[2025/03/23 08:40:31] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=True, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_t




[2025/03/23 08:40:37] ppocr DEBUG: dt_boxes num : 19, elapsed : 1.26682710647583
[2025/03/23 08:40:38] ppocr DEBUG: rec_res num  : 19, elapsed : 0.2907557487487793
Recognized text: geado el ferlo : logrando dignamente en nueftro Monarcha la gracia


In [9]:
def partial_ocr_line(line_image_path, ocr_engine):
    result = ocr_engine.ocr(line_image_path, rec=True)
    if not result or len(result) == 0:
        print(f"WARNING: No OCR result for {line_image_path}")
        return ""
    if not result[0] or len(result[0]) == 0:
        print(f"WARNING: No bounding boxes recognized for {line_image_path}")
        return ""

    recognized_text = result[0][0][1][0]
    return recognized_text


In [10]:
dataset_root = "/content/dataset_with_lines"  # or your dataset path
generate_line_level_texts(dataset_root, paddle_ocr_lang="es")


[2025/03/23 08:40:38] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=True, use_xpu=False, use_npu=False, use_mlu=False, use_gcu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/latin/latin_PP-OCRv3_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_t

In [11]:
#############################################
# 0. (Optional) Mount Google Drive
#############################################
from google.colab import drive
drive.mount('/content/drive')

#############################################
# 1. Install Dependencies
#############################################
!sudo apt-get -y install poppler-utils tree
!pip install paddleocr transformers peft albumentations PyPDF2 jiwer

#############################################
# 2. Imports and Utility Functions
#############################################
import os
import re
import cv2
import torch
import numpy as np
import matplotlib.pyplot as plt

from PIL import Image, ImageDraw, ImageFont

from paddleocr import PaddleOCR
from transformers import (
    TrOCRProcessor,
    VisionEncoderDecoderModel,
    Trainer,
    TrainingArguments,
    TrainerCallback
)
from torch.utils.data import Dataset, random_split, ConcatDataset
import albumentations as A
from albumentations.pytorch import ToTensorV2
from peft import AdaLoraConfig, get_peft_model
import torch.nn.functional as F

import logging
logging.getLogger("ppocr").setLevel(logging.ERROR)

# We'll still use jiwer for computing WER
import jiwer

#############################################
# Custom CER Function (Levenshtein Distance)
#############################################
def levenshtein_distance(s1, s2):
    m, n = len(s1), len(s2)
    dp = list(range(n + 1))
    for i in range(1, m + 1):
        prev = dp[0]
        dp[0] = i
        for j in range(1, n + 1):
            temp = dp[j]
            if s1[i - 1] == s2[j - 1]:
                dp[j] = prev
            else:
                dp[j] = 1 + min(prev, dp[j], dp[j - 1])
            prev = temp
    return dp[n]

def compute_cer_metric(references, hypotheses):
    total_distance = 0
    total_chars = 0
    for ref, hyp in zip(references, hypotheses):
        d = levenshtein_distance(ref, hyp)
        total_distance += d
        total_chars += len(ref)
    return total_distance / total_chars if total_chars > 0 else 0

#############################################
# 3. Utility Functions
#############################################
def normalize_text(text):
    text = text.replace('ç', 'z').replace('ſ', 's')
    text = re.sub(r'[ùúûüū]', 'u', text)
    text = re.sub(r'[àáâãā]', 'a', text)
    text = re.sub(r'(?<![n])́', '', text)
    text = re.sub(r'[̀̀̈]', '', text)
    return text.lower()[:512]

def read_line_text(txt_path):
    with open(txt_path, "r", encoding="utf-8") as f:
        return f.read().strip()

def advanced_preprocess(image):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, bw = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    denoised = cv2.fastNlMeansDenoising(bw, None, h=30, templateWindowSize=7, searchWindowSize=21)
    coords = np.column_stack(np.where(denoised > 0))
    if coords.size == 0:
        return image
    angle = cv2.minAreaRect(coords)[-1]
    angle = -(90 + angle) if angle < -45 else -angle
    (h, w) = denoised.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    deskewed = cv2.warpAffine(denoised, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return cv2.cvtColor(deskewed, cv2.COLOR_GRAY2RGB)

#############################################
# 4. Define Line-Level Dataset with Augmentation
#############################################
class LineLevelDataset(Dataset):
    def __init__(self, lines_root, processor, synthetic_prob=0.0):
        self.lines_root = lines_root
        self.processor = processor
        self.synthetic_prob = synthetic_prob
        self.samples = []
        for file_name in sorted(os.listdir(self.lines_root)):
            if file_name.lower().endswith((".png", ".jpg", ".jpeg")):
                base_name = file_name.rsplit(".", 1)[0]
                txt_name = base_name + ".txt"
                txt_path = os.path.join(self.lines_root, txt_name)
                if os.path.exists(txt_path):
                    self.samples.append((file_name, txt_name))
        # Moderate augmentation pipeline
        self.transform = A.Compose([
            A.Resize(height=384, width=384, always_apply=True),
            A.OneOf([
                A.GaussianBlur(blur_limit=(3, 7), p=0.3),
                A.MedianBlur(blur_limit=3, p=0.3)
            ], p=0.3),
            A.RandomBrightnessContrast(p=0.3),
            A.Rotate(limit=3, p=0.4),
            A.Perspective(scale=(0.02, 0.05), keep_size=True, p=0.2),
            A.ElasticTransform(alpha=1, sigma=50, alpha_affine=50, p=0.1),
            A.CoarseDropout(max_holes=8, max_height=16, max_width=16, p=0.1),
            A.ShiftScaleRotate(shift_limit=0.02, scale_limit=0.02, rotate_limit=3, p=0.3),
            ToTensorV2()
        ])

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        img_file, txt_file = self.samples[idx]
        line_img_path = os.path.join(self.lines_root, img_file)
        line_txt_path = os.path.join(self.lines_root, txt_file)
        text = read_line_text(line_txt_path)
        # Use real image (optionally, you can apply advanced_preprocess)
        img = cv2.imread(line_img_path, cv2.IMREAD_COLOR)
        if img is None:
            raise ValueError(f"Unable to load line image: {line_img_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        pil_img = Image.fromarray(img)
        aug = self.transform(image=np.array(pil_img))['image']
        aug_np = aug.permute(1, 2, 0).mul(255).byte().numpy()
        pil_img = Image.fromarray(aug_np)
        pixel_values = self.processor(pil_img, return_tensors="pt").pixel_values.squeeze(0)
        text_encoding = self.processor.tokenizer(
            text, return_tensors="pt", padding="max_length", truncation=True, max_length=128
        )
        labels = text_encoding["input_ids"][0]
        return {"pixel_values": pixel_values, "labels": labels}

#############################################
# 5. Gather All Line Folders from Dataset Root
#############################################
def gather_all_line_folders(dataset_root):
    lines_paths = []
    for folder_name in sorted(os.listdir(dataset_root)):
        folder_path = os.path.join(dataset_root, folder_name)
        if not os.path.isdir(folder_path):
            continue
        images_folder = os.path.join(folder_path, "images")
        lines_folder = os.path.join(images_folder, "lines")
        if os.path.exists(lines_folder):
            lines_paths.append(lines_folder)
    return lines_paths

#############################################
# 6. EMA Callback for Improved Generalization
#############################################
class EMACallback(TrainerCallback):
    def __init__(self, decay=0.999):
        self.decay = decay
        self.ema_weights = {}

    def on_train_begin(self, args, state, control, model=None, **kwargs):
        # Initialize EMA weights
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.ema_weights[name] = param.data.clone()
        return control

    def on_step_end(self, args, state, control, model=None, **kwargs):
        # Update EMA weights after each step
        for name, param in model.named_parameters():
            if param.requires_grad:
                self.ema_weights[name].mul_(self.decay).add_(param.data, alpha=1 - self.decay)
        return control

    def on_evaluate(self, args, state, control, model=None, **kwargs):
        # Backup current weights and use EMA weights for evaluation
        backup = {}
        for name, param in model.named_parameters():
            if param.requires_grad:
                backup[name] = param.data.clone()
                param.data.copy_(self.ema_weights[name])
        control.ema_backup = backup
        return control

    def on_evaluate_end(self, args, state, control, model=None, **kwargs):
        # Restore original weights after evaluation
        if hasattr(control, "ema_backup"):
            for name, param in model.named_parameters():
                if param.requires_grad and name in control.ema_backup:
                    param.data.copy_(control.ema_backup[name])
            del control.ema_backup
        return control

#############################################
# 7. Training Function with Generalization Techniques
#############################################
def train_line_level_model(dataset_root):
    line_folders = gather_all_line_folders(dataset_root)
    if not line_folders:
        print("No line folders found, exiting.")
        return
    processor = TrOCRProcessor.from_pretrained("qantev/trocr-base-spanish", do_resize=False, do_normalize=False)
    line_datasets = [LineLevelDataset(lf, processor, synthetic_prob=0.0) for lf in line_folders]
    full_line_dataset = ConcatDataset(line_datasets)
    print(f"Total line images loaded: {len(full_line_dataset)}")
    train_size = int(0.9 * len(full_line_dataset))
    eval_size = len(full_line_dataset) - train_size
    train_dataset, eval_dataset = random_split(full_line_dataset, [train_size, eval_size])
    print(f"Training samples: {len(train_dataset)}; Evaluation samples: {len(eval_dataset)}")

    # Fine-tune the full model (do not freeze encoder)
    model = VisionEncoderDecoderModel.from_pretrained("qantev/trocr-base-spanish")
    model.config.num_beams = 5
    model.config.early_stopping = True

    # Set up AdaLoRA on the decoder (to keep training lightweight)
    target_modules_list = []
    for name, module in model.decoder.named_modules():
        if ("self_attn.q_proj" in name) or ("self_attn.k_proj" in name) or ("self_attn.v_proj" in name):
            target_modules_list.append(name)
    if not target_modules_list:
        raise AttributeError("No target modules found for AdaLoRA in the decoder.")
    peft_config = AdaLoraConfig(
        target_modules=target_modules_list, init_r=12, lora_alpha=32, lora_dropout=0.1, bias="none"
    )
    model.decoder = get_peft_model(model.decoder, peft_config)

    def print_trainable_parameters(m):
        trainable = sum(p.numel() for p in m.parameters() if p.requires_grad)
        total = sum(p.numel() for p in m.parameters())
        print(f"Trainable parameters: {trainable} / Total parameters: {total}")
    print_trainable_parameters(model)

    training_args = TrainingArguments(
        output_dir="./linelevel_trocr",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=30,
        learning_rate=5e-5,
        fp16=True,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir='./logs_line',
        logging_steps=50,
        report_to="none",
        dataloader_drop_last=False,
        remove_unused_columns=True,
        label_smoothing_factor=0.1,
        lr_scheduler_type="cosine",
        warmup_steps=500,
        weight_decay=0.01,
        gradient_accumulation_steps=2,
        max_grad_norm=1.0  # Gradient clipping to stabilize training
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        if isinstance(logits, (tuple, list)):
            logits = logits[0]
        pred_ids = np.argmax(logits, axis=-1)
        preds = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
        labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)
        refs = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)
        preds = [normalize_text(s) for s in preds]
        refs = [normalize_text(s) for s in refs]
        cer = compute_cer_metric(refs, preds)
        wer = jiwer.wer(refs, preds)
        return {"cer": cer, "wer": wer}

    def collate_fn(batch):
        max_height = max(item["pixel_values"].shape[1] for item in batch)
        max_width = max(item["pixel_values"].shape[2] for item in batch)
        padded_pixel_values = []
        for item in batch:
            pv = item["pixel_values"]
            c, h, w = pv.shape
            padded = torch.zeros((c, max_height, max_width), dtype=pv.dtype)
            padded[:, :h, :w] = pv
            padded_pixel_values.append(padded)
        collated_labels = torch.stack([item["labels"] for item in batch])
        return {"pixel_values": torch.stack(padded_pixel_values), "labels": collated_labels}

    from transformers import Trainer
    class CustomTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
            labels = inputs.pop("labels")
            pixel_values = inputs.pop("pixel_values")
            outputs = model(pixel_values=pixel_values, labels=labels)
            logits = outputs.logits
            vocab_size = logits.size(-1)
            logits = logits.view(-1, vocab_size)
            labels = labels.view(-1)
            valid_mask = labels != -100
            logits = logits[valid_mask]
            labels = labels[valid_mask]
            loss = F.cross_entropy(logits, labels)
            return (loss, outputs) if return_outputs else loss

    # Use EMA callback for better generalization
    ema_callback = EMACallback(decay=0.999)

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=collate_fn,
        compute_metrics=compute_metrics,
        callbacks=[ema_callback]
    )

    print("Starting line-level training...")
    trainer.train()
    torch.cuda.empty_cache()
    model.save_pretrained("./final_linelevel_model")
    processor.save_pretrained("./final_linelevel_model")
    print("Training complete. Final model saved to './final_linelevel_model'.")

#############################################
# 8. Putting It All Together
#############################################
def main():
    dataset_root = "/content/dataset_with_lines"  # Update this to your dataset root
    train_line_level_model(dataset_root)

if __name__ == "__main__":
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tree is already the newest version (2.0.2-1).
poppler-utils is already the newest version (22.02.0-2ubuntu0.6).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
Collecting jiwer
  Downloading jiwer-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collectin

preprocessor_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

Total line images loaded: 736
Training samples: 662; Evaluation samples: 74


config.json:   0%|          | 0.00/4.93k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decode

generation_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

Trainable parameters: 87538608 / Total parameters: 385749972
Starting line-level training...


Epoch,Training Loss,Validation Loss,Cer,Wer
1,25.7423,0.520738,0.836028,1.006734
2,2.4022,0.228227,0.371824,0.616162
3,7.4568,0.22381,0.386836,0.612795
4,4.3354,0.204765,0.354503,0.56229
5,0.7699,0.202513,0.356813,0.558923
6,2.1932,0.202339,0.3597,0.582492
7,0.9651,0.188201,0.364319,0.52862
8,0.6835,0.201656,0.3597,0.579125
9,0.9479,0.203024,0.36836,0.585859
10,0.776,0.19431,0.360277,0.552189


SafetensorError: Error while serializing: IoError(Os { code: 28, kind: StorageFull, message: "No space left on device" })

In [None]:
from google.colab import drive
drive.mount('/content/drive')