In [1]:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import json
import gzip

def load_ocronos(model_id="PleIAs/OCRonos-Vintage"):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
    return tokenizer, model, device

def correct_text(text, tokenizer, model, device, max_new_tokens=200):
    """
    Run OCRonos to correct noisy text.
    """
    # Some OCRonos checkpoints expect a format like "### Text ### ... ### Correction ###"
    prompt = f"### Text ###\n{text}\n\n### Correction ###\n"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False  # greedy decoding for stability
        )

    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Strip prompt to get only corrected portion
    if "### Correction ###" in result:
        result = result.split("### Correction ###")[-1].strip()
    return result



In [2]:
CORPUS_PATH = "Data/ppa_corpus_2025-02-03_1308/ppa_pages.jsonl.gz"
def page_iter(pages_file):
   # Yield pages one at a time from gzipped JSON lines file for memory efficiency
   with gzip.open(pages_file, 'rt', encoding='utf-8') as fh:
       for line in fh:
           yield json.loads(line)


            # e.g., {"ballad", "sonnet"} or None for all

TARGET_COLLECTIONS = {"Literary", "Linguistic"}
EXCLUSION_COLLECTIONS = {"Dictionary", "Word Lists", "Typographically Unique"}

with open("Data/ppa_corpus_2025-02-03_1308/ppa_metadata.json") as f:
    metadata = json.load(f)

# index by work_id, but keep full entry
metadata_index = {
    entry["work_id"]: entry
    for entry in metadata
    if "collections" in entry
    and any(c in TARGET_COLLECTIONS for c in entry["collections"])
    and not any(c in EXCLUSION_COLLECTIONS for c in entry["collections"])
}


In [3]:
tokenizer, model, device = load_ocronos()

sample_pages = []
for page in page_iter(CORPUS_PATH):
    work_id = page.get("work_id")
    if work_id in metadata_index:
        text = page.get("text", "").strip()
        if text:
            sample_pages.append(text)
    if len(sample_pages) >= 5:  # grab only 1 page for testing
        break
print(sample_pages)

    # Run OCR correction and print comparisons
for i, raw_text in enumerate(sample_pages, 1):
    corrected = correct_text(raw_text[:1000], tokenizer, model, device)  # truncate to ~1000 chars for demo
    print(f"\n=== Page {i} ===")
    print("Noisy text:\n", raw_text[:500], "...\n")  # print first 500 chars for readability
    print("Corrected text:\n", corrected[:500], "...\n")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['To the Right excellent and most honorable Ladie, the Ladie Marie, Countesse of Pembroke.\nVOi, pia nympha, tuum, quem tolse la morte, Philippū,\nAEdentem llenas coelestis melle palabras.\nItalicum lumen, flowre of Fraunce, splendor Iberus,\nItalicus Tasso, French Salust, Boscan Iberus,\n〈 in non-Latin alphabet 〉 Virgil, 〈 in non-Latin alphabet 〉,\nGreekish Homer, tanto lati iunguntur 〈 in non-Latin alphabet 〉.\nYour Honors most affectionate. Abraham Fraunce.', "Boscan 3. Booke.\nLos altares delante estauan puestos,\nArdiendo encima d'ellos toda Arabia.\n\n\nCap. 5. Of the Metonymia of the adiunct.\nA Metonymia of the adiunct, is, when by the adiunct we expresse the subiect. So the names of vertues & vices are vsed for vertuous or vicious men: the signe for the thing which it doth signifie: the adiūct of the time for the things subiect, &c.\nHomer. N. Il. \n〈 in non-Latin alphabet 〉.\n\nSo in \n6. Odyss.\n〈 in non-Latin alphabet 〉.\n\nfor Alcinous himself.\nV. Ae. 1. \nQuis genus Ae

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



=== Page 1 ===
Noisy text:
 To the Right excellent and most honorable Ladie, the Ladie Marie, Countesse of Pembroke.
VOi, pia nympha, tuum, quem tolse la morte, Philippū,
AEdentem llenas coelestis melle palabras.
Italicum lumen, flowre of Fraunce, splendor Iberus,
Italicus Tasso, French Salust, Boscan Iberus,
〈 in non-Latin alphabet 〉 Virgil, 〈 in non-Latin alphabet 〉,
Greekish Homer, tanto lati iunguntur 〈 in non-Latin alphabet 〉.
Your Honors most affectionate. Abraham Fraunce. ...

Corrected text:
 To the Right excellent and most honorable Lady, the Lady Marie, Countess of Pembroke.
VOi, pia nympha, tuum, quem tolse la morte, Philippū,
Aedentem llenas coelestis melle palabras.
Italicum lumen, flowre of France, splendor Iberus,
Italicus Tasso, French Salust, Boscan Iberus,
È in non-Latin alphabet ̉ Virgil,
Greekish Homer, tanto lati iunguntur ̉,
Greekish Homer, tanto lati iunguntur ̉,
Greekish Homer, tanto lati iunguntur �,
Greekish Homer, tanto lati iunguntur �,
Greekish Homer, ta

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



=== Page 2 ===
Noisy text:
 Boscan 3. Booke.
Los altares delante estauan puestos,
Ardiendo encima d'ellos toda Arabia.


Cap. 5. Of the Metonymia of the adiunct.
A Metonymia of the adiunct, is, when by the adiunct we expresse the subiect. So the names of vertues & vices are vsed for vertuous or vicious men: the signe for the thing which it doth signifie: the adiūct of the time for the things subiect, &c.
Homer. N. Il. 
〈 in non-Latin alphabet 〉.

So in 
6. Odyss.
〈 in non-Latin alphabet 〉.

for Alcinous himself.
V. Ae. 1. ...

Corrected text:
 Boscan 3. Booke.
Los altares delante estauan puestos,
Ardiendo encima d'ellos toda Arabia. ...



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



=== Page 3 ===
Noisy text:
 〈 in non-Latin alphabet 〉.

〈 in non-Latin alphabet 〉.
This is the Prosopopoeia of Peleus, which is thus left off;
〈 in non-Latin alphabet 〉.
Virgil. Aeneid. 1. Of Aeneas.
—& dictis moerentia pectora mulcet.
O socij (neque enim ignari sumus antemalorum)
O passi grauiora! dabit Deus his quoque finem.
Vos & Scyllaeam rabiem, penitus{que} sonantes
Accestis scopulos: vos & Cyclopea saxa
Experti: reuocate animos, moestùmque timorem
Mittite: forsan & haec olim meminisse iuuabit.
Per varios casus, per  ...

Corrected text:
 È in non-Latin alphabet È.

È in non-Latin alphabet È.

È in non-Latin alphabet È.

È in non-Latin alphabet È.

È in non-Latin alphabet È.

È in non-Latin alphabet È.

È in non-Latin alphabet È.

È in non-Latin alphabet È.

È in non-Latin alphabet È.

È in non-Latin alphabet È.

È in non-Latin alphabet È.

È in non-Latin alphabet È.

È in non-Latin alphabet È.

È in non-Latin alphabet È.

È in non-Latin alphabet È.

È in non-Latin alphabet È.



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



=== Page 4 ===
Noisy text:
 Et bello Iliacos fateor perijsse penates:
Pro quo, sisceleria tanta est iniuria nostra,
Spargite me influctus, vastóque immergite ponto.
Si pereo, hominum manibus perijsse iuuabit.
Dixerat, & genua amplexus genibisque volutans
Harebat.

Sir Philip Sydney 1. Of Musidorus clad in shepheards weedes.
She might percetue a farre off one cōming towards her in the apparaile of a shepheard, with his armes hanging down, going a kinde of languishing pace, with his eyes sometimes cast vp to heauen, as thou ...

Corrected text:
 Et bello Iliacos fateor perijsse penates:
Pro quo, sisceleria tanta est iniuria nostra,
Spargite me influctus, vastóque immergite ponto.
Si pereo, hominum manibus perijsse iuuabit.
Dixerat, & genua amplexus genibisque volutans
Harebat.

Sir Philip Sydney 1. Of Musidorus clad in shepherds weedes.
She might percetue a farre off one coming towards her in the apparaile of a shepherhead, with his arms hanging down, going a kind of languishing pace, 