
# Kraken + ALTO OCR — Colab (Auto attempts, minimal edits)

**You only change one thing → `PROJECT_ID`** (the manuscript number).  
The notebook will:
- Install Kraken fast (uses a small pip cache on Drive).
- Ask you to **upload a ZIP** (ALTO XML + images).
- **Auto-build** train/val lists.
- **Auto-detect attempts**:
  - **Attempt 01** (no model yet for this manuscript): it will try to start **from the newest model of another manuscript** if available, otherwise start from scratch.
  - **Attempt >01**: it will **resume from the best model** of the previous attempt and **lower the learning rate** (configurable).

Run cells **top to bottom**.


## 1) Connect Google Drive

In [None]:

# Mount Google Drive at /content/drive; the settings cell derives the model
# and pip-cache directories from paths under this mount point.
from google.colab import drive  # type: ignore
drive.mount('/content/drive')
print("‚úÖ Drive mounted at /content/drive")



## 2) Project Settings (only edit `PROJECT_ID`)

- `PROJECT_ID`: the manuscript number (e.g., `0093`).


In [None]:

#@title 🔧 Project Settings (edit only PROJECT_ID)
from pathlib import Path

# Manuscript identifier; every project-specific path below is derived from it.
PROJECT_ID = "0093"  #@param {type:"string"}

# Root of the mounted Google Drive (see the drive.mount call in the first cell).
ROOT_IN_DRIVE = "/content/drive/MyDrive"
# Derived locations — no need to edit
MODELS_DIR = f"{ROOT_IN_DRIVE}/kraken_models/{PROJECT_ID}/rec"  # models for this manuscript (on Drive)
PIP_CACHE_DIR = f"{ROOT_IN_DRIVE}/.pip-cache"                   # pip download cache (on Drive)
LISTS_DIR = "/content/lists"                                    # train/val list files (local to the VM)
DATA_DIR = f"/content/data/{PROJECT_ID}"                        # extracted dataset (local to the VM)

# Create folders
for p in [MODELS_DIR, PIP_CACHE_DIR, LISTS_DIR, DATA_DIR]:
    Path(p).mkdir(parents=True, exist_ok=True)

# List files written by the "build lists" cell and consumed by train/eval.
TRAIN_LIST = f"{LISTS_DIR}/train.txt"
VAL_LIST   = f"{LISTS_DIR}/val.txt"

# Runtime defaults
CPU_THREADS = 2  # passed to `ketos train --workers`
DEVICE = "cpu"   # switch to a GPU device after enabling a T4 in Colab; NOTE(review): Kraken documents the form "cuda:0" — confirm bare "cuda" is accepted

print("‚úÖ Settings applied")
print("PROJECT_ID:", PROJECT_ID)
print("DATA_DIR:", DATA_DIR)
print("MODELS_DIR:", MODELS_DIR)


## 3) Install Kraken (fast, cached)

In [None]:

import os, subprocess, shlex

# Export the Drive-backed cache location; the pip subprocesses started in this
# cell inherit the environment variable.
os.environ["PIP_CACHE_DIR"] = PIP_CACHE_DIR

def is_importable(pkg: str) -> bool:
    """Report whether ``pkg`` can be imported in the current interpreter.

    The import is actually attempted, so a successful probe leaves the module
    loaded. Any exception raised while importing counts as "not importable".
    """
    try:
        __import__(pkg)
    except Exception:
        return False
    else:
        return True

import importlib
import sys


def _torch_version() -> str:
    """Return the installed torch version string, or "unknown" if torch cannot be imported."""
    try:
        import torch
        return torch.__version__
    except Exception:
        return "unknown"


# Install Kraken only when it is not already importable in this runtime.
if is_importable("kraken"):
    import kraken
    torch_v = _torch_version()
    print(f"‚úÖ Kraken available (version: {getattr(kraken, '__version__', 'unknown')}) | Torch: {torch_v}")
else:
    print("‚è≥ Installing Kraken ...")
    # Use the running interpreter's pip so the package lands in the same
    # environment this kernel imports from (a bare "python" on PATH may differ).
    subprocess.run([sys.executable, "-m", "pip", "-q", "install", "--upgrade", "pip"], check=True)
    subprocess.run([sys.executable, "-m", "pip", "-q", "install", "kraken[cairo]"], check=True)
    # The import system caches directory listings; refresh them so a package
    # installed after interpreter start-up can be found without a restart.
    importlib.invalidate_caches()
    import kraken
    torch_v = _torch_version()
    print(f"‚úÖ Installed Kraken (version: {getattr(kraken, '__version__', 'unknown')}) | Torch: {torch_v}")



## 4) Upload your ALTO dataset (ZIP)
- ZIP should include ALTO XMLs and their page images.
- We extract to `DATA_DIR` and build train/val lists automatically.


In [None]:

# Upload a dataset ZIP through the Colab file picker and unpack it into DATA_DIR.
from google.colab import files  # type: ignore
import zipfile, os

print("üì¶ Please select your ZIP...")
# The returned mapping is keyed by uploaded file name; empty when nothing was chosen.
uploaded = files.upload()
if not uploaded:
    raise SystemExit("‚ùå No file uploaded.")

# Only the first uploaded file is used.
zip_name = next(iter(uploaded.keys()))
# NOTE(review): assumes the upload is saved under /content (the default Colab
# working directory) — confirm if the notebook changes directory beforehand.
zip_path = f"/content/{zip_name}"
os.makedirs(DATA_DIR, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zf:
    zf.extractall(DATA_DIR)

print(f"‚úÖ Extracted into: {DATA_DIR}")
# IPython shell magic: list up to 20 extracted files as a quick sanity check.
!find "$DATA_DIR" -maxdepth 2 -type f | head -n 20


## 5) Build train/val lists from ALTO

In [None]:

import os
from pathlib import Path
from typing import List, Tuple, Optional
import xml.etree.ElementTree as ET

IMG_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff"}

def _strip_ns(tag: str) -> str:
    return tag.split('}', 1)[1] if '}' in tag else tag

def alto_image_from_xml(xml_path: Path) -> Optional[str]:
    """Return the text of the first non-empty ``fileName`` element in an XML file.

    Namespaces are ignored when matching the element name. Returns ``None``
    when the file cannot be parsed, when no such element exists, or when every
    ``fileName`` element is empty or whitespace-only.
    """
    try:
        for node in ET.parse(xml_path).getroot().iter():
            if _strip_ns(node.tag) != "fileName":
                continue
            if node.text and node.text.strip():
                return node.text.strip()
    except Exception:
        # Best effort: any failure is treated as "no image reference found".
        return None
    return None

def find_image_candidates(root: Path) -> dict:
    """Index every image file below ``root`` by its file stem.

    Returns a dict mapping stem -> resolved absolute path (as ``str``). Only
    files whose lower-cased suffix is in ``IMG_EXTS`` are considered, and when
    several files share a stem the first one encountered is kept.
    """
    by_stem = {}
    for entry in root.rglob("*"):
        if not entry.is_file():
            continue
        if entry.suffix.lower() not in IMG_EXTS:
            continue
        if entry.stem not in by_stem:
            by_stem[entry.stem] = str(entry.resolve())
    return by_stem

def resolve_image_for_alto(xml_path: Path, data_root: Path, images_by_stem: dict) -> Optional[str]:
    """Locate the page image that belongs to an ALTO file.

    Resolution order:
      1. the ``fileName`` recorded in the XML, relative to the XML's directory;
      2. any image below ``data_root`` whose base name equals that of ``fileName``;
      3. the image in ``images_by_stem`` that shares the XML file's stem.

    Args:
        xml_path: Path to the ALTO XML file.
        data_root: Directory tree searched in step 2.
        images_by_stem: Mapping of image stem -> absolute image path.

    Returns:
        The resolved image path as a string, or ``None`` when nothing matches.
    """
    fn = alto_image_from_xml(xml_path)
    if fn:
        candidate = xml_path.parent / fn
        # is_file() rather than exists(): a value such as "." or ".." would
        # otherwise resolve to a directory and be returned as the "image".
        if candidate.is_file():
            return str(candidate.resolve())

        # Base-name candidates: the native interpretation plus one that also
        # splits on backslashes, since the recorded name may be a Windows path.
        wanted = {Path(fn).name, fn.replace("\\", "/").rsplit("/", 1)[-1]}
        wanted.discard("")  # e.g. fileName "/" has no usable base name
        if wanted:
            # Compare names literally instead of passing them to rglob() as a
            # pattern, so characters like "[" or "*" in a file name are not
            # interpreted as glob syntax.
            for p in data_root.rglob("*"):
                if p.name in wanted and p.is_file() and p.suffix.lower() in IMG_EXTS:
                    return str(p.resolve())
    # Last resort: an image that shares the XML's stem.
    return images_by_stem.get(xml_path.stem)

def find_pairs_alto_first(root: str) -> List[Tuple[str, str]]:
    """Collect ``(image_path, xml_path)`` pairs for every ALTO file below ``root``.

    A ``*.xml`` file counts as ALTO when ``<alto`` appears within the first
    4096 characters. Unreadable files and files without a resolvable image
    are skipped silently.
    """
    base = Path(root)
    stem_index = find_image_candidates(base)
    found: List[Tuple[str, str]] = []
    for xml_file in base.rglob("*.xml"):
        try:
            with open(xml_file, "r", encoding="utf-8", errors="ignore") as handle:
                looks_like_alto = "<alto" in handle.read(4096)
        except Exception:
            continue
        if not looks_like_alto:
            continue
        image = resolve_image_for_alto(xml_file, base, stem_index)
        if image:
            found.append((image, str(xml_file.resolve())))
    return found

def write_list(pairs: List[Tuple[str, str]], out_path: str):
    """Write ``pairs`` to ``out_path`` as UTF-8 text, one ``image<TAB>xml`` row per line.

    An existing file is overwritten; an empty ``pairs`` yields an empty file.
    """
    rows = (f"{img}\t{xml}\n" for img, xml in pairs)
    with open(out_path, "w", encoding="utf-8") as sink:
        sink.writelines(rows)

# Collect unique (image, ALTO) pairs in a stable, sorted order so the split
# below is reproducible for the same dataset.
pairs = sorted(set(find_pairs_alto_first(DATA_DIR)))
n = len(pairs)
print(f"Found {n} image+ALTO pairs.")

# At least one training and one validation sample are required.
if n < 2:
    raise SystemExit(f"‚ùå Not enough samples in {DATA_DIR}. Found {n}. Check your ZIP structure.")

# 90/10 split
from math import floor
# int(n * 0.9) is always < n, so the validation slice is never empty;
# max(1, ...) keeps at least one training sample.
cut = max(1, int(n * 0.9))
train_pairs, val_pairs = pairs[:cut], pairs[cut:]
Path(LISTS_DIR).mkdir(parents=True, exist_ok=True)
write_list(train_pairs, TRAIN_LIST)
write_list(val_pairs,   VAL_LIST)
print(f"‚úÖ Wrote lists ‚Üí {TRAIN_LIST} ({len(train_pairs)}), {VAL_LIST} ({len(val_pairs)})")

print("\nSample train lines:")
# Read inside a context manager so the file handle is closed deterministically.
with open(TRAIN_LIST, encoding="utf-8") as fh:
    print("\n".join(fh.read().splitlines()[:5]))



## 6) Auto-detect attempt and choose base model

- If **no model yet** for this manuscript ⇒ **Attempt 01**.  
  - We try to use the **newest model from another manuscript** as base.
- If there **is** at least one model ⇒ next attempt number (e.g., 02, 03, …), and we **load the previous attempt’s best** as base.
- For attempts **>01**, we also set a **lower learning rate** automatically (you can change the values below).


In [None]:

import os, re, glob, time
from pathlib import Path

ALL_MODELS_ROOT = f"{ROOT_IN_DRIVE}/kraken_models"

def list_attempt_models(models_dir: str):
    """Return the ``attempt_*.mlmodel`` paths directly inside ``models_dir``, sorted ascending."""
    matches = list(Path(models_dir).glob("attempt_*.mlmodel"))
    matches.sort()
    return matches

def next_attempt_id(models_dir: str) -> int:
    """Return the number to use for the next training attempt.

    This is one more than the highest ``N`` among files named
    ``attempt_<N>.mlmodel`` in ``models_dir``, or 1 when there is none.
    """
    name_pattern = re.compile(r"attempt_(\d+)\.mlmodel$")
    hits = (name_pattern.search(model.name) for model in list_attempt_models(models_dir))
    highest = max((int(hit.group(1)) for hit in hits if hit), default=0)
    return highest + 1

def find_previous_attempt_model(models_dir: str, attempt_id: int) -> "Optional[str]":
    """Return the model file written by the attempt preceding ``attempt_id``.

    The canonical two-digit name (``attempt_01.mlmodel``) is tried first. If
    it is absent, any ``attempt_<N>.mlmodel`` whose number equals the previous
    attempt is accepted regardless of zero padding, matching the file names
    that ``next_attempt_id`` counts.

    Args:
        models_dir: Directory holding the attempt models.
        attempt_id: Number of the attempt about to be run (1-based).

    Returns:
        Path of the previous attempt's model as a string, or ``None`` when
        ``attempt_id`` is the first attempt or no matching file exists.
    """
    prev_id = attempt_id - 1
    if prev_id < 1:
        return None

    folder = Path(models_dir)
    canonical = folder / f"attempt_{prev_id:02d}.mlmodel"
    if canonical.exists():
        return str(canonical)

    # Fall back to a differently padded name such as attempt_1.mlmodel.
    for cand in sorted(folder.glob("attempt_*.mlmodel")):
        m = re.fullmatch(r"attempt_(\d+)\.mlmodel", cand.name)
        if m and int(m.group(1)) == prev_id:
            return str(cand)
    return None

def newest_model_from_other_projects(root_dir: str, exclude_project: str) -> "Optional[str]":
    """Return the most recently modified model belonging to another project.

    Searches ``<root_dir>/<project>/rec/*.mlmodel`` and skips the project named
    ``exclude_project``.

    Args:
        root_dir: Directory that contains one sub-directory per project.
        exclude_project: Project directory name to ignore.

    Returns:
        Path of the newest matching model as a string, or ``None`` if no other
        project has a model.
    """
    # Escape root_dir so glob metacharacters in the path are matched literally.
    pattern = str(Path(glob.escape(root_dir)) / "*" / "rec" / "*.mlmodel")
    # newest by mtime
    newest = None
    newest_mtime = -1
    for p in glob.glob(pattern):
        # Matches look like <root>/<project>/rec/<file>, so the project is the
        # grandparent directory. Comparing that component (instead of searching
        # the whole path for "/<project>/") avoids excluding everything when
        # root_dir itself happens to contain a directory with the same name.
        if Path(p).parent.parent.name == exclude_project:
            continue
        try:
            mtime = os.path.getmtime(p)
        except OSError:
            # File vanished or is unreadable; ignore it and keep looking.
            continue
        if mtime > newest_mtime:
            newest_mtime = mtime
            newest = p
    return newest

# Number of the attempt about to run and the model path it will write.
ATTEMPT_ID = next_attempt_id(MODELS_DIR)
OUT_MODEL = str(Path(MODELS_DIR) / f"attempt_{ATTEMPT_ID:02d}.mlmodel")

# Learning-rate policy (edit these two values to change it).
LR_FOR_ATTEMPT_1 = 1e-3     # only takes effect if AUTO_LR below is pointed at it for attempt 1
LR_FOR_LATER      = 1e-4     # reduced rate applied to every attempt after the first

# Base model: first attempts borrow from another manuscript when possible,
# later attempts continue from the previous attempt of this manuscript.
is_first_attempt = ATTEMPT_ID == 1
if is_first_attempt:
    BASE_MODEL = newest_model_from_other_projects(ALL_MODELS_ROOT, PROJECT_ID)
    if not BASE_MODEL:
        print(f"‚ÑπÔ∏è Attempt {ATTEMPT_ID:02d}: no cross-manuscript base found. Training from scratch.")
    else:
        print(f"‚ÑπÔ∏è Attempt {ATTEMPT_ID:02d}: using cross-manuscript base model ‚Üí {BASE_MODEL}")
else:
    BASE_MODEL = find_previous_attempt_model(MODELS_DIR, ATTEMPT_ID)
    if not BASE_MODEL:
        print(f"‚ÑπÔ∏è Attempt {ATTEMPT_ID:02d}: previous attempt model not found; training from scratch.")
    else:
        print(f"‚ÑπÔ∏è Attempt {ATTEMPT_ID:02d}: resuming from previous attempt ‚Üí {BASE_MODEL}")

# None means "do not pass a learning rate" (the training cell then omits the flag).
# Replace None with LR_FOR_ATTEMPT_1 to enforce an explicit rate on attempt 1.
AUTO_LR = LR_FOR_LATER if ATTEMPT_ID > 1 else None

for label, value in (
    ("ATTEMPT_ID", f"{ATTEMPT_ID:02d}"),
    ("OUT_MODEL", OUT_MODEL),
    ("BASE_MODEL", BASE_MODEL),
    ("AUTO_LR", AUTO_LR),
):
    print(f"{label}: {value}")


## 7) Train recognition model

In [None]:

import shlex, subprocess
from pathlib import Path


def _xml_manifest(list_path: str) -> str:
    """Write a ketos manifest (one ALTO path per line) derived from ``list_path``.

    The list files hold ``image<TAB>xml`` rows, whereas ketos manifests passed
    with ``-t``/``-e`` expect a single ground-truth path per line. The last
    tab-separated field of each non-blank row is kept, so a list that already
    contains only XML paths passes through unchanged.

    Returns:
        Path of the manifest, written next to ``list_path`` as ``<stem>_xml.txt``.
    """
    src = Path(list_path)
    xml_paths = [
        row.split("\t")[-1].strip()
        for row in src.read_text(encoding="utf-8").splitlines()
        if row.strip()
    ]
    manifest = src.with_name(f"{src.stem}_xml.txt")
    manifest.write_text("".join(f"{p}\n" for p in xml_paths), encoding="utf-8")
    return str(manifest)


# Kraken documents CUDA devices as "cuda:<index>"; map the bare "cuda" that the
# notebook text suggests onto the first GPU.
ketos_device = "cuda:0" if DEVICE == "cuda" else DEVICE

cmd = [
    "ketos", "train",
    "-o", OUT_MODEL,
    "--workers", str(int(CPU_THREADS)),
    "--device", ketos_device,
    "-f", "alto",
    # Manifests go through -t / -e. Passing the .txt lists positionally would
    # make ketos try to parse the lists themselves as ALTO ground truth.
    "-t", _xml_manifest(TRAIN_LIST),
    "-e", _xml_manifest(VAL_LIST),
]

if BASE_MODEL:
    # NOTE(review): if the base model's alphabet differs from this dataset's,
    # ketos may also need a --resize mode — confirm against the Kraken docs.
    cmd += ["--load", BASE_MODEL]

if AUTO_LR is not None:
    # Add an explicit LR only when we decided to change it.
    # The ketos option is -r/--lrate; "--lr" is rejected as an unknown option.
    cmd += ["--lrate", str(AUTO_LR)]

print("Running:", " ".join(shlex.quote(x) for x in cmd))
result = subprocess.run(cmd, text=True)

if result.returncode != 0:
    raise SystemExit("‚ùå Training failed. Check the logs above.")

# ketos treats -o as a file-name prefix and stores the best checkpoint as
# "<prefix>_best.mlmodel". Copy it to OUT_MODEL so the evaluation cell and the
# attempt detection of later runs find the file under the expected name.
best_checkpoint = Path(f"{OUT_MODEL}_best.mlmodel")
if not Path(OUT_MODEL).exists() and best_checkpoint.exists():
    import shutil
    shutil.copy2(best_checkpoint, OUT_MODEL)

print(f"‚úÖ Training finished. Model at: {OUT_MODEL}")


## 8) Evaluate (CER/WER)

In [None]:

import shlex, subprocess
from pathlib import Path

# ketos writes its best checkpoint as "<prefix>_best.mlmodel"; fall back to
# that file when nothing exists at OUT_MODEL itself.
model_path = OUT_MODEL
if not Path(model_path).exists():
    best_checkpoint = f"{OUT_MODEL}_best.mlmodel"
    if Path(best_checkpoint).exists():
        model_path = best_checkpoint

# The validation list holds "image<TAB>xml" rows, but a manifest given to
# `ketos test -e` must contain one ALTO path per line. Keep the last field of
# each non-blank row (a list with XML paths only passes through unchanged).
val_src = Path(VAL_LIST)
val_manifest = val_src.with_name(f"{val_src.stem}_xml.txt")
val_manifest.write_text(
    "".join(
        f"{row.split(chr(9))[-1].strip()}\n"
        for row in val_src.read_text(encoding="utf-8").splitlines()
        if row.strip()
    ),
    encoding="utf-8",
)

# Pass the manifest with -e; given positionally, the list file itself would be
# treated as an ALTO document.
cmd = ["ketos", "test", "-f", "alto", "-m", model_path, "-e", str(val_manifest)]
print("Running:", " ".join(shlex.quote(x) for x in cmd))
res = subprocess.run(cmd, text=True)

if res.returncode == 0:
    print("‚úÖ Evaluation completed. See metrics above.")
else:
    raise SystemExit("‚ùå Evaluation failed. Check the logs above.")



## Notes

- **Auto-detected base model for Attempt 01:** we pick the newest `.mlmodel` from any other manuscript in `MyDrive/kraken_models/*/rec`. If none is found, we start from scratch.
- **Resume on later attempts:** we load the previous attempt’s `.mlmodel` automatically and (by default) **lower the LR** (`1e-4`). Adjust `LR_FOR_LATER` in the attempt cell if desired.
- **Only edit `PROJECT_ID`:** all other paths derive from it.
- **GPU:** enable T4 GPU in Colab (Runtime → Change runtime type) and set `DEVICE = "cuda"` in the settings cell.
