
# 📒 Kraken Training on Colab — **ALTO** (Makefile-style, subprocess logging)

This version:
- Assumes **ALTO** ground truth (eScriptorium exports)
- Auto-extracts uploaded ZIPs and discovers the roots
- Runs training through Python's **subprocess** module so you **see full logs** and the real exit code (no opaque `32512` "command not found" status)
- Lets you set learning rates/epochs/batch sizes in one place
- Packs models for download and (optionally) POSTs to **msia.escriptorium.fr**

> In Colab: **Runtime → Change runtime type → GPU** before running.


In [None]:

# 0) GPU check
# Print the GPU status table; `|| true` forces a zero exit status so the cell
# does not report a shell failure when `nvidia-smi` is missing or errors out.
!nvidia-smi || true


## 1) Install Kraken + deps (Python 3.12 safe)

In [None]:

import sys, subprocess

def pip_install(*pkgs):
    """Quietly install *pkgs* with the pip of the running interpreter.

    The requested requirement strings are echoed first. A non-zero pip exit
    status surfaces as ``subprocess.CalledProcessError``.
    """
    print("pip install", " ".join(pkgs))
    # Go through `python -m pip` so the packages land in this kernel's
    # environment rather than in whichever `pip` happens to be on PATH.
    command = [sys.executable, "-m", "pip", "install", "-q"]
    command.extend(pkgs)
    subprocess.check_call(command)

# Exact Kraken release to install; edit the pin here to train with another version.
KR_PIN = "kraken==5.3.0"  # Colab-friendly default
# Install Kraken plus the extra packages this notebook asks for explicitly.
# NOTE(review): the torch range may replace the runtime's preinstalled build — confirm.
pip_install(KR_PIN, "torch>=2.1,<3", "cairocffi", "opencv-python", "lxml", "h5py")


In [None]:

# Verify installs and CLI presence
from importlib.metadata import version, PackageNotFoundError
import torch, shutil, sys
def pkg_ver(name):
    """Return the installed version string of distribution *name*.

    Falls back to the literal text "not installed" when the distribution
    metadata cannot be found.
    """
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = "not installed"
    return installed
# Report interpreter, Kraken and PyTorch versions plus CUDA visibility.
print("python:", sys.version.split()[0])
print("kraken:", pkg_ver("kraken"))
print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
# `shutil.which` yields None when no `ketos` executable is found on PATH.
print("ketos on PATH:", shutil.which("ketos"))
# `|| true` forces a zero exit status even if `ketos --version` fails.
!ketos --version || true


## 2) Config — ALTO fixed, split, and (optional) Drive roots

In [None]:

# ======= CONFIG (ALTO) =======
# Ground-truth format name; interpolated after `-f` in the ketos commands below.
FORMAT = "alto"  # hard-coded

# Optional: add Drive roots if you also keep ALTO data on Drive.
# Each entry is a folder path; entries that do not exist are filtered out
# when the list of roots is built.
DRIVE_ROOTS = [
    # "/content/drive/MyDrive/my_alto_export_folder"
]

# Validation split fraction (share of the ALTO pages held out for validation;
# the split step always holds out at least one page).
VAL_FRACTION = 0.10
# =============================


### (Optional) Mount Google Drive if you use DRIVE_ROOTS

In [None]:

# Mount Google Drive at /content/drive so paths listed in DRIVE_ROOTS resolve.
# Only needed when DRIVE_ROOTS is used; skip this cell otherwise.
from google.colab import drive
drive.mount('/content/drive')


## 3) Upload ALTO ZIP(s) from Finder — auto-extract & discover roots

In [None]:

import os, io, zipfile, re
from google.colab import files

# Everything uploaded in this cell ends up below this folder.
LOCAL_BASE = "/content/data_alto"
os.makedirs(LOCAL_BASE, exist_ok=True)

print("Upload one or more ALTO export ZIPs. They will be extracted into", LOCAL_BASE)
uploaded = files.upload()

# Folders to scan for ALTO XML later on. Each root is recorded once, even when
# several uploads resolve to the same folder, so pages are not collected twice.
EXTRACTED_ROOTS = []
for name, data in uploaded.items():
    if name.lower().endswith(".zip"):
        # Drop a trailing " (N)" duplicate-name suffix so a re-uploaded archive
        # is extracted into the same folder as the first upload.
        base = re.sub(r"\s*\(\d+\)\s*$", "", os.path.splitext(name)[0])
        target_dir = os.path.join(LOCAL_BASE, base)
        try:
            # Extract straight from the uploaded bytes; no temporary copy of
            # the archive is written to disk.
            with zipfile.ZipFile(io.BytesIO(data)) as zf:
                os.makedirs(target_dir, exist_ok=True)
                zf.extractall(target_dir)
        except zipfile.BadZipFile as err:
            # One corrupt upload should not discard the other uploads.
            print(f"Skipping {name}: not a valid ZIP archive ({err})")
            continue
        root = target_dir
    else:
        # Loose (non-ZIP) uploads are stored directly in LOCAL_BASE.
        with open(os.path.join(LOCAL_BASE, name), "wb") as f:
            f.write(data)
        root = LOCAL_BASE
    if root not in EXTRACTED_ROOTS:
        EXTRACTED_ROOTS.append(root)

print("Extracted roots:", EXTRACTED_ROOTS if EXTRACTED_ROOTS else ["(none; you can still use DRIVE_ROOTS)"])


## 4) Build merged ALTO XML list + sanity checks

In [None]:

import os, glob, xml.etree.ElementTree as ET

# Existing data roots from uploads and Drive. `dict.fromkeys` removes repeated
# entries while keeping first-seen order, so a root listed twice is scanned
# (and its pages collected) only once.
ROOTS = list(dict.fromkeys(
    p for p in (EXTRACTED_ROOTS + DRIVE_ROOTS) if p and os.path.exists(p)
))
assert ROOTS, "No existing roots found. Upload a ZIP or set DRIVE_ROOTS."

def find_alto_xmls(root):
    """Return the sorted paths of all ALTO XML files found below *root*.

    A file qualifies when its name ends in ``.xml`` (any letter case), it
    parses as XML, and the root element's tag contains "alto"
    (case-insensitive; the tag includes the namespace URI when one is set).
    Files that cannot be read or parsed are skipped silently.

    Args:
        root: Directory to search recursively.

    Returns:
        A sorted list of unique file paths; empty when nothing matches or
        *root* does not exist.
    """
    # One recursive walk with a case-insensitive suffix test. A set guarantees
    # each path appears once, and directories named "*.xml" are excluded.
    candidates = {
        path
        for path in glob.glob(os.path.join(root, "**", "*"), recursive=True)
        if path.lower().endswith(".xml") and os.path.isfile(path)
    }
    out = []
    for xp in candidates:
        try:
            r = ET.parse(xp).getroot()
        except (ET.ParseError, OSError):
            # Malformed or unreadable XML cannot be ALTO ground truth.
            continue
        if isinstance(r.tag, str) and "alto" in r.tag.lower():
            out.append(xp)
    return sorted(out)

# Merge the ALTO pages of all roots into one list. Roots may overlap (for
# example a folder and one of its subfolders), so every page is kept only once,
# keyed by its resolved real path. A page listed twice could otherwise end up
# in both the training and the validation split.
xmls = []
_seen_xml_paths = set()
for root in ROOTS:
    found = find_alto_xmls(root)
    print(f"[ALTO] {len(found):>5} XMLs in {root}")
    for xp in found:
        real = os.path.realpath(xp)
        if real not in _seen_xml_paths:
            _seen_xml_paths.add(real)
            xmls.append(xp)

print("TOTAL ALTO XML files:", len(xmls))
assert len(xmls) >= 3, "Need at least a few ALTO XML pages."

# Heuristic image presence check (by filename stem match)
# An ALTO page counts as "missing" when no image file anywhere under the roots
# shares its file-name stem. This is only a hint; nothing is enforced here.
import collections
stems = collections.Counter(os.path.splitext(os.path.basename(p))[0] for p in xmls)
img_exts = {".jpg",".jpeg",".tif",".tiff",".png",".jp2",".bmp"}
img_candidates = []
for root in ROOTS:
    # Single recursive walk per root; the extension is compared in lower case
    # so files such as "page.JPG" or "page.TIF" are recognised as images too.
    img_candidates += [
        p for p in glob.glob(os.path.join(root, "**", "*"), recursive=True)
        if os.path.splitext(p)[1].lower() in img_exts
    ]
img_stems = {os.path.splitext(os.path.basename(p))[0] for p in img_candidates}
missing = [s for s in stems if s not in img_stems]
print("Heuristic: ALTO page stems without matching image file:", len(missing))
print("First few missing stems:", missing[:10])


## 5) Create explicit train/val lists

In [None]:

import random, pathlib
random.seed(42)

# Shuffle a sorted copy instead of `xmls` itself: with the fixed seed this
# yields the same split every time the cell is run, whereas shuffling the
# list in place would reshuffle an already-shuffled list on each re-run.
shuffled_xmls = sorted(xmls)
random.shuffle(shuffled_xmls)

# Hold out VAL_FRACTION of the pages for validation, but never fewer than one.
n_val = max(1, int(len(shuffled_xmls) * VAL_FRACTION))
val_xmls = shuffled_xmls[:n_val]
train_xmls = shuffled_xmls[n_val:]

# One path per line; `write_text` opens, writes and closes the file, so the
# lists are fully on disk before the training commands read them.
lists_dir = pathlib.Path("/content/lists")
lists_dir.mkdir(parents=True, exist_ok=True)
(lists_dir / "train.txt").write_text("\n".join(train_xmls), encoding="utf-8")
(lists_dir / "val.txt").write_text("\n".join(val_xmls), encoding="utf-8")

len(train_xmls), len(val_xmls)


## 6) Hyperparameters (+ auto device)

In [None]:

# ======= EDIT ME (Hyperparameters) =======
import torch
# Use the first CUDA device when PyTorch can see one, otherwise the CPU.
# The value is interpolated after `--device` in the training commands.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)

# Recognition (values interpolated into the `ketos train` command)
REC_EPOCHS = 30
REC_BATCH  = 8
REC_OPTIM  = "Adam"      # "Adam" or "SGD"
REC_LR     = 0.0001      # change LR here (e.g., 0.00005)
REC_WD     = 1e-5        # weight decay

# Segmentation (values interpolated into the `ketos segtrain` command)
SEG_EPOCHS = 20
SEG_BATCH  = 2
SEG_OPTIM  = "Adam"
SEG_LR     = 0.0005      # change LR here
# =========================================


### Helper: run a shell command with full logs

In [None]:

import shutil, subprocess, textwrap

# Absolute path of the `ketos` executable when it is found on PATH; otherwise
# the bare command name, leaving the lookup to the shell at run time.
KETOS = shutil.which("ketos") or "ketos"

def run_logged(cmd: str) -> int:
    """Run *cmd* through the shell, echoing its output live, and return its exit code.

    The command text is dedented and stripped, printed, then executed with
    ``shell=True``. The child's stdout and stderr are merged and forwarded
    line by line through ``print``, so the log appears wherever Python's
    ``sys.stdout`` goes (the notebook cell output) instead of depending on
    the kernel capturing OS-level file descriptors.

    Args:
        cmd: Shell command line; may be an indented multi-line string.

    Returns:
        The exit status of the command.
    """
    cmd = textwrap.dedent(cmd).strip()
    print(">>> Running:\n", cmd)
    with subprocess.Popen(
        cmd,
        shell=True,
        text=True,
        errors="replace",          # never fail on undecodable bytes in the log
        bufsize=1,                 # line-buffered pipe for timely output
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # keep stderr in order with stdout
    ) as proc:
        for line in proc.stdout:
            # Flush per line so long-running training shows progress promptly.
            print(line, end="", flush=True)
    # Leaving the `with` block waits for the process, so returncode is set.
    print("<<< Return code:", proc.returncode)
    return proc.returncode


## 7) Target: `train_seg` — Segmentation training (full logs)

In [None]:

import os

# The output prefix lives in /content/models; create the folder up front so
# the run cannot fail merely because it does not exist yet.
os.makedirs("/content/models", exist_ok=True)

# ketos is a Click application: it has no `@file` argument expansion. File
# lists are passed with -t/--training-files and -e/--evaluation-files, and the
# learning rate option is --lrate.
# NOTE(review): confirm with `ketos segtrain --help` that the pinned Kraken
# version accepts --batch-size for segmentation training.
cmd = " ".join([
    KETOS, "segtrain",
    "-f", FORMAT,
    "-o", "/content/models/seg",
    "--device", DEVICE,
    "--epochs", str(SEG_EPOCHS),
    "--batch-size", str(SEG_BATCH),
    "--optimizer", SEG_OPTIM,
    "--lrate", str(SEG_LR),
    "-t", "/content/lists/train.txt",
    "-e", "/content/lists/val.txt",
])
rc = run_logged(cmd)
assert rc == 0, "segmentation training failed"


## 8) Target: `train_recog` — Recognition training (full logs)

In [None]:

import os

# The output prefix lives in /content/models; create the folder up front so
# the run cannot fail merely because it does not exist yet.
os.makedirs("/content/models", exist_ok=True)

# ketos is a Click application: it has no `@file` argument expansion. File
# lists are passed with -t/--training-files and -e/--evaluation-files, and the
# learning rate option is --lrate.
# NOTE(review): verify the option names with `ketos train --help` for the
# pinned Kraken version.
cmd = " ".join([
    KETOS, "train",
    "-f", FORMAT,
    "-o", "/content/models/rec",
    "--device", DEVICE,
    "--epochs", str(REC_EPOCHS),
    "--batch-size", str(REC_BATCH),
    "--optimizer", REC_OPTIM,
    "--lrate", str(REC_LR),
    "--weight-decay", str(REC_WD),
    "-t", "/content/lists/train.txt",
    "-e", "/content/lists/val.txt",
])
rc = run_logged(cmd)
assert rc == 0, "recognition training failed"


## 9) Target: `eval` — Evaluate recognition (CER/WER)

In [None]:

# `ketos test` takes the model through -m/--model and a file list through
# -e/--evaluation-files; a bare positional path is treated as test data, and
# there is no `@file` expansion.
# NOTE(review): verify the option names with `ketos test --help` for the
# pinned Kraken version.
cmd = " ".join([
    KETOS, "test",
    "-f", FORMAT,
    "-m", "/content/models/rec_best.mlmodel",
    "-e", "/content/lists/val.txt",
])
rc = run_logged(cmd)
assert rc == 0, "evaluation failed"


## 10) Package models for download

In [None]:

# List the files in /content/models and zip the folder's contents into
# /content/trained_models.zip; the `&&` chain stops at the first failing step.
!cd /content/models && ls -lh && zip -r ../trained_models.zip . && cd /content


## 11) Optional: Upload to msia.escriptorium.fr via API

In [None]:

# UI upload (My Models → Upload) is simplest.
# This cell shows how to POST a model programmatically if your instance supports it.
MSIA_URL   = "https://msia.escriptorium.fr"
API_TOKEN  = "PASTE_YOUR_TOKEN_HERE"   # keep secret; or leave blank and use UI upload
MODEL_PATH = "/content/models/rec_best.mlmodel"   # or seg_best.mlmodel
MODEL_NAME = "rec_best"

# Only attempt the upload once the placeholder has been replaced by a token.
if API_TOKEN and API_TOKEN != "PASTE_YOUR_TOKEN_HERE":
    import requests
    headers = {"Authorization": f"Token {API_TOKEN}"}
    form_fields = {"name": MODEL_NAME}
    # Names other than `files` / `data` are used on purpose so the
    # `google.colab.files` module imported by the upload cell is not rebound.
    # The `with` block closes the model file once the request has finished.
    with open(MODEL_PATH, "rb") as model_fh:
        multipart = {"file": (MODEL_NAME + ".mlmodel", model_fh, "application/octet-stream")}
        resp = requests.post(
            f"{MSIA_URL}/api/models/",
            headers=headers,
            files=multipart,
            data=form_fields,
            timeout=(10, 600),  # (connect, read) seconds; avoids hanging forever
        )
    print("Status:", resp.status_code)
    try:
        print(resp.json())
    except ValueError:
        # The body was not JSON (e.g. an HTML error page): show its beginning.
        print(resp.text[:800])
else:
    print("Skipping API upload. Use UI upload or paste a valid API token.")
