
# 📒 Kraken Training on Colab — Makefile-Style (msia compatible)

This notebook mirrors a **Makefile-style** workflow for training **Kraken** models:
- Pull data from **Google Drive** and/or **local uploads**
- Train **segmentation** and **recognition** with **custom learning rates**
- Evaluate recognition (CER/WER)
- Package outputs for download
- (Optional) Upload `.mlmodel` directly to your eScriptorium instance (**msia.escriptorium.fr**)

> In Colab: **Runtime → Change runtime type → GPU** before running.


In [None]:

# 0) GPU check
# Runs nvidia-smi in a shell; `|| true` makes the shell command succeed even
# when nvidia-smi is missing or exits non-zero (e.g. no GPU attached).
!nvidia-smi || true



## Optional: Detect Kraken version from an existing model
Upload **any working `.mlmodel`** from your instance to pin to a compatible line.  
If detection fails or you skip, we fall back to a Colab-friendly default.


In [None]:

from google.colab import files

# Optional upload: leave the dialog empty to skip version detection.
up = files.upload()

# Use the first key of the upload result as the model path;
# None signals "nothing uploaded" to the detection cell.
if up:
    MODEL_PATH = next(iter(up))
else:
    MODEL_PATH = None
MODEL_PATH


In [None]:

# Decide a kraken pin robustly for Colab (Python 3.12 compatible)
import re
import sys

# Version string found inside the uploaded model, e.g. "5.2" or "5.2.9".
detected = None
if MODEL_PATH:
    # Best-effort probe: any failure only skips detection, it never aborts the cell.
    # NOTE(review): kraken writes .mlmodel files through coremltools (protobuf),
    # so h5py probably cannot open them and this probe will usually fall
    # through to the default pin below — confirm.
    try:
        import h5py
        with h5py.File(MODEL_PATH, "r") as model_file:
            # First attribute (in this order) that contains something version-like wins.
            for attr_name in ("version","kraken_version","software","generator","creator"):
                if attr_name not in model_file.attrs:
                    continue
                raw = model_file.attrs[attr_name]
                if isinstance(raw, (bytes, bytearray)):
                    raw = raw.decode()
                match = re.search(r"(\d+\.\d+(?:\.\d+)?)", str(raw))
                if match:
                    detected = match.group(1)
                    break
    except Exception as e:
        print("Model inspection skipped:", e)

if not detected:
    # Colab often runs Python 3.12; kraken 5.2.* won't install there.
    KR_PIN = "kraken==5.3.0" if sys.version_info >= (3,12) else "kraken==5.2.*"
else:
    # Pin to the detected major.minor series and let pip choose the patch release.
    major, minor = detected.split('.')[:2]
    KR_PIN = f"kraken=={major}.{minor}.*"

print("Requested kraken pin:", KR_PIN)


In [None]:

# 1) Install kraken + deps (with safe fallback to 5.3.0 on Py3.12)
import sys, subprocess

def pip_install(*pkgs):
    """Install the given requirement strings with the running interpreter's pip.

    Echoes the equivalent command line first. Raises
    ``subprocess.CalledProcessError`` when pip exits with a non-zero status.
    """
    print("pip install", " ".join(pkgs))
    # Use `python -m pip` so packages land in the environment of this interpreter.
    command = [sys.executable, "-m", "pip", "install", "-q"]
    command.extend(pkgs)
    subprocess.check_call(command)

# First attempt: install the requested kraken pin together with its companions.
try:
    pip_install(KR_PIN, "torch>=2.1,<3", "cairocffi", "opencv-python", "lxml", "h5py")
except Exception as e:
    # Any failure of the first attempt (pip_install raises when pip exits
    # non-zero) switches to the fixed 5.3.0 pin and retries once; a failure
    # of the retry propagates and stops the cell.
    print("Primary install failed:", e)
    KR_PIN = "kraken==5.3.0"
    pip_install(KR_PIN, "torch>=2.1,<3", "cairocffi", "opencv-python", "lxml", "h5py")


In [None]:

# Verify installs without relying on kraken.__version__
from importlib.metadata import version, PackageNotFoundError
import shutil, torch, sys

def pkg_ver(name):
    """Return the installed version of distribution *name*.

    Falls back to the string "not installed" when no metadata for the
    distribution can be found.
    """
    try:
        found = version(name)
    except PackageNotFoundError:
        found = "not installed"
    return found

# Environment report: interpreter version, installed kraken distribution,
# torch version and whether torch can see a CUDA device.
print("python:", sys.version.split()[0])
print("kraken:", pkg_ver("kraken"))
print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
# shutil.which returns None when no `ketos` executable is found on PATH.
print("ketos on PATH:", shutil.which("ketos"))
# `|| true` makes the shell command succeed even if ketos is missing or fails.
!ketos --version || true


## Config — paths, format, and validation split (Makefile-style)

In [None]:

# ======= CONFIG (EDIT ME) =======
# Kraken input format flag for training: one of {"xml", "alto", "page"}
# (handed to `ketos ... -f` by the training and evaluation cells)
FORMAT = "xml"

# Google Drive roots (add/remove as you like)
# Roots that do not exist on disk are dropped when the XML list is built.
DRIVE_ROOTS = [
    "/content/drive/MyDrive/avestan_gt",   # EDIT to your folder(s)
    # "/content/drive/MyDrive/another_dataset"
]

# Local roots inside Colab (uploads/extracted zips will live here)
LOCAL_ROOTS = [
    "/content/data_local"
]

# Validation split fraction
# (share of the collected XML files written to val.txt; the split cell always
# reserves at least one file for validation)
VAL_FRACTION = 0.10
# ================================


### Mount Google Drive (if using DRIVE_ROOTS)

In [None]:

from google.colab import drive
# Mount Drive at /content/drive, the prefix used by the DRIVE_ROOTS paths.
drive.mount('/content/drive')



### Upload local files/zips (optional)
Upload **ZIPs** (auto-extracted) or loose files. Everything lands under `/content/data_local`.


In [None]:

import os, io, zipfile
from google.colab import files

# Make sure the local data root exists before anything is written into it.
os.makedirs("/content/data_local", exist_ok=True)
print("Upload zips or files (you can select multiple). Zips are extracted to /content/data_local.")
uploaded = files.upload()
for name in uploaded:
    data = uploaded[name]
    path = f"/content/data_local/{name}"
    # Every upload is first stored under its own name.
    with open(path, "wb") as f:
        f.write(data)
    if not name.lower().endswith(".zip"):
        continue
    # Archives are unpacked from the in-memory bytes, then the stored copy is removed.
    with zipfile.ZipFile(io.BytesIO(data)) as zf:
        zf.extractall("/content/data_local")
    os.remove(path)
print("Local root ready at /content/data_local")


## Build merged XML list from all roots + sanity checks

In [None]:

import os, glob, xml.etree.ElementTree as ET


def _referenced_images(doc_root):
    """Return the image paths a parsed PAGE or ALTO document refers to.

    PAGE XML stores the name in ``<Page imageFilename="...">``; ALTO stores it
    as the text of ``<sourceImageInformation><fileName>``. Namespaces are
    matched with the ``{*}`` wildcard. Empty or missing references are dropped.
    """
    refs = [page.get("imageFilename") for page in doc_root.findall(".//{*}Page")]
    for file_name in doc_root.findall(".//{*}sourceImageInformation/{*}fileName"):
        refs.append((file_name.text or "").strip())
    return [ref for ref in refs if ref]


# Only roots that exist are scanned (e.g. Drive not mounted, nothing uploaded).
ROOTS = [p for p in (DRIVE_ROOTS + LOCAL_ROOTS) if os.path.exists(p)]
assert ROOTS, "No existing roots found. Check DRIVE_ROOTS/LOCAL_ROOTS paths."

# Collect every *.xml below each root, recursively, in sorted order per root.
xmls = []
for data_root in ROOTS:
    found = sorted(glob.glob(os.path.join(data_root, "**", "*.xml"), recursive=True))
    if found:
        print(f"[OK] {len(found):>5} XMLs in {data_root}")
        xmls.extend(found)
    else:
        print(f"[..]    0 XMLs in {data_root} (skipped)")
print("TOTAL XML files:", len(xmls))
assert len(xmls) >= 3, "Need at least a few XML pages."

# Sample check: the image referenced by each PAGE (<Page imageFilename='...'>)
# or ALTO (<sourceImageInformation><fileName>) file must exist on disk,
# resolved relative to the directory of the XML file.
missing = []
for xp in xmls[:800]:  # sample first 800; remove slice to check all
    try:
        doc_root = ET.parse(xp).getroot()
    except ET.ParseError:
        print("XML parse error:", xp)
        continue
    for fn in _referenced_images(doc_root):
        img_path = os.path.join(os.path.dirname(xp), fn)
        if not os.path.exists(img_path):
            # Report only the first missing image per XML file.
            missing.append((xp, fn))
            break

print("Missing referenced images:", len(missing))
if missing[:5]:
    print("First few missing:", missing[:5])


## Create explicit train/val lists (like Makefile prereqs)

In [None]:

import random, pathlib

# Shuffle a copy instead of `xmls` itself: shuffling in place would make every
# re-run of this cell start from an already-shuffled list and so produce a
# different split despite the fixed seed.
random.seed(42)
shuffled = list(xmls)
random.shuffle(shuffled)

# Always keep at least one file for validation.
n_val = max(1, int(len(shuffled) * VAL_FRACTION))
val_xmls = shuffled[:n_val]
train_xmls = shuffled[n_val:]

# Write one path per line; write_text opens, writes and closes the file, so
# the manifests are complete on disk before the training cells read them.
lists_dir = pathlib.Path("/content/lists")
lists_dir.mkdir(parents=True, exist_ok=True)
(lists_dir / "train.txt").write_text("\n".join(train_xmls), encoding="utf-8")
(lists_dir / "val.txt").write_text("\n".join(val_xmls), encoding="utf-8")

len(train_xmls), len(val_xmls)


## Hyperparameters (edit once; used by targets below)

In [None]:

# ======= EDIT ME (Hyperparameters) =======
import torch
# Use the first CUDA device when torch can see one, otherwise fall back to CPU.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
print("Using device:", DEVICE)

# Recognition (read by the train_recog cell)
REC_EPOCHS = 30       # number of training epochs
REC_BATCH  = 8        # batch size
REC_OPTIM  = "Adam"   # "Adam" or "SGD"
REC_LR     = 1e-4     # If your Kraken errors on --lr, use --lrate in the command cell
REC_WD     = 1e-5     # weight decay

# Segmentation (read by the train_seg cell)
SEG_EPOCHS = 20       # number of training epochs
SEG_BATCH  = 2        # batch size
SEG_OPTIM  = "Adam"   # optimizer name
SEG_LR     = 5e-4     # learning rate
# =========================================


## Target: `train_seg` — Segmentation training

In [None]:

# Create the output directory the training cells write into
# (/content/models/seg*, /content/models/rec*); -p makes this a no-op if it exists.
!mkdir -p /content/models


In [None]:

# ===== train_seg =====
# Edit SEG_* in the hyperparameter cell to change learning rate, epochs, etc.
# ketos option names used here: --lrate (learning rate), --epochs,
# --training-files / --evaluation-files (manifests with one XML path per line).
# To continue training from a prior segmentation model, append:
#    "--load", "/path/to/seg_base.mlmodel"

import os
import shlex
import subprocess

# Option placement differs between kraken releases: newer ones take --device
# on the top-level `ketos` command, older ones on the subcommand, and not every
# release of segtrain offers --batch-size. Ask the installed CLI.
seg_help = subprocess.run(
    ["ketos", "segtrain", "--help"], capture_output=True, text=True
).stdout
seg_device_on_subcommand = "--device" in seg_help
seg_device_args = ["--device", DEVICE]

seg_args = ["ketos"]
if not seg_device_on_subcommand:
    seg_args += seg_device_args
seg_args += ["segtrain", "--format-type", FORMAT, "--output", "/content/models/seg"]
if seg_device_on_subcommand:
    seg_args += seg_device_args
seg_args += [
    "--epochs", str(SEG_EPOCHS),
    "--optimizer", SEG_OPTIM,
    "--lrate", str(SEG_LR),
]
if "--batch-size" in seg_help:
    seg_args += ["--batch-size", str(SEG_BATCH)]
else:
    print("Note: this ketos segtrain has no --batch-size option; SEG_BATCH is ignored.")
seg_args += [
    "--training-files", "/content/lists/train.txt",
    "--evaluation-files", "/content/lists/val.txt",
]

# Shell-quoted rendering for the log; the command itself runs without a shell.
seg_cmd = shlex.join(seg_args)
print(seg_cmd)
# check=True raises CalledProcessError on failure instead of silently
# continuing to the next cell with no model.
subprocess.run(seg_args, check=True)


## Target: `train_recog` — Recognition training

In [None]:

# ===== train_recog =====
# Edit REC_* in the hyperparameter cell to change learning rate, epochs, etc.
# ketos option names used here: --lrate (learning rate), --epochs,
# --batch-size, --weight-decay, and --training-files / --evaluation-files
# (manifests with one XML path per line).
# To continue training from a prior recognizer, append:
#    "--load", "/path/to/rec_base.mlmodel"

import os
import shlex
import subprocess

# Newer kraken releases take --device on the top-level `ketos` command, older
# ones on the subcommand. Ask the installed CLI which one applies.
rec_help = subprocess.run(
    ["ketos", "train", "--help"], capture_output=True, text=True
).stdout
rec_device_on_subcommand = "--device" in rec_help
rec_device_args = ["--device", DEVICE]

rec_args = ["ketos"]
if not rec_device_on_subcommand:
    rec_args += rec_device_args
rec_args += ["train", "--format-type", FORMAT, "--output", "/content/models/rec"]
if rec_device_on_subcommand:
    rec_args += rec_device_args
rec_args += [
    "--epochs", str(REC_EPOCHS),
    "--batch-size", str(REC_BATCH),
    "--optimizer", REC_OPTIM,
    "--lrate", str(REC_LR),
    "--weight-decay", str(REC_WD),
    "--training-files", "/content/lists/train.txt",
    "--evaluation-files", "/content/lists/val.txt",
]

# Shell-quoted rendering for the log; the command itself runs without a shell.
rec_cmd = shlex.join(rec_args)
print(rec_cmd)
# check=True raises CalledProcessError on failure instead of silently
# continuing to the evaluation cell with no model.
subprocess.run(rec_args, check=True)


## Target: `eval` — Evaluate recognition (CER/WER)

In [None]:

# ===== eval =====
# Evaluate the best recognition model on the validation list.
# Note: the same list is given to training as its evaluation set, so these
# scores are not from fully unseen data.

import os
import shlex
import subprocess

# `ketos test` takes the model through --model and a manifest (one XML path
# per line) through --evaluation-files.
eval_args = [
    "ketos", "test",
    "--format-type", FORMAT,
    "--model", "/content/models/rec_best.mlmodel",
    "--evaluation-files", "/content/lists/val.txt",
]

# Shell-quoted rendering for the log; the command itself runs without a shell.
eval_cmd = shlex.join(eval_args)
print(eval_cmd)
# check=True raises CalledProcessError if the evaluation fails.
subprocess.run(eval_args, check=True)


## Package models (like `make package`)

In [None]:

# List the trained models, then zip the contents of /content/models into
# /content/trained_models.zip (each step runs only if the previous one succeeded).
!cd /content/models && ls -lh && zip -r ../trained_models.zip . && cd /content


## Optional: Upload to msia.escriptorium.fr via API (like `make upload`)

In [None]:

# UI upload (My Models → Upload) is simplest.
# This cell shows how to POST a model programmatically if your instance supports it.

MSIA_URL   = "https://msia.escriptorium.fr"
API_TOKEN  = "PASTE_YOUR_TOKEN_HERE"   # keep secret; or leave blank and use UI upload
MODEL_PATH = "/content/models/rec_best.mlmodel"   # change to seg_best.mlmodel to upload the segmenter
MODEL_NAME = "rec_best"

# Upload only when a real token was pasted in; blank or placeholder skips it.
if API_TOKEN and API_TOKEN != "PASTE_YOUR_TOKEN_HERE":
    import requests
    headers = {"Authorization": f"Token {API_TOKEN}"}
    data  = {"name": MODEL_NAME}
    # Open the model in a `with` block so the handle is closed after the
    # request; the multipart dict is not called `files`, which would shadow
    # the google.colab `files` module used by the upload cells.
    with open(MODEL_PATH, "rb") as model_file:
        multipart = {"file": (MODEL_NAME + ".mlmodel", model_file, "application/octet-stream")}
        # (connect, read) timeouts so an unreachable server cannot hang the cell forever.
        resp = requests.post(
            f"{MSIA_URL}/api/models/",
            headers=headers,
            files=multipart,
            data=data,
            timeout=(10, 300),
        )
    print("Status:", resp.status_code)
    try:
        print(resp.json())
    except ValueError:
        # Body is not JSON (e.g. an HTML error page): show the start of the raw text.
        print(resp.text[:800])
else:
    print("Skipping API upload. Use UI upload or paste a valid API token.")
