<a href="https://colab.research.google.com/github/mahb97/Wakeifier/blob/main/Wake2vec_heartbeat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Wake2Vec Heartbeat (Resume from 300)
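Read-only monitor + safety mirror (the only file it writes into the run directory is the `_heartbeat.touch` marker). Uses the latest local run in `/content/runs/t4_*`, falling back to the latest run under `wake2vec/runs/` on Drive when no local run exists, and mirrors the newest full checkpoint and the metrics JSON files to `sentry_backups/` on Drive.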


In [1]:
# Colab-only helper used to attach Google Drive to the VM filesystem.
from google.colab import drive
# Mount Drive at /content/drive; force_remount=False reuses an existing mount
# instead of re-mounting when this cell is re-run.
drive.mount('/content/drive', force_remount=False)
print("Drive mounted.")

Mounted at /content/drive
Drive mounted.


Resolve active RUN

In [2]:
import pathlib, time

# Name of a specific run directory to watch; leave as None to auto-select the
# most recently modified t4_* run.
RUN_ID = None

# Root of the runs on the local Colab VM, and the project root on Google Drive
# (Drive runs live under DRIVE_ROOT/"runs").
LOCAL_ROOT = pathlib.Path("/content/runs")
DRIVE_ROOT = pathlib.Path("/content/drive/MyDrive/wake2vec")
def latest_run(root):
    """Return the most recently modified ``t4_*`` run directory under ``root``.

    Args:
        root: ``pathlib.Path`` of the directory that contains the run folders.

    Returns:
        The ``t4_*`` directory with the largest ``st_mtime``, or ``None`` when
        ``root`` does not exist or holds no such directory.
    """
    if not root.exists():
        return None
    newest, newest_mtime = None, None
    for p in root.glob("t4_*"):
        try:
            # Only directories are runs; a stray file such as an archive or a
            # note named t4_* must not be selected as the active run.
            if not p.is_dir():
                continue
            mtime = p.stat().st_mtime
        except OSError:
            # Entry vanished or is temporarily unreadable (e.g. mid-sync); skip it.
            continue
        # Strict ">" keeps the first entry on ties, like max() did.
        if newest_mtime is None or mtime > newest_mtime:
            newest, newest_mtime = p, mtime
    return newest

# Candidate run directories: the pinned RUN_ID if one is set, otherwise the
# newest t4_* run found in each root.
LOCAL_RUN  = (LOCAL_ROOT/RUN_ID) if RUN_ID else latest_run(LOCAL_ROOT)
DRIVE_RUN  = (DRIVE_ROOT/"runs"/RUN_ID) if RUN_ID else latest_run(DRIVE_ROOT/"runs")
# Prefer the local copy, but only when it really exists on disk. A Path object
# is always truthy, so `LOCAL_RUN or DRIVE_RUN` would never fall back to Drive
# when RUN_ID is pinned and the local directory is missing.
RUN = next((p for p in (LOCAL_RUN, DRIVE_RUN) if p is not None and p.exists()), None)
assert RUN is not None, "No local or Drive t4_* run found."

print("Watching:", RUN, "| mtime:", time.ctime(RUN.stat().st_mtime))
# Per-run backup folder on Drive that receives the mirrored checkpoint and metrics.
SENTRY = DRIVE_ROOT/"sentry_backups"/RUN.name
SENTRY.mkdir(parents=True, exist_ok=True)

Watching: /content/drive/MyDrive/wake2vec/runs/t4_1762376560 | mtime: Wed Nov  5 21:02:40 2025


## Loss tail (last 5 logged training losses)

In [3]:
import json

# Show the five most recent training-loss entries. The streamed loss log is the
# preferred source; the newest checkpoint's trainer_state.json is the fallback.
mlog = RUN/"metrics"/"phase1_loss_log.json"
if not mlog.exists():
    # Fallback
    cks = list(RUN.glob("checkpoint-*"))
    # Order by the numeric step suffix so that e.g. 1000 sorts after 900.
    cks.sort(key=lambda ck: int(ck.name.rsplit("-", 1)[-1]))
    if not cks or not (cks[-1]/"trainer_state.json").exists():
        print("[LOSS] no logs yet")
    else:
        state = json.loads((cks[-1]/"trainer_state.json").read_text())
        # Keep only log entries that carry a training loss.
        tail = [(rec["step"], rec["loss"]) for rec in state.get("log_history", []) if "loss" in rec]
        tail = tail[-5:]
        print("[LOSS state ] last:", tail or "—")
else:
    logs = json.loads(mlog.read_text())
    tail = logs[-5:]
    print("[LOSS stream] last:", [(rec["step"], round(float(rec["loss"]), 4)) for rec in tail])

[LOSS stream] last: [(350, 5.4883), (400, 5.3082), (450, 4.7776), (500, 4.0891), (550, 3.2604)]


Eval tail

In [4]:
import json
# Report the last three evaluation records stored in the newest checkpoint's
# trainer_state.json.
cks = list(RUN.glob("checkpoint-*"))
# Order by the numeric step suffix so the last element is the newest checkpoint.
cks.sort(key=lambda ck: int(ck.name.rsplit("-", 1)[-1]))
if not cks:
    print("[EVAL] no checkpoints yet")
else:
    p = cks[-1]/"trainer_state.json"
    if not p.exists():
        print("[EVAL] none yet (next at 400/600/800...)")
    else:
        state = json.loads(p.read_text())
        # Evaluation entries are the log records that contain an "eval_loss" key.
        evals = list(filter(lambda rec: "eval_loss" in rec, state.get("log_history", [])))[-3:]
        print("[EVAL] tail:", evals or "—")

[EVAL] tail: [{'epoch': 3.50989010989011, 'eval_loss': 6.237724304199219, 'eval_runtime': 13.5689, 'eval_samples_per_second': 3.538, 'eval_steps_per_second': 0.442, 'step': 200}, {'epoch': 7.0175824175824175, 'eval_loss': 6.416950225830078, 'eval_runtime': 13.5887, 'eval_samples_per_second': 3.532, 'eval_steps_per_second': 0.442, 'step': 400}, {'epoch': 10.527472527472527, 'eval_loss': 7.096441268920898, 'eval_runtime': 13.6439, 'eval_samples_per_second': 3.518, 'eval_steps_per_second': 0.44, 'step': 600}]


## Checkpoint audit (mirror newest full checkpoint + metrics to Drive)

In [5]:
import shutil, time

def latest_full_ckpt(root):
    """Return the highest-numbered ``checkpoint-<step>`` under ``root`` that holds model weights.

    A checkpoint counts as "full" when it contains a single-file weight dump
    (``model.safetensors`` / ``pytorch_model.bin``) or the index file of a
    sharded dump (``*.index.json``).

    Args:
        root: ``pathlib.Path`` of the run directory to scan.

    Returns:
        The matching checkpoint path, or ``None`` if no checkpoint has weights.
    """
    weight_markers = (
        "model.safetensors",
        "pytorch_model.bin",
        "model.safetensors.index.json",
        "pytorch_model.bin.index.json",
    )
    numbered = []
    for ck in root.glob("checkpoint-*"):
        try:
            step = int(ck.name.split("-")[-1])
        except ValueError:
            # Entries such as "checkpoint-best" have no step number; skip them
            # rather than aborting the whole scan.
            continue
        numbered.append((step, ck))
    # Walk newest-first so a partially written latest checkpoint falls through
    # to the previous complete one.
    for _, ck in sorted(numbered, key=lambda item: item[0], reverse=True):
        if any((ck/marker).exists() for marker in weight_markers):
            return ck
    return None

# Mirror the newest full checkpoint (and the metrics JSON files) into the
# per-run sentry backup folder.
src = latest_full_ckpt(RUN)
if src is None:
    print("[SENTRY] No full checkpoint yet; wait for next save.")
else:
    dst = SENTRY/src.name
    if not dst.exists():
        # Copy into a staging directory and rename only once the copy has
        # finished. If the copy is interrupted, `dst` never exists in a
        # half-written state, so the next run retries instead of reporting
        # "already has". The staging name deliberately does not match
        # "checkpoint-*".
        staging = SENTRY/("_partial_" + src.name)
        if staging.exists():
            shutil.rmtree(staging)  # leftover from an interrupted earlier copy
        shutil.copytree(src, staging)
        staging.rename(dst)
        print(f"[SENTRY] mirrored {src.name} (mtime {time.ctime(src.stat().st_mtime)})")
    else:
        print("[SENTRY] already has", src.name)
    # Mirror metrics
    msrc = RUN/"metrics"; mdst = SENTRY/"metrics"; mdst.mkdir(parents=True, exist_ok=True)
    copied = 0
    if msrc.exists():
        for f in msrc.glob("*.json"):
            # copy2 overwrites the previous mirror and preserves timestamps.
            shutil.copy2(f, mdst/f.name); copied += 1
    print(f"[SENTRY] metrics mirrored ({copied} files)")

[SENTRY] already has checkpoint-300
[SENTRY] metrics mirrored (1 files)


## Embedding snapshots (quick view)

In [6]:
# Summarise the embedding snapshots saved on Drive for the watched run.
SNAPS_DIR = DRIVE_ROOT/"emb_snaps"/RUN.name
if not SNAPS_DIR.exists():
    print("[SNAPS] none yet (first at step 350 if every 50)")
else:
    # Sorted by file name; the last entry is reported as the latest snapshot.
    snaps = sorted(SNAPS_DIR.glob("emb_step*.pt"))
    print(f"[SNAPS] count={len(snaps)}  latest=", snaps[-1].name if snaps else "—")
    hb = SNAPS_DIR/"heartbeat.json"
    if hb.exists():
        # Print the heartbeat file verbatim.
        print("[SNAPS] heartbeat:", hb.read_text())

[SNAPS] count=7  latest= emb_step0650.pt
[SNAPS] heartbeat: {
  "step": 650,
  "rows": 32000,
  "dim": 2048,
  "ts": 1762727444.428657
}


## Optional: light sync (touch a heartbeat file in the run directory and flush writes)

In [7]:
import os, time, pathlib
# Write the current Unix timestamp into a marker file inside the watched run
# directory. This is the one write this notebook makes into RUN.
touch = RUN/"_heartbeat.touch"
touch.write_text(str(time.time()))
# Ask the OS to flush buffered writes to disk. This is only a hint as far as
# any Drive upload is concerned. NOTE(review): confirm it actually speeds up
# Drive syncing.
os.sync()
print("[SYNC] touched + sync hinted →", touch)

[SYNC] touched + sync hinted → /content/drive/MyDrive/wake2vec/runs/t4_1762376560/_heartbeat.touch
