<a href="https://colab.research.google.com/github/mahb97/Wake2vec/blob/main/Wake2vec_heartbeat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Wake2Vec Heartbeat (Resume from 300)
Read-only monitor + safety mirror. Uses latest local run in `/content/runs/t4_*` and mirrors the newest full checkpoint to Drive.


In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=False)
print("Drive mounted.")

Mounted at /content/drive
Drive mounted.


Resolve active RUN

In [14]:
import pathlib, time

RUN_ID = None

LOCAL_ROOT = pathlib.Path("/content/runs")
DRIVE_ROOT = pathlib.Path("/content/drive/MyDrive/wake2vec")
def latest_run(root):
    if not root.exists(): return None
    runs = []
    for p in root.glob("t4_*"):
        try: runs.append((p.stat().st_mtime, p))
        except FileNotFoundError: pass
    return max(runs, key=lambda x: x[0])[1] if runs else None

LOCAL_RUN  = (LOCAL_ROOT/RUN_ID) if RUN_ID else latest_run(LOCAL_ROOT)
DRIVE_RUN  = (DRIVE_ROOT/"runs"/RUN_ID) if RUN_ID else latest_run(DRIVE_ROOT/"runs")
RUN = LOCAL_RUN or DRIVE_RUN
assert RUN is not None, "No local or Drive t4_* run found."

print("Watching:", RUN, "| mtime:", time.ctime(RUN.stat().st_mtime))
SENTRY = DRIVE_ROOT/"sentry_backups"/RUN.name
SENTRY.mkdir(parents=True, exist_ok=True)

Watching: /content/drive/MyDrive/wake2vec/runs/t4_1762376560 | mtime: Sun Nov  9 22:54:00 2025


loss tail

In [16]:
import json

mlog = RUN/"metrics"/"phase1_loss_log.json"
if mlog.exists():
    logs = json.loads(mlog.read_text())
    tail = logs[-5:]
    print("[LOSS stream] last:", [(d["step"], round(float(d["loss"]),4)) for d in tail])
else:
    # Fallback
    cks = sorted(RUN.glob("checkpoint-*"), key=lambda p: int(p.name.split("-")[-1]))
    if cks and (cks[-1]/"trainer_state.json").exists():
        state = json.loads((cks[-1]/"trainer_state.json").read_text())
        tail = [(d["step"], d["loss"]) for d in state.get("log_history", []) if "loss" in d][-5:]
        print("[LOSS state ] last:", tail if tail else "—")
    else:
        print("[LOSS] no logs yet")

[LOSS stream] last: [(350, 5.4883), (400, 5.3082), (450, 4.7776), (500, 4.0891), (550, 3.2604)]


Eval tail

In [17]:
import json
cks = sorted(RUN.glob("checkpoint-*"), key=lambda p: int(p.name.split("-")[-1]))
if cks:
    p = cks[-1]/"trainer_state.json"
    if p.exists():
        state = json.loads(p.read_text())
        evals = [d for d in state.get("log_history", []) if "eval_loss" in d][-3:]
        print("[EVAL] tail:", evals if evals else "—")
    else:
        print("[EVAL] none yet (next at 400/600/800...)")
else:
    print("[EVAL] no checkpoints yet")

[EVAL] tail: [{'epoch': 3.50989010989011, 'eval_loss': 6.237724304199219, 'eval_runtime': 13.5689, 'eval_samples_per_second': 3.538, 'eval_steps_per_second': 0.442, 'step': 200}, {'epoch': 7.0175824175824175, 'eval_loss': 6.416950225830078, 'eval_runtime': 13.5887, 'eval_samples_per_second': 3.532, 'eval_steps_per_second': 0.442, 'step': 400}, {'epoch': 10.527472527472527, 'eval_loss': 7.096441268920898, 'eval_runtime': 13.6439, 'eval_samples_per_second': 3.518, 'eval_steps_per_second': 0.44, 'step': 600}]


checkpoint audit

In [18]:
import shutil, time

def latest_full_ckpt(root):
    cks = sorted(root.glob("checkpoint-*"), key=lambda p: int(p.name.split("-")[-1]), reverse=True)
    for ck in cks:
        if (ck/"model.safetensors").exists() or (ck/"pytorch_model.bin").exists():
            return ck
    return None

src = latest_full_ckpt(RUN)
if src is None:
    print("[SENTRY] No full checkpoint yet; wait for next save.")
else:
    dst = SENTRY/src.name
    if not dst.exists():
        shutil.copytree(src, dst)
        print(f"[SENTRY] mirrored {src.name} (mtime {time.ctime(src.stat().st_mtime)})")
    else:
        print("[SENTRY] already has", src.name)
    # Mirror metrics
    msrc = RUN/"metrics"; mdst = SENTRY/"metrics"; mdst.mkdir(parents=True, exist_ok=True)
    copied = 0
    if msrc.exists():
        for f in msrc.glob("*.json"):
            shutil.copy2(f, mdst/f.name); copied += 1
    print(f"[SENTRY] metrics mirrored ({copied} files)")

[SENTRY] already has checkpoint-300
[SENTRY] metrics mirrored (1 files)


embedding snapshot quick view

In [19]:
SNAPS_DIR = DRIVE_ROOT/"emb_snaps"/RUN.name
if SNAPS_DIR.exists():
    snaps = sorted(SNAPS_DIR.glob("emb_step*.pt"))
    print(f"[SNAPS] count={len(snaps)}  latest=", snaps[-1].name if snaps else "—")
    hb = SNAPS_DIR/"heartbeat.json"
    if hb.exists(): print("[SNAPS] heartbeat:", hb.read_text())
else:
    print("[SNAPS] none yet (first at step 350 if every 50)")

[SNAPS] count=9  latest= emb_step0750.pt
[SNAPS] heartbeat: {
  "step": 750,
  "rows": 32000,
  "dim": 2048,
  "ts": 1762729811.8068688
}


In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


opt light sync

In [20]:
import os, time, pathlib
touch = RUN/"_heartbeat.touch"
touch.write_text(str(time.time()))
os.sync()
print("[SYNC] touched + sync hinted →", touch)

[SYNC] touched + sync hinted → /content/drive/MyDrive/wake2vec/runs/t4_1762376560/_heartbeat.touch


In [3]:
import shutil, pathlib
RUN = pathlib.Path("/content/drive/MyDrive/wake2vec/runs/t4_1762376560")
SENTRY = pathlib.Path("/content/drive/MyDrive/wake2vec/sentry_backups/t4_1762376560")
SENTRY.mkdir(parents=True, exist_ok=True)
src = RUN/"checkpoint-750"; dst = SENTRY/"checkpoint-750"
if src.exists() and not dst.exists():
    shutil.copytree(src, dst); print("[SENTRY] mirrored checkpoint-750")

In [2]:
import pathlib
RUN = pathlib.Path("/content/drive/MyDrive/wake2vec/runs/t4_1762376560")
ck = RUN/"checkpoint-750"
print("750 exists:", ck.exists(),
      "| weights:", (ck/"model.safetensors").exists() or (ck/"pytorch_model.bin").exists())

750 exists: False | weights: False


In [6]:
# A1) If Drive is mounted, cleanly unmount
try:
    from google.colab import drive
    drive.flush_and_unmount()
    print("[OK] flushed & unmounted")
except Exception as e:
    print("[INFO] flush/unmount skipped:", e)

# A2) Remove stale mountpoint contents (safe in Colab)
import shutil, os
if os.path.exists("/content/drive"):
    shutil.rmtree("/content/drive", ignore_errors=True)
    print("[OK] removed /content/drive")

# A3) Fresh mount to the correct Google account
from google.colab import drive
drive.mount('/content/drive', force_remount=True)   # choose the account with wake2vec

# A4) Sanity check
import pathlib
BASE = pathlib.Path("/content/drive/MyDrive/wake2vec")
print("[BASE exists]", BASE.exists())
print("[BASE contents]", [p.name for p in BASE.iterdir()] if BASE.exists() else "—")

Drive not mounted, so nothing to flush and unmount.
[OK] flushed & unmounted
[OK] removed /content/drive
Mounted at /content/drive
[BASE exists] True
[BASE contents] ['runs', 'adapters', 'reports', 'archives', 'notebooks', 'datasets', 'sentry_backups', 'emb_snaps']


In [7]:
import pathlib

BASE   = pathlib.Path("/content/drive/MyDrive/wake2vec")
RUNS   = BASE/"runs"
SENTRY = BASE/"sentry_backups"
SNAPS  = BASE/"emb_snaps"

def audit(run_root):
    print(f"\n[{run_root}] exists:", run_root.exists())
    if not run_root.exists(): return
    for run in sorted(run_root.glob("t4_*")):
        print(" ", run.name)
        for ck in sorted(run.glob("checkpoint-*"), key=lambda p:int(p.name.split("-")[-1])):
            step = int(ck.name.split("-")[-1]) if ck.name.count("-") else -1
            has_w = (ck/"model.safetensors").exists() or (ck/"pytorch_model.bin").exists()
            print(f"   {ck.name:>20}  weights={int(has_w)}")

audit(RUNS)
audit(SENTRY)

print("\n[SNAPS] exists:", SNAPS.exists())
if SNAPS.exists():
    for r in sorted(SNAPS.glob("t4_*")):
        snaps = sorted(r.glob("emb_step*.pt"))
        print(" ", r.name, "| snaps:", len(snaps), "| latest:", snaps[-1].name if snaps else "—")


[/content/drive/MyDrive/wake2vec/runs] exists: True
  t4_1762100254
  t4_1762104879
  t4_1762105026
  t4_1762113417
  t4_1762375997
  t4_1762376307
  t4_1762376560
         checkpoint-100  weights=1
         checkpoint-200  weights=1
         checkpoint-300  weights=1
         checkpoint-400  weights=0
         checkpoint-500  weights=0
         checkpoint-600  weights=0
         checkpoint-700  weights=0

[/content/drive/MyDrive/wake2vec/sentry_backups] exists: True
  t4_1762376560
         checkpoint-300  weights=0
         checkpoint-400  weights=0
         checkpoint-500  weights=0
         checkpoint-600  weights=0
         checkpoint-700  weights=0

[SNAPS] exists: True
  t4_1762376560 | snaps: 9 | latest: emb_step0750.pt


In [9]:
# replace STEP with 400/500/600/700 etc. and RUN_ID as needed
RUN_ID = "t4_1762376560"
STEP = 700
ck = RUNS/RUN_ID/f"checkpoint-{STEP}"
print(ck)
print("\nFiles:")
for p in sorted(ck.glob("*")):
    print(" -", p.name)

/content/drive/MyDrive/wake2vec/runs/t4_1762376560/checkpoint-700

Files:
 - config.json
 - generation_config.json
 - optimizer.pt
 - rng_state.pth
 - scheduler.pt
 - special_tokens_map.json
 - tokenizer.json
 - tokenizer.model
 - tokenizer_config.json
 - trainer_state.json
 - training_args.bin


In [11]:
# Rebuild a loadable checkpoint-750 from the embedding snapshot
import pathlib, torch, shutil, re
from transformers import AutoTokenizer, AutoModelForCausalLM

BASE   = pathlib.Path("/content/drive/MyDrive/wake2vec")
RUNS   = BASE/"runs"
SENTRY = BASE/"sentry_backups"
SNAPS  = BASE/"emb_snaps"

# Pick RUN_ID from snapshots (since runs may be partial)
RUN_IDS = sorted([p.name for p in SNAPS.glob("t4_*")])
assert RUN_IDS, "No t4_* in emb_snaps — check Drive mount/account."
RUN_ID = RUN_IDS[-1]
print("[RUN_ID]", RUN_ID)

# Find the **last full** base checkpoint ≤ 750 (prefer sentry, then runs)
def full_ckpts(root):
    out = []
    d = root/RUN_ID
    if not d.exists(): return out
    for ck in d.glob("checkpoint-*"):
        step = int(ck.name.split("-")[-1])
        has_w = (ck/"model.safetensors").exists() or (ck/"pytorch_model.bin").exists() \
                or list(ck.glob("model-*-of-*.safetensors")) or list(ck.glob("pytorch_model-*-of-*.bin"))
        if has_w: out.append((step, ck))
    return sorted(out, key=lambda x: x[0], reverse=True)

bases = full_ckpts(SENTRY) + full_ckpts(RUNS)
assert bases, "No base checkpoints with weights found in sentry_backups/ or runs/."
base_step, BASE_CK = next(((s, p) for s, p in bases if s <= 750), bases[-1])
print(f"[BASE] Using {BASE_CK} (step {base_step})")

# Load embedding snapshot @ 750
EMB = SNAPS/RUN_ID/"emb_step0750.pt"
assert EMB.exists(), "emb_step0750.pt not found."
emb = torch.load(EMB, map_location="cpu")

# Load base and inject embeddings; re-tie head
tok = AutoTokenizer.from_pretrained(str(BASE_CK), use_fast=True)
model = AutoModelForCausalLM.from_pretrained(str(BASE_CK), torch_dtype=torch.float32, device_map="cpu")
with torch.no_grad():
    model.get_input_embeddings().weight[:emb.size(0), :].copy_(emb)
    model.get_output_embeddings().weight = model.get_input_embeddings().weight
print("[REBUILD] Injected emb_step0750 into base")

# Save as checkpoint-750-rebuilt (creates weights files)
OUT_RUN    = RUNS/RUN_ID/"checkpoint-750-rebuilt"
OUT_SENTRY = SENTRY/RUN_ID/"checkpoint-750-rebuilt"
for d in (OUT_RUN, OUT_SENTRY):
    if d.exists(): shutil.rmtree(d)
model.save_pretrained(str(OUT_RUN), safe_serialization=True)
tok.save_pretrained(str(OUT_RUN))
shutil.copytree(OUT_RUN, OUT_SENTRY)
print("[SAVED] →", OUT_RUN)
print("[MIRRORED] →", OUT_SENTRY)

[RUN_ID] t4_1762376560
[BASE] Using /content/drive/MyDrive/wake2vec/runs/t4_1762376560/checkpoint-300 (step 300)


`torch_dtype` is deprecated! Use `dtype` instead!
Some weights of LlamaForCausalLM were not initialized from the model checkpoint at /content/drive/MyDrive/wake2vec/runs/t4_1762376560/checkpoint-300 and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[REBUILD] Injected emb_step0750 into base
[SAVED] → /content/drive/MyDrive/wake2vec/runs/t4_1762376560/checkpoint-750-rebuilt
[MIRRORED] → /content/drive/MyDrive/wake2vec/sentry_backups/t4_1762376560/checkpoint-750-rebuilt


In [12]:
from pathlib import Path
ck = Path("/content/drive/MyDrive/wake2vec/runs/t4_1762376560/checkpoint-750-rebuilt")
has_w = (ck/"model.safetensors").exists() or (ck/"pytorch_model.bin").exists() \
        or list(ck.glob("model-*-of-*.safetensors")) or list(ck.glob("pytorch_model-*-of-*.bin"))
print("750-rebuilt loadable:", bool(has_w))

750-rebuilt loadable: True
