# Deterministic Codec — Lightweight Round-Trip Tests

This notebook is intentionally **safe/lightweight**:

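- Is designed around a tiny test model (e.g. `sshleifer/tiny-gpt2`); note that the config cell below currently sets `MODEL_ID` to `deepseek-ai/deepseek-coder-1.3b-base`, which is much larger.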
- Uses short texts only.
- Caps decode steps.
- Runs CPU-first tests before any optional cross-device check.
- Gates the cross-device test behind the `RUN_CROSS_DEVICE` flag, which the config cell below currently sets to `True`.

If you want to be extra safe, set `RUN_CROSS_DEVICE = False`.

In [None]:
# Safety knobs (edit if needed)
# NOTE(review): judging by its name this is a 1.3B-parameter checkpoint, not a tiny
# model; switch to "sshleifer/tiny-gpt2" for quick, low-memory smoke runs.
MODEL_ID = "deepseek-ai/deepseek-coder-1.3b-base"   # id handed to from_pretrained() in the load cell
MAX_TEXT_CHARS = 128            # input texts/prompts are truncated to this many characters
MAX_DECODE_TOKENS = 256         # default decode cap when a test does not pass its own
MAX_TEST_TIMEOUT_SEC = 45       # default timeout; applied to encode and to decode separately

# Extra-strict limits for cross-device smoke test
CROSS_DEVICE_MAX_DECODE_TOKENS = 64
CROSS_DEVICE_TIMEOUT_SEC = 20

# Toggles for the test cells further down.
RUN_STRICT_CPU = True
RUN_GPU_BEST_EFFORT = True
RUN_CROSS_DEVICE = True  # set False to skip cross-device checks

# Optional device targets for cross-device cell
ENCODE_DEVICE = "cpu"
DECODE_DEVICE = "mps"  # change to "cuda" if needed and available

In [1]:
import os
import time
import signal
import traceback
import importlib
import torch

# Avoid tokenizers fork-parallelism warning / potential deadlock scenarios
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from transformers import AutoTokenizer, AutoModelForCausalLM

import deterministic_runtime
import llm_codec_deterministic
# Re-execute the two project modules so that re-running this cell refreshes them
# (presumably to pick up local edits without restarting the kernel).
importlib.reload(deterministic_runtime)
importlib.reload(llm_codec_deterministic)

# Bind the aliases AFTER the reload so they refer to the freshly loaded classes.
DeterministicCodecConfig = llm_codec_deterministic.DeterministicCodecConfig
DeterministicLLMCodec = llm_codec_deterministic.DeterministicLLMCodec

# Record the environment the results below were produced in.
print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
print("mps available:", torch.backends.mps.is_available())

[Utils] Success: Loaded C++ Exact Softmax Kernel.
torch: 2.8.0
cuda available: False
mps available: True


In [6]:
# Load the tokenizer and model named by MODEL_ID once for the whole notebook.
# Any failure (network, download, ...) is reported rather than raised, and both
# globals are left as None so that later cells can detect the missing model.
try:
    tokenizer, model = (
        AutoTokenizer.from_pretrained(MODEL_ID),
        AutoModelForCausalLM.from_pretrained(MODEL_ID),
    )
    print(f"Loaded model: {MODEL_ID}")
except Exception as load_error:
    tokenizer = model = None
    print("Model load failed. Details:")
    print(type(load_error).__name__, load_error)

Loaded model: deepseek-ai/deepseek-coder-1.3b-base


In [3]:
def _run_with_timeout(fn, timeout_sec, *args, **kwargs):
    if timeout_sec is None or timeout_sec <= 0:
        return fn(*args, **kwargs)

    if os.name != "posix":
        # Fallback: no hard wall-clock timeout on non-posix systems
        return fn(*args, **kwargs)

    class _NotebookTimeoutError(TimeoutError):
        pass

    def _handler(signum, frame):
        raise _NotebookTimeoutError(f"Timed out after {timeout_sec}s")

    old_handler = signal.getsignal(signal.SIGALRM)
    signal.signal(signal.SIGALRM, _handler)
    signal.setitimer(signal.ITIMER_REAL, float(timeout_sec))
    try:
        return fn(*args, **kwargs)
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0.0)
        signal.signal(signal.SIGALRM, old_handler)


def tiny_roundtrip_test(
    text,
    mode="strict_cpu",
    encode_device="cpu",
    decode_device="cpu",
    max_decode_tokens=None,
    timeout_sec=None,
):
    """Encode ``text`` on one device, decode it on another, and report the outcome.

    The text is truncated to ``MAX_TEXT_CHARS`` first. The timeout is applied to
    the encode call and to the decode call separately, so ``elapsed_sec`` can be
    larger than the reported ``timeout_sec``.

    Args:
        text: Input string to round-trip.
        mode: Value passed as ``determinism_mode`` to the codec config.
        encode_device: Device handed to the encoding codec.
        decode_device: Device handed to the decoding codec.
        max_decode_tokens: Decode cap; ``None`` selects ``MAX_DECODE_TOKENS``.
        timeout_sec: Per-call timeout; ``None`` selects ``MAX_TEST_TIMEOUT_SEC``.

    Returns:
        A dict that always has ``"ok"``. On success paths it carries sizes,
        limits, timing and a preview of the decoded text; when an exception is
        caught it carries ``"error"`` and the last six traceback lines instead.
        If the model was never loaded, a short ``model_not_loaded`` dict.
    """
    if tokenizer is None or model is None:
        return {"ok": False, "reason": "model_not_loaded", "mode": mode}

    text = text[:MAX_TEXT_CHARS]

    if max_decode_tokens is None:
        decode_cap = MAX_DECODE_TOKENS
    else:
        decode_cap = int(max_decode_tokens)

    if timeout_sec is None:
        test_timeout = MAX_TEST_TIMEOUT_SEC
    else:
        test_timeout = float(timeout_sec)

    codec_options = {
        "determinism_mode": mode,
        "precision": 32,
        # smaller than production for faster/safe smoke tests
        "slots": 2 ** 20,
        "use_legacy_counts": True,
        "use_kv_cache": False,
        "patch_linear": True,
        "patch_rmsnorm": True,
        "patch_attention": True,
    }
    cfg = DeterministicCodecConfig(**codec_options)

    # Fields shared by the success and failure reports, in display order.
    route = {
        "mode": mode,
        "encode_device": encode_device,
        "decode_device": decode_device,
    }

    started = time.time()

    def _elapsed():
        return round(time.time() - started, 3)

    try:
        encoder = DeterministicLLMCodec(tokenizer, model, device=encode_device, config=cfg)
        encoded = _run_with_timeout(encoder.encode, test_timeout, text)

        decoder = DeterministicLLMCodec(tokenizer, model, device=decode_device, config=cfg)
        decoded = _run_with_timeout(decoder.decode, test_timeout, encoded, decode_cap)

        return {
            "ok": decoded == text,
            **route,
            "input_len": len(text),
            "encoded_bytes": len(encoded),
            "decode_cap": decode_cap,
            "timeout_sec": test_timeout,
            "elapsed_sec": _elapsed(),
            "decoded_preview": decoded[:80],
        }
    except Exception as failure:
        return {
            "ok": False,
            **route,
            "decode_cap": decode_cap,
            "timeout_sec": test_timeout,
            "error": f"{type(failure).__name__}: {failure}",
            "traceback": traceback.format_exc().splitlines()[-6:],
            "elapsed_sec": _elapsed(),
        }

In [10]:
# Lightweight local round-trip tests: encoder and decoder both run on the CPU.
texts = [
    "hello",
    "The quick brown fox jumps over the lazy dog.",
    "Deterministic codecs should round-trip exactly.",
]

# (determinism mode, enabled?) pairs, in the order the results are reported.
# gpu_best_effort still uses CPU devices here to keep this cell lightweight and stable.
mode_switches = (
    ("strict_cpu", RUN_STRICT_CPU),
    ("gpu_best_effort", RUN_GPU_BEST_EFFORT),
)

results = []
for mode_name, enabled in mode_switches:
    if not enabled:
        continue
    for t in texts:
        results.append(
            tiny_roundtrip_test(t, mode=mode_name, encode_device="cpu", decode_device="cpu")
        )

for outcome in results:
    print(outcome)

# An empty result list counts as "not passed".
all_ok = bool(results) and all(outcome.get("ok", False) for outcome in results)
print("\nALL LOCAL TESTS PASSED:", all_ok)

Deterministic Encode: 100%|██████████| 3/3 [00:01<00:00,  1.91it/s]
Deterministic Encode: 100%|██████████| 14/14 [00:07<00:00,  1.92it/s]
Deterministic Encode: 100%|██████████| 15/15 [00:07<00:00,  1.92it/s]
Deterministic Encode: 100%|██████████| 3/3 [00:00<00:00,  5.93it/s]
Deterministic Encode: 100%|██████████| 14/14 [00:02<00:00,  5.13it/s]
Deterministic Encode: 100%|██████████| 15/15 [00:02<00:00,  5.02it/s]


{'ok': True, 'mode': 'strict_cpu', 'encode_device': 'cpu', 'decode_device': 'cpu', 'input_len': 5, 'encoded_bytes': 8, 'elapsed_sec': 3.031, 'decoded_preview': 'hello'}
{'ok': True, 'mode': 'strict_cpu', 'encode_device': 'cpu', 'decode_device': 'cpu', 'input_len': 44, 'encoded_bytes': 9, 'elapsed_sec': 14.576, 'decoded_preview': 'The quick brown fox jumps over the lazy dog.'}
{'ok': True, 'mode': 'strict_cpu', 'encode_device': 'cpu', 'decode_device': 'cpu', 'input_len': 47, 'encoded_bytes': 14, 'elapsed_sec': 15.616, 'decoded_preview': 'Deterministic codecs should round-trip exactly.'}
{'ok': True, 'mode': 'gpu_best_effort', 'encode_device': 'cpu', 'decode_device': 'cpu', 'input_len': 5, 'encoded_bytes': 8, 'elapsed_sec': 1.026, 'decoded_preview': 'hello'}
{'ok': True, 'mode': 'gpu_best_effort', 'encode_device': 'cpu', 'decode_device': 'cpu', 'input_len': 44, 'encoded_bytes': 9, 'elapsed_sec': 5.483, 'decoded_preview': 'The quick brown fox jumps over the lazy dog.'}
{'ok': True, 'mode'

In [7]:
# Optional cross-device smoke test: encode on ENCODE_DEVICE, decode on
# DECODE_DEVICE, using the stricter cross-device decode cap and timeout.
if not RUN_CROSS_DEVICE:
    print("RUN_CROSS_DEVICE is False; skipped cross-device test.")
else:
    short_text = "Cross-device tiny smoke test."
    cross_limits = {
        "max_decode_tokens": CROSS_DEVICE_MAX_DECODE_TOKENS,
        "timeout_sec": CROSS_DEVICE_TIMEOUT_SEC,
    }
    cross_res = tiny_roundtrip_test(
        short_text,
        mode="strict_cpu",
        encode_device=ENCODE_DEVICE,
        decode_device=DECODE_DEVICE,
        **cross_limits,
    )
    print(cross_res)

Deterministic Encode: 100%|██████████| 9/9 [00:04<00:00,  1.83it/s]


{'ok': False, 'mode': 'strict_cpu', 'encode_device': 'cpu', 'decode_device': 'mps', 'decode_cap': 64, 'timeout_sec': 20.0, 'error': '_NotebookTimeoutError: Timed out after 20.0s', 'traceback': ['  File "/Users/wuxidami/LLM_Text_Compression_IIB_Project/deterministic_runtime.py", line 107, in deterministic_matmul', '    return out.to(device=a.device, dtype=a.dtype)', '           ~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^', '  File "/var/folders/9q/kr6k165j47b7qv4mlqdh4_9r0000gn/T/ipykernel_23277/1214143450.py", line 13, in _handler', '    raise _NotebookTimeoutError(f"Timed out after {timeout_sec}s")', '_run_with_timeout.<locals>._NotebookTimeoutError: Timed out after 20.0s'], 'elapsed_sec': 25.756}


In [8]:
# 2x2 determinism matrix (lightweight + hard limits).
# Axes: determinism mode x decode route; encoding always happens on the CPU.

matrix_text = "determinism"

cases = [
    {
        "mode": mode_name,
        "encode_device": "cpu",
        "decode_device": target,
        "label": f"{short_name} cpu->{target}",
    }
    for mode_name, short_name in (("strict_cpu", "strict"), ("gpu_best_effort", "best-effort"))
    for target in ("cpu", "mps")
]

matrix_results = []
for case in cases:
    outcome = tiny_roundtrip_test(
        matrix_text,
        mode=case["mode"],
        encode_device=case["encode_device"],
        decode_device=case["decode_device"],
        max_decode_tokens=32,
        timeout_sec=12,
    )
    # Keep only the fields needed for the summary table.
    summary_row = {
        "case": case["label"],
        "ok": outcome.get("ok", False),
        "elapsed_sec": outcome.get("elapsed_sec"),
        "error": outcome.get("error", ""),
    }
    matrix_results.append(summary_row)

print("MODEL_ID:", MODEL_ID)
print("\n2x2 matrix results:")
for row in matrix_results:
    print(row)

passed = len([row for row in matrix_results if row["ok"]])
print(f"\npassed={passed}/{len(matrix_results)}")

Deterministic Encode: 100%|██████████| 5/5 [00:03<00:00,  1.43it/s]
Deterministic Encode: 100%|██████████| 5/5 [00:02<00:00,  1.92it/s]
Deterministic Encode: 100%|██████████| 5/5 [00:01<00:00,  4.66it/s]
Deterministic Encode: 100%|██████████| 5/5 [00:00<00:00,  5.27it/s]


MODEL_ID: deepseek-ai/deepseek-coder-1.3b-base

2x2 matrix results:
{'case': 'strict cpu->cpu', 'ok': True, 'elapsed_sec': 7.2, 'error': ''}
{'case': 'strict cpu->mps', 'ok': False, 'elapsed_sec': 14.914, 'error': '_NotebookTimeoutError: Timed out after 12.0s'}
{'case': 'best-effort cpu->cpu', 'ok': True, 'elapsed_sec': 2.517, 'error': ''}
{'case': 'best-effort cpu->mps', 'ok': False, 'elapsed_sec': 6.763, 'error': 'RuntimeError: Decoding exceeded max_decode_tokens=32 before EOF.'}

passed=2/4


In [9]:
# Tiny divergence diagnostic: compare one-step next-token distribution (CPU vs MPS)
# Safe design: single short prompt, single forward pass per device, timeout guarded.

def compare_one_step_cpu_vs_mps(prompt="determinism", topk=10, timeout_sec=12):
    """Compare the next-token distribution for ``prompt`` computed on CPU and on MPS.

    The prompt is truncated to ``MAX_TEXT_CHARS`` and tokenized once. One logits
    vector is then computed per device (each call guarded by ``timeout_sec``),
    both are turned into probabilities with the project's deterministic softmax,
    and the two distributions are compared.

    Args:
        prompt: Text whose next-token distribution is inspected.
        topk: Number of highest-probability tokens compared between devices.
        timeout_sec: Timeout applied to each device's logits computation.

    Returns:
        On success, a dict with ``"ok": True``, the largest absolute probability
        difference, the top-k overlap count, and the first five top tokens and
        probabilities per device. Otherwise a dict with ``"ok": False`` and
        either a ``"reason"`` (model not loaded / MPS unavailable) or an
        ``"error"`` plus the last six traceback lines.
    """
    if tokenizer is None or model is None:
        return {"ok": False, "reason": "model_not_loaded"}

    if not torch.backends.mps.is_available():
        return {"ok": False, "reason": "mps_not_available"}

    prompt = prompt[:MAX_TEXT_CHARS]
    token_ids = tokenizer.encode(prompt)
    if len(token_ids) == 0:
        # Empty prompt: seed with the first *defined* special-token id. A plain
        # `or` chain would wrongly skip a valid id of 0 (0 is falsy).
        special_ids = (tokenizer.bos_token_id, tokenizer.pad_token_id)
        fallback_id = next((tid for tid in special_ids if tid is not None), 0)
        token_ids = [fallback_id]

    cfg = DeterministicCodecConfig(
        determinism_mode="strict_cpu",
        precision=32,
        slots=(1 << 20),
        use_legacy_counts=True,
        use_kv_cache=False,
        patch_linear=True,
        patch_rmsnorm=True,
        patch_attention=True,
    )

    def _get_logits_on_device(device):
        # NOTE(review): both codecs are built from the same global model object;
        # whether the codec copies or moves it is not visible here.
        codec = DeterministicLLMCodec(tokenizer, model, device=device, config=cfg)
        with deterministic_runtime.deterministic_kernel_context(codec.model, codec.kernel_config):
            # Bring the result to the CPU before widening to float64.
            logits = codec._logits_for_prefix(token_ids).detach().cpu().to(torch.float64)
        return logits

    started = time.time()
    try:
        logits_cpu = _run_with_timeout(_get_logits_on_device, timeout_sec, "cpu")
        logits_mps = _run_with_timeout(_get_logits_on_device, timeout_sec, "mps")

        probs_cpu = deterministic_runtime.deterministic_softmax(logits_cpu, dim=-1, mode="strict_cpu").cpu().to(torch.float64)
        probs_mps = deterministic_runtime.deterministic_softmax(logits_mps, dim=-1, mode="strict_cpu").cpu().to(torch.float64)

        abs_diff = (probs_cpu - probs_mps).abs()
        max_abs_prob_diff = float(abs_diff.max().item())

        top_cpu = torch.topk(probs_cpu, k=topk)
        top_mps = torch.topk(probs_mps, k=topk)

        cpu_top_ids = top_cpu.indices.tolist()
        mps_top_ids = top_mps.indices.tolist()

        # Size of the intersection of the two top-k id sets (order is ignored).
        overlap = len(set(cpu_top_ids).intersection(set(mps_top_ids)))

        return {
            "ok": True,
            "prompt": prompt,
            "elapsed_sec": round(time.time() - started, 3),
            "max_abs_prob_diff": max_abs_prob_diff,
            "topk": topk,
            "topk_overlap": overlap,
            "cpu_top_tokens": [tokenizer.decode([i]) for i in cpu_top_ids[:5]],
            "mps_top_tokens": [tokenizer.decode([i]) for i in mps_top_ids[:5]],
            "cpu_top_probs": [float(x) for x in top_cpu.values[:5]],
            "mps_top_probs": [float(x) for x in top_mps.values[:5]],
        }
    except Exception as e:
        # Deliberate best-effort: report the failure instead of aborting the cell.
        return {
            "ok": False,
            "error": f"{type(e).__name__}: {e}",
            "elapsed_sec": round(time.time() - started, 3),
            "traceback": traceback.format_exc().splitlines()[-6:],
        }


# Run the diagnostic once with its default-sized inputs and show the raw report.
diag_res = compare_one_step_cpu_vs_mps(prompt="determinism", topk=10, timeout_sec=12)
print(diag_res)

{'ok': True, 'prompt': 'determinism', 'elapsed_sec': 3.911, 'max_abs_prob_diff': 4.090224560854283e-07, 'topk': 10, 'topk_overlap': 10, 'cpu_top_tokens': ['_', '\n', ' =', ':', '('], 'mps_top_tokens': ['_', '\n', ' =', ':', '('], 'cpu_top_probs': [0.11621690173750832, 0.107010945866445, 0.07950255012886673, 0.07632368286833328, 0.03921738896484425], 'mps_top_probs': [0.11621649271505223, 0.10701097745782302, 0.0795025735993173, 0.07632370540033034, 0.039217400542457655]}
