In [1]:
# Keep-alive script (prevents tab idle/disconnect — click every 60s)
from IPython.display import display, Javascript

# The injected JavaScript:
#   * guards against the connect button not being present (querySelector
#     returns null then, and calling .click() on it would throw on every tick);
#   * clears a previously installed timer before creating a new one, so that
#     re-running this cell does not stack several 60 s intervals.
display(Javascript('''
function ClickConnect(){
  console.log("Keeping Colab alive");
  const btn = document.querySelector("colab-toolbar-button#connect");
  if (btn) {
    btn.click();
  }
}
if (window._keepAliveTimer) {
  clearInterval(window._keepAliveTimer);
}
window._keepAliveTimer = setInterval(ClickConnect, 60000);
'''))
print("Keep-alive activated — tab clicks every 60s")

<IPython.core.display.Javascript object>

Keep-alive activated — tab clicks every 60s


In [1]:
# First cell: Set env + install (run once)
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'  # fragmentation fix

!pip install torch torchvision matplotlib numpy transformers

# Second cell: The sim code (fixed — no backward error)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.amp
from torch.utils.checkpoint import checkpoint
from transformers import AutoTokenizer
import numpy as np
from contextlib import nullcontext
import math

torch.cuda.empty_cache()

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

# Fix the RNG so the synthetic data, the masks and the model initialisation are
# reproducible across Restart & Run All.
SEED = 0
torch.manual_seed(SEED)

# CONFIG – optimized for no OOM + H100 speed (fast epochs)
triality = 3
dim = 192  # reduced
latent_dim = 8
seq_len = 512  # reduced
batch_size = 16  # reduced
accum_steps = 4  # simulate larger batch
epochs = 20000
lr = 5e-5
use_amp = True
use_checkpoint = True

# The cycle block tiles a (dim // triality)-wide embedding `triality` times and
# multiplies it with dim-wide activations, so dim must divide evenly.
assert dim % triality == 0, "dim must be divisible by triality"

# Synthetic code proxy (real-like Python tokens + noise/masking)
# The GPT-2 tokenizer is only used for its vocabulary size and pad/eos id.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Every sample shares the same sinusoidal token-id curve (values in [100, 300]);
# it is computed once instead of once per sample.
base = torch.sin(torch.linspace(0, 10*math.pi, seq_len, device=device)) * 100 + 200
base = (base % tokenizer.vocab_size).long()

# Per-sample integer jitter in [-50, 50), then clamp into the valid id range.
noise = torch.randint(-50, 50, (batch_size, seq_len), device=device)
codes = torch.clamp(base.unsqueeze(0) + noise, 0, tokenizer.vocab_size - 1)

# The embedding is a fixed random featurizer: no optimizer ever updates it.
# Computing it under no_grad keeps the inputs out of the autograd graph, so
# the two models trained below do not backpropagate through (and free) a
# shared graph, which would fail on the second backward pass.
embed = nn.Embedding(tokenizer.vocab_size, dim).to(device)
with torch.no_grad():
    clean_data = embed(codes)

# Element-wise dropout of embedding features; the drop probability grows
# linearly from 0.4 (first sample) to 0.7 (last sample).
missing_rate = torch.linspace(0.4, 0.7, batch_size, device=device).view(batch_size, 1, 1)
mask = torch.rand_like(clean_data) < missing_rate
real_data = clean_data.masked_fill(mask, 0.0)

# The model is trained to recover the original token ids from the masked embeddings.
target = codes

# E8 roots – precompute
def get_e8_roots():
    """Build the 240 roots of the E8 root system as unit vectors.

    The roots come in two families:
      * 112 vectors with exactly two non-zero entries, each +1 or -1
        (28 coordinate pairs x 4 sign choices);
      * 128 vectors whose entries are all +1/2 or -1/2 with an even number
        of +1/2 entries.

    Both enumerations below already contain the negative of every vector they
    produce, so no explicit negation is appended (doing so would duplicate
    roots and push genuine ones past the 240-row limit).

    Returns:
        torch.Tensor: float32 tensor of shape (240, 8); every row has unit
        L2 norm.
    """
    roots = []

    # Family 1: +-e_i +- e_j for i < j.
    for i in range(8):
        for j in range(i + 1, 8):
            for sign_i, sign_j in ((1, 1), (1, -1), (-1, 1), (-1, -1)):
                v = torch.zeros(8)
                v[i] = sign_i
                v[j] = sign_j
                roots.append(v)

    # Family 2: all-half-integer vectors; bit k of `bits` selects +1/2 for
    # coordinate k, and only patterns with an even number of set bits are kept.
    for bits in range(1 << 8):
        if bin(bits).count('1') % 2 == 0:
            v = torch.tensor(
                [0.5 if (bits & (1 << k)) else -0.5 for k in range(8)],
                dtype=torch.float32,
            )
            roots.append(v)

    roots = torch.stack(roots)
    assert roots.shape == (240, 8), "E8 must have exactly 240 roots"
    return roots / roots.norm(dim=-1, keepdim=True)

e8_roots = get_e8_roots().to(device)

# Triality Cycle Block (detached pump scalar)
class CodeCycleBlock(nn.Module):
    """Position-dependent modulation of the input built from the E8 root table.

    Each sequence position is assigned a root (cycling through the table), the
    root is projected to ``dim // triality`` features and tiled ``triality``
    times along the feature axis. The cosine and sine of that embedding gate
    three progressively rolled copies of the input, which are averaged.
    """

    def __init__(self):
        super().__init__()
        # Learnable projection from the 8-d root space to one third of the model width.
        self.proj = nn.Linear(latent_dim, dim // triality, bias=False)
        # Stored as a buffer so the table follows the module across .to(device).
        self.register_buffer('roots', e8_roots)

    def forward(self, x, step):
        """Apply the three-way gated mix.

        Args:
            x: input tensor; dim 1 is treated as the sequence axis and the last
                dim must be broadcast-compatible with the tiled embedding.
            step: training step; drives a sinusoidal "pump" offset that is a
                plain Python float and therefore carries no gradient.

        Returns:
            Tensor with the same shape as ``x``.
        """
        seq = x.shape[1]
        # Index on the buffer's own device rather than a global device name, so
        # the block keeps working if the module is moved after construction.
        idx = torch.arange(seq, device=self.roots.device) % self.roots.shape[0]
        pos_emb = self.roots[idx]
        low_dim = self.proj(pos_emb)
        emb = low_dim.repeat(1, triality)

        # Pure Python arithmetic: nothing here is tracked by autograd.
        pump_scalar = 0.8 * math.sin(step * 0.006 * 2 * math.pi)
        pump = torch.full((1, seq, 1), pump_scalar, device=x.device)

        emb_broadcast = emb.unsqueeze(0)
        cos_emb = emb_broadcast.cos()
        x_rot1 = x * (cos_emb + pump)
        # Each subsequent branch shifts the previous one by one position along dim 1.
        x_rot2 = torch.roll(x_rot1, shifts=1, dims=1) * emb_broadcast.sin()
        x_rot3 = torch.roll(x_rot2, shifts=1, dims=1) * cos_emb
        fused = (x_rot1 + x_rot2 + x_rot3) / triality
        return fused

# Dummy cycle for ablation
class DummyCycle(nn.Module):
    """Identity stand-in for the cycle block, used by the ablation model."""

    def forward(self, x, step=None):
        """Hand back ``x`` itself; ``step`` is accepted only for call compatibility."""
        passthrough = x
        return passthrough

# Model with ablation support (reduced depth)
class E8CodeFusion(nn.Module):
    """Stack of self-attention layers with an optional triality cycle front-end.

    Args:
        depth: number of ``nn.MultiheadAttention`` layers.
        use_triality: if True the input first passes through ``CodeCycleBlock``
            and attention uses ``triality`` heads; otherwise the input passes
            through ``DummyCycle`` unchanged and attention uses 8 heads.
    """

    def __init__(self, depth=16, use_triality=True):  # reduced
        super().__init__()
        self.use_triality = use_triality
        self.cycle = CodeCycleBlock() if use_triality else DummyCycle()
        num_heads = triality if use_triality else 8
        self.layers = nn.ModuleList(
            [nn.MultiheadAttention(dim, num_heads, batch_first=True) for _ in range(depth)]
        )
        # NOTE: a single LayerNorm instance is shared by every layer.
        self.norm = nn.LayerNorm(dim)
        self.head = nn.Linear(dim, tokenizer.vocab_size)

    def forward(self, x, step):
        """Return per-position vocabulary logits.

        Args:
            x: batch-first input whose last dimension is ``dim``.
            step: training step forwarded to the cycle block.

        Returns:
            Tensor of logits with last dimension ``tokenizer.vocab_size``.
        """
        x = self.cycle(x, step)
        for layer in self.layers:
            # need_weights=False: the attention maps are never used, and
            # skipping them avoids materialising them and lets PyTorch take
            # its optimized scaled-dot-product attention path.
            if use_checkpoint:
                # Non-reentrant checkpointing forwards keyword arguments to the layer.
                attn, _ = checkpoint(layer, x, x, x, need_weights=False, use_reentrant=False)
            else:
                attn, _ = layer(x, x, x, need_weights=False)
            x = x + self.norm(attn)
        return self.head(x)

# Models
model = E8CodeFusion(use_triality=True).to(device)
model_ablation = E8CodeFusion(use_triality=False).to(device)

# Each model gets its own optimizer and gradient scaler so the two runs are
# fully independent. A GradScaler built with enabled=False is a transparent
# pass-through (scale() returns the loss unchanged, step() just calls
# optimizer.step()), so it is a safe stand-in when AMP is off or CUDA is absent.
opt = torch.optim.AdamW(model.parameters(), lr=lr)
scaler = torch.amp.GradScaler('cuda', enabled=use_amp and device == 'cuda')

opt_ablation = torch.optim.AdamW(model_ablation.parameters(), lr=lr)
scaler_ablation = torch.amp.GradScaler('cuda', enabled=use_amp and device == 'cuda')

# pad_token was set to the eos token, so any target equal to the eos id is
# excluded from the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Per-epoch loss histories for the triality model and the ablation model.
loss_hist = []
loss_abl_hist = []

def _accumulate_and_step(net, optimizer, grad_scaler, inputs, labels, step, amp_enabled):
    """Run ``accum_steps`` forward/backward passes on one model, then one optimizer step.

    Args:
        net: model called as ``net(inputs, step)`` and returning logits.
        optimizer: optimizer that owns ``net``'s parameters.
        grad_scaler: gradient scaler; only touched when ``amp_enabled`` is True.
        inputs: input batch (expected to be detached from any autograd graph).
        labels: integer class targets, flattened for the loss.
        step: current outer step, forwarded to the model.
        amp_enabled: whether to use CUDA fp16 autocast plus loss scaling.

    Returns:
        float: mean loss over the accumulation micro-steps.
    """
    optimizer.zero_grad(set_to_none=True)
    mean_loss = 0.0

    for _ in range(accum_steps):
        amp_ctx = (
            torch.amp.autocast(device_type='cuda', dtype=torch.float16)
            if amp_enabled
            else nullcontext()
        )
        with amp_ctx:
            logits = net(inputs, step)
            # Dividing by accum_steps makes the summed gradients an average.
            loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1)) / accum_steps

        if amp_enabled:
            grad_scaler.scale(loss).backward()
        else:
            loss.backward()

        # Each micro-loss is already divided by accum_steps, so the plain sum is the mean.
        mean_loss += loss.item()
        # Drop the references so the large logits tensor can be freed before the next forward.
        del logits, loss

    if amp_enabled:
        # Unscale first so the clipping threshold applies to the true gradients.
        grad_scaler.unscale_(optimizer)
    torch.nn.utils.clip_grad_norm_(net.parameters(), 1e6)
    if amp_enabled:
        grad_scaler.step(optimizer)
        grad_scaler.update()
    else:
        optimizer.step()

    return mean_loss


# AMP is only meaningful on CUDA; on CPU fall back to plain fp32 training.
amp_enabled = use_amp and device == 'cuda'

# Both models consume the same inputs. Detaching guarantees neither backward
# pass walks into a graph shared with the other model.
train_inputs = real_data.detach()

for epoch in range(epochs):
    # The two models are trained one after the other so that only one model's
    # activations and logits are alive at a time (halves peak memory compared
    # with running both forwards before either backward).
    accum_loss = _accumulate_and_step(
        model, opt, scaler, train_inputs, target, epoch, amp_enabled
    )
    accum_loss_abl = _accumulate_and_step(
        model_ablation, opt_ablation, scaler_ablation, train_inputs, target, epoch, amp_enabled
    )

    loss_hist.append(accum_loss)
    loss_abl_hist.append(accum_loss_abl)

    if epoch % 500 == 0:
        print(f"Epoch {epoch} | Triality Loss {accum_loss:.6f} | Ablation Loss {accum_loss_abl:.6f}")

# Final Sigma Test
# Effect size: gap between the mean ablation loss and the mean triality loss,
# expressed in units of the standard deviation of both histories pooled together.
pooled_losses = np.concatenate([np.asarray(loss_hist, dtype=float), np.asarray(loss_abl_hist, dtype=float)])
triality_mean = np.mean(loss_hist)
abl_mean = np.mean(loss_abl_hist)
std = np.std(pooled_losses)

if std > 0:
    sigma = (abl_mean - triality_mean) / std
else:
    # Degenerate (zero or undefined) spread: report no separation.
    sigma = 0

print(f"Final Sigma (Triality vs Ablation): {sigma:.2f} (higher = triality advantage)")

print("Sim complete — epochs + sigma test done")

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m26.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Using device: cuda


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.54 GiB. GPU 0 has a total capacity of 79.18 GiB of which 1.45 GiB is free. Process 565 has 1.53 GiB memory in use. Process 1825 has 1.53 GiB memory in use. Process 6071 has 17.26 GiB memory in use. Process 7871 has 10.62 GiB memory in use. Process 8777 has 1.53 GiB memory in use. Process 8996 has 912.00 MiB memory in use. Process 8744 has 1.53 GiB memory in use. Process 9642 has 1.53 GiB memory in use. Process 9860 has 37.11 GiB memory in use. Process 10046 has 1.53 GiB memory in use. Including non-PyTorch memory, this process has 2.62 GiB memory in use. Of the allocated memory 1.93 GiB is allocated by PyTorch, and 28.99 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)