<a href="https://colab.research.google.com/github/lorenamilian/AllstateML/blob/main/Welcome_To_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!unzip -q /content/tenx-train-baselines.zip -d .
%cd tenx-train

replace ./tenx-train/README.md? [y]es, [n]o, [A]ll, [N]one, [r]ename: /content/tenx-train


In [1]:
# Switch to the extracted project directory and list its contents as a sanity check.
%cd /content/tenx-train
!ls

/content/tenx-train
configs        models.py    reports	      train_baseline.py
data_utils.py  __pycache__  requirements.txt  utils.py
emissions.csv  README.md    tenx-train


In [2]:
!pip install -r requirements.txt --no-deps -q


In [10]:
%env PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128,expandable_segments:True
import torch, gc
gc.collect(); torch.cuda.empty_cache()

env: PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128,expandable_segments:True


In [11]:
# Full replacement source for data_utils.py. Compared with a plain
# load_dataset(dataset, split=...) call, load_and_tokenize accepts an optional
# `cfg_name` so datasets that need a config name can be loaded.
DATA_UTILS_SRC = """
from datasets import load_dataset
from transformers import GPT2TokenizerFast
import torch

def load_tokenizer(name="gpt2", seq_len=1024):
    tok = GPT2TokenizerFast.from_pretrained(name)
    tok.model_max_length = seq_len
    return tok

def load_and_tokenize(dataset: str, split: str, tok, num_proc=2, cfg_name: str=None):
    if cfg_name:
        ds = load_dataset(dataset, cfg_name, split=split)
    else:
        ds = load_dataset(dataset, split=split)
    def enc(e): return {"ids": tok(e["text"], add_special_tokens=False)["input_ids"]}
    return ds.map(enc, remove_columns=ds.column_names, num_proc=num_proc)

def pack_sequences(token_ds, seq_len=1024):
    buf, out = [], []
    for ex in token_ds:
        buf.extend(ex["ids"])
        while len(buf) >= seq_len:
            out.append(buf[:seq_len]); buf = buf[seq_len:]
    return out

class PackedDataset(torch.utils.data.Dataset):
    def __init__(self, seqs): self.seqs=seqs
    def __len__(self): return len(self.seqs)
    def __getitem__(self, i): return torch.tensor(self.seqs[i], dtype=torch.long)
"""

# A context manager guarantees the file is flushed and closed before the
# training subprocess imports it (a bare open(...).write(...) never closes
# the handle explicitly).
with open("data_utils.py", "w") as fh:
    fh.write(DATA_UTILS_SRC)
print("✅ Patched data_utils.py")

✅ Patched data_utils.py


In [12]:
import re

# Rewrite the two load_and_tokenize(...) calls in train_baseline.py so they take
# the split names and the optional dataset config from data_cfg.
with open("train_baseline.py") as fh:
    code = fh.read()

# pattern -> replacement call. The replacements still match their own patterns,
# so re-running this cell is safe.
call_rewrites = {
    r'ds_train\s*=\s*load_and_tokenize\(.*\)':
        'ds_train = load_and_tokenize(data_cfg["dataset"], split=data_cfg.get("train_split","train"), tok=tok, cfg_name=data_cfg.get("dataset_config"))',
    r'ds_val\s*=\s*load_and_tokenize\(.*\)':
        'ds_val   = load_and_tokenize(data_cfg["dataset"], split=data_cfg.get("val_split","validation"), tok=tok, cfg_name=data_cfg.get("dataset_config"))',
}

for pattern, new_call in call_rewrites.items():
    code, n_hits = re.subn(pattern, new_call, code, count=1)
    if n_hits != 1:
        # Fail loudly instead of reporting success for a patch that did nothing.
        raise RuntimeError(f"train_baseline.py: no match for pattern {pattern!r}; file left unchanged")

with open("train_baseline.py", "w") as fh:
    fh.write(code)
print("✅ Patched train_baseline.py")

✅ Patched train_baseline.py


In [13]:
import re

path = "models.py"
with open(path) as fh:
    code = fh.read()

# Replacement source for the SDPA_Attn class.
#
# scaled_dot_product_attention attends over the second-to-last dimension, so
# q/k/v must be laid out as (B, H, T, D) when it is called. Handing it
# (B, T, H, D) tensors makes it attend across heads instead of across tokens,
# so the tensors are kept in (B, H, T, D) all the way through the call and the
# heads are only merged back afterwards.
SDPA_ATTN_SRC = """class SDPA_Attn(nn.Module):
    def __init__(self, d_model, n_heads, rope=False):
        super().__init__()
        assert d_model % n_heads == 0
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        self.qkv = nn.Linear(d_model, 3 * d_model, bias=False)
        self.proj = nn.Linear(d_model, d_model, bias=False)
        self.rope = rope

    def forward(self, x):
        B, T, C = x.shape
        qkv = self.qkv(x)
        q, k, v = qkv.chunk(3, dim=-1)
        # (B,T,C) -> (B,H,T,D): heads act as a batch dim, T is the attended axis
        q = q.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_heads, self.head_dim).transpose(1, 2)

        if self.rope:
            q, k = apply_rope(q, k)

        # causal attention without explicit mask; inputs stay (B,H,T,D)
        out = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)

        # (B,H,T,D) -> (B,T,H,D) -> (B,T,C)
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        return self.proj(out)


"""

# Replace the whole SDPA_Attn class definition with the fixed version.
# The lookahead stops at the next top-level `class` line without consuming it;
# anchoring on line starts avoids stopping at the word "class" inside a comment
# or identifier. A function replacement inserts the source verbatim.
patched, n_replaced = re.subn(
    r"^class SDPA_Attn\(nn\.Module\):[\s\S]*?(?=^class\s)",
    lambda _match: SDPA_ATTN_SRC,
    code,
    count=1,
    flags=re.MULTILINE,
)
if n_replaced != 1:
    # Do not claim success (or rewrite the file) when nothing was replaced.
    raise RuntimeError("models.py: could not locate `class SDPA_Attn(nn.Module):` followed by another top-level class")

with open(path, "w") as fh:
    fh.write(patched)
print("✅ Patched models.py to use causal SDPA without attn_mask")

✅ Patched models.py to use causal SDPA without attn_mask


In [14]:
from pathlib import Path

# Small "modern"-variant config sized for a free Colab GPU.
yaml_text = """model:
  variant: modern
  n_layers: 4
  d_model: 320
  n_heads: 5
  vocab_size: 50257
  seq_len: 256

train:
  precision: fp16
  optimizer: adamw
  lr: 3.0e-4
  betas: [0.9, 0.95]
  weight_decay: 0.1
  warmup_steps: 200
  global_batch_tokens: 16384   # e.g., 64×256
  max_tokens: 800_000

data:
  dataset: wikitext
  dataset_config: wikitext-2-raw-v1
  train_split: train
  val_split: validation
  tokenizer: gpt2

logging:
  log_interval: 50
  eval_interval: 400

system:
  seed: 1337
"""
config_path = Path("configs/gold_colab_free.yaml")
# Make the write independent of directory state and locale: create configs/ if
# it is missing, and pin UTF-8 because the text contains a non-ASCII "×".
config_path.parent.mkdir(parents=True, exist_ok=True)
config_path.write_text(yaml_text, encoding="utf-8")
print("✅ wrote smaller gold_colab_free.yaml")

✅ wrote smaller gold_colab_free.yaml


In [15]:
import re

path = "models.py"
with open(path) as fh:
    code = fh.read()

CKPT_IMPORT = "from torch.utils.checkpoint import checkpoint"
TORCH_IMPORT = "import torch, torch.nn as nn, torch.nn.functional as F"

# add checkpoint import (directly below the existing torch import line)
if CKPT_IMPORT not in code:
    code = code.replace(TORCH_IMPORT, TORCH_IMPORT + "\n" + CKPT_IMPORT)

# wrap block calls with checkpoint, keeping whatever indentation the loop body
# already has; use_reentrant is passed explicitly as the PyTorch docs recommend
code, n_wrapped = re.subn(
    r"for b in self\.blocks:\n(\s*)x = b\(x\)",
    lambda m: f"for b in self.blocks:\n{m.group(1)}x = checkpoint(b, x, use_reentrant=False)",
    code,
)

# Verify before writing, so a half-applied patch never reaches disk.
if n_wrapped == 0 and "checkpoint(b, x" not in code:
    raise RuntimeError("models.py: no `for b in self.blocks: x = b(x)` loop found; nothing to checkpoint")
if CKPT_IMPORT not in code:
    # The torch import line did not match, so checkpoint(...) would be undefined.
    raise RuntimeError(f"models.py: import anchor {TORCH_IMPORT!r} not found; cannot add checkpoint import")

with open(path, "w") as fh:
    fh.write(code)
print("✅ Enabled gradient checkpointing in models.py")

✅ Enabled gradient checkpointing in models.py


In [18]:
# Launch training in a subprocess using the gold ("modern" variant) Colab config.
!python train_baseline.py --config configs/gold_colab_free.yaml

Device: cuda
  self.gen = func(*args, **kwds)
Loading & tokenizing...
Map (num_proc=2):   0% 0/36718 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (324 > 256). Running this sequence through the model will result in indexing errors
Map (num_proc=2):   0% 84/36718 [00:00<02:27, 247.70 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (269 > 256). Running this sequence through the model will result in indexing errors
Map (num_proc=2): 100% 36718/36718 [00:10<00:00, 3553.88 examples/s]
Map (num_proc=2):   0% 0/3760 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (318 > 256). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (297 > 256). Running this sequence through the

In [21]:
def estimate(N_params, T_tokens, eff_tflops=150, gpus=256, power_w=300):
    """Back-of-the-envelope training cost for a model.

    Total compute is taken as 6 * N_params * T_tokens FLOPs. Wall-clock time
    is that compute divided by the aggregate throughput
    (eff_tflops * 1e12 FLOP/s on each of `gpus` devices), and energy is the
    aggregate power draw (power_w watts on each of `gpus` devices) sustained
    for that time.

    Returns:
        dict with keys "FLOPs" (total compute), "days" (wall-clock time in
        days) and "kWh" (energy in kilowatt-hours).
    """
    total_flops = 6 * N_params * T_tokens
    cluster_flops_per_sec = eff_tflops * 1e12 * gpus
    wall_seconds = total_flops / cluster_flops_per_sec
    cluster_watts = power_w * gpus
    # watt-seconds -> watt-hours -> kilowatt-hours
    energy_kwh = cluster_watts * wall_seconds / 3600 / 1000
    return {
        "FLOPs": total_flops,
        "days": wall_seconds / 86400,
        "kWh": energy_kwh,
    }

# Worked examples: 7e9- and 70e9-parameter models, each trained on 20 tokens
# per parameter, on 256 and 1024 GPUs respectively.
print(estimate(7e9, 20*7e9, eff_tflops=150, gpus=256, power_w=300))
print(estimate(70e9, 20*70e9, eff_tflops=150, gpus=1024, power_w=350))

{'FLOPs': 5.88e+21, 'days': 1.7722800925925926, 'kWh': 3266.6666666666665}
{'FLOPs': 5.88e+23, 'days': 44.30700231481482, 'kWh': 381111.1111111111}


In [19]:
# 0) CUDA allocator: reduce fragmentation
%env PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128,expandable_segments:True
import torch, gc, re
gc.collect(); torch.cuda.empty_cache()

# 1) Enable gradient checkpointing in both models (big memory saver)
path = "models.py"
code = open(path).read()
if "from torch.utils.checkpoint import checkpoint" not in code:
    code = code.replace(
        "import torch, torch.nn as nn, torch.nn.functional as F",
        "import torch, torch.nn as nn, torch.nn.functional as F\nfrom torch.utils.checkpoint import checkpoint"
    )
code = re.sub(r"for b in self.blocks:\n\s*x = b\(x\)",
              "for b in self.blocks:\n            x = checkpoint(b, x)",
              code)
open(path, "w").write(code)
print("✅ Enabled gradient checkpointing")

# 2) Make DataLoaders lighter (avoid extra RAM/VRAM pressure)
p = "train_baseline.py"
s = open(p).read()
s = s.replace(
    "torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)",
    "torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=0, pin_memory=False)"
)
s = s.replace(
    "torch.utils.data.DataLoader(val_ds, batch_size=batch_size, shuffle=False, drop_last=True)",
    "torch.utils.data.DataLoader(val_ds, batch_size=batch_size, shuffle=False, drop_last=True, num_workers=0, pin_memory=False)"
)
open(p, "w").write(s)
print("✅ Patched DataLoaders")

# 3) Write tiny Colab-safe configs for both baselines

from pathlib import Path

gold_tiny = """model:
  variant: modern              # RMSNorm + RoPE + SwiGLU + SDPA
  n_layers: 4
  d_model: 320
  n_heads: 5
  vocab_size: 50257
  seq_len: 256                 # shorter context to cut memory

train:
  precision: fp16              # T4 is happiest with fp16
  optimizer: adamw             # simpler; a tad less VRAM than bnb
  lr: 3.0e-4
  betas: [0.9, 0.95]
  weight_decay: 0.1
  warmup_steps: 200
  global_batch_tokens: 12288   # e.g., 48×256; lower if OOM: 8192 (32×256)
  max_tokens: 600_000

data:
  dataset: wikitext
  dataset_config: wikitext-2-raw-v1
  train_split: train
  val_split: validation
  tokenizer: gpt2

logging:
  log_interval: 50
  eval_interval: 400

system:
  seed: 1337
"""
Path("configs/gold_colab_free.yaml").write_text(gold_tiny)

historic_tiny = """model:
  variant: historic            # LayerNorm + GELU + absolute pos
  n_layers: 4
  d_model: 320
  n_heads: 5
  vocab_size: 50257
  seq_len: 256

train:
  precision: fp16
  optimizer: adamw
  lr: 3.0e-4
  betas: [0.9, 0.95]
  weight_decay: 0.1
  warmup_steps: 200
  global_batch_tokens: 12288   # match gold for fair compare
  max_tokens: 600_000

data:
  dataset: wikitext
  dataset_config: wikitext-2-raw-v1
  train_split: train
  val_split: validation
  tokenizer: gpt2

logging:
  log_interval: 50
  eval_interval: 400

system:
  seed: 1337
"""
Path("configs/historic_colab_free.yaml").write_text(historic_tiny)
print("✅ Wrote tiny configs for gold & historic")

env: PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:128,expandable_segments:True
✅ Enabled gradient checkpointing
✅ Patched DataLoaders
✅ Wrote tiny configs for gold & historic


In [20]:
!python train_baseline.py --config configs/historic_colab_free.yaml

Device: cuda
  self.gen = func(*args, **kwds)
Loading & tokenizing...
Packing sequences...
Params: 37,167,360 | Seq len: 256 | Batch (seqs): 48
Optimizer: AdamW8bit (bitsandbytes)
  scaler = GradScaler(enabled=use_amp)
[codecarbon INFO @ 19:03:11] [setup] RAM Tracking...
[codecarbon INFO @ 19:03:11] [setup] CPU Tracking...
 Linux OS detected: Please ensure RAPL files exist at /sys/class/powercap/intel-rapl/subsystem to measure CPU

[codecarbon INFO @ 19:03:12] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.20GHz
[codecarbon INFO @ 19:03:12] [setup] GPU Tracking...
[codecarbon INFO @ 19:03:12] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 19:03:12] The below tracking methods have been set up:
                RAM Tracking Method: RAM power estimation model
                CPU Tracking Method: global constant
                GPU Tracking Method: pynvml
            
[codecarbon INFO @ 19:03:12] >>> Tracker's metadata:
[codecarbon INFO @ 19:03:12]   Platform system: Li