# Semiseparable 10B Parameter Feasibility (Colab analysis)

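This notebook estimates parameter counts and memory footprints, and runs a meta-device shape check (the model is constructed without allocating real memory). It does NOT train a 10B-parameter model on Colab.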

In [None]:
import os, sys, subprocess, math, json, torch
from types import SimpleNamespace

# Location of the project repository and the directory name `git clone` creates.
REPO_URL = "https://github.com/neko-jpg/Project-ResNet-BK-An-O-N-Language-Model-Architecture.git"
REPO_DIR = "Project-ResNet-BK-An-O-N-Language-Model-Architecture"

# Notebook cells get re-run. After the first run the working directory is
# already the repository root, so a plain "does REPO_DIR exist here?" test
# would fail and trigger a second, nested clone. Only clone / chdir when we
# are not already inside the checkout.
if os.path.basename(os.getcwd()) != REPO_DIR:
    if not os.path.exists(REPO_DIR):
        subprocess.run(["git", "clone", REPO_URL], check=True)
    os.chdir(REPO_DIR)

# Make the repository root importable so the `src.` package imports resolve.
if os.getcwd() not in sys.path:
    sys.path.append(os.getcwd())

from src.models.resnet_bk import LanguageModel as ResNetBK
from src.models.mamba_baseline import MambaLM, create_mamba_from_resnetbk_config

Estimators

In [None]:
def param_count_resnetbk(vocab_size, d_model, n_layers, n_seq):
    """Return the total parameter count of a ResNetBK model of the given size.

    The model is built under ``torch.device("meta")`` so that only parameter
    shapes are recorded and no real storage is allocated. ``numel()`` works on
    meta tensors, so the count is the same as for a materialized model, but
    large configurations no longer need to fit in host memory just to be
    counted.

    Args:
        vocab_size: Vocabulary size passed to the model constructor.
        d_model: Model width passed to the model constructor.
        n_layers: Number of layers passed to the model constructor.
        n_seq: Sequence length passed to the model constructor.

    Returns:
        Sum of ``numel()`` over ``model.parameters()``.
    """
    with torch.device("meta"):
        model = ResNetBK(
            vocab_size=vocab_size,
            d_model=d_model,
            n_layers=n_layers,
            n_seq=n_seq,
            # Fixed settings used for every estimate in this notebook.
            num_experts=4,
            top_k=1,
            dropout_p=0.1,
            use_scattering_router=False,
            use_birman_schwinger=False,
        )
    return sum(p.numel() for p in model.parameters())

def estimate_activation_memory(batch_size, seq_length, d_model, bytes_per_elem=2):
    """Estimate the bytes needed for one ``batch_size * seq_length * d_model`` tensor.

    This is a lower-bound style figure: it sizes a single activation tensor
    and does not scale with the number of layers.

    Args:
        batch_size: Number of sequences in the batch.
        seq_length: Tokens per sequence.
        d_model: Elements stored per token.
        bytes_per_elem: Storage size of one element (default 2).

    Returns:
        ``batch_size * seq_length * d_model * bytes_per_elem``.
    """
    element_count = batch_size * seq_length * d_model
    return element_count * bytes_per_elem

def estimate_total_memory(params, bytes_per_param=2, activation_bytes=0):
    """Estimate total bytes as parameter storage plus activation storage.

    Args:
        params: Number of model parameters.
        bytes_per_param: Storage size of one parameter (default 2).
        activation_bytes: Extra bytes to add for activations (default 0).

    Returns:
        ``params * bytes_per_param + activation_bytes``.
    """
    weight_bytes = params * bytes_per_param
    return weight_bytes + activation_bytes

Compute example sizes

In [None]:
# Model sizes to evaluate; every entry is fed to the estimators below.
vocab_size = 50_000
configs = [
    dict(name="baseline_small", d_model=256, n_layers=6, n_seq=2048),
    dict(name="mid", d_model=512, n_layers=16, n_seq=8192),
    dict(name="target_10b", d_model=2048, n_layers=48, n_seq=32768),
]

# One record per config: parameter count, activation bytes for a batch of one
# sequence, and their combined size at 2 bytes per value.
estimates = []
for cfg in configs:
    params = param_count_resnetbk(
        vocab_size,
        cfg["d_model"],
        cfg["n_layers"],
        cfg["n_seq"],
    )
    act_mem = estimate_activation_memory(
        batch_size=1,
        seq_length=cfg["n_seq"],
        d_model=cfg["d_model"],
        bytes_per_elem=2,
    )
    total_mem = estimate_total_memory(
        params,
        bytes_per_param=2,
        activation_bytes=act_mem,
    )
    estimates.append(
        dict(
            config=cfg,
            params=params,
            activation_bytes=act_mem,
            total_bytes_fp16=total_mem,
        )
    )

print(json.dumps(estimates, indent=2))

Meta-device shape check

In [None]:
# Build the largest configuration on the meta device: construction runs, but
# tensors carry shapes only and no real storage is allocated.
with torch.device("meta"):
    cfg = SimpleNamespace(
        vocab_size=50000,
        d_model=2048,
        n_layers=48,
        n_seq=32768,
        num_experts=4,
        top_k=1,
        dropout_p=0.1,
        use_scattering_router=False,
        use_birman_schwinger=False,
    )
    # Every attribute of cfg is a constructor keyword, so expand it directly.
    model_meta = ResNetBK(**vars(cfg))
    print("Meta model constructed (no real memory)")

What would be needed on Colab

In [None]:
# Summary of what a 10B-scale run would demand, printed as JSON for the reader.
requirements = dict(
    mixed_precision="fp16 or bf16 mandatory",
    activation_checkpointing=True,
    gradient_accumulation=True,
    offload="ZeRO/FSDP-style sharding not available on stock Colab",
    multi_gpu="Colab free tier is single T4; 10B would need multi-GPU or heavy offload",
    long_context="1M tokens would require chunking/state saving; not feasible on single T4",
)
print(json.dumps(requirements, indent=2))