# Semiseparable 10B Parameter Feasibility (Colab analysis)

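Estimates parameter counts and memory footprints for several model sizes, then instantiates the 10B-target configuration on PyTorch's meta device as a construction/shape sanity check. This notebook does NOT train a 10B model on Colab.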

In [ ]:
# Stable install: Torch 2.3.1 cu121 + deps pinned for transformers/sklearn/scipy
# If you just upgraded pip/setuptools, restart runtime once before running below.
!pip install --upgrade --no-cache-dir pip setuptools wheel ninja packaging cmake jedi
# Pin numpy/scipy/sklearn to avoid ABI issues on Colab (Py3.12)
!pip install --force-reinstall --no-cache-dir numpy==2.1.4 scipy==1.13.1 scikit-learn==1.5.2 transformers==4.43.4 datasets==2.20.0 matplotlib==3.8.4
!pip install --force-reinstall --no-cache-dir torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 --index-url https://download.pytorch.org/whl/cu121
%env FORCE_CUDA=1
%env MAX_JOBS=4
%env TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6"
!pip install --no-cache-dir --no-build-isolation mamba-ssm==2.2.2 --extra-index-url https://download.pytorch.org/whl/cu121


In [None]:
# Repo setup: locate an existing checkout (or clone one), cd into it, and put it on sys.path
import os
import pathlib
import subprocess
import sys

REPO_URL = 'https://github.com/neko-jpg/Project-ResNet-BK-An-O-N-Language-Model-Architecture.git'
REPO_DIR = 'Project-ResNet-BK-An-O-N-Language-Model-Architecture'

cwd = pathlib.Path.cwd()
# Probe order: the current directory, its parent, then a REPO_DIR checkout under either one.
candidates = [cwd, cwd.parent, cwd / REPO_DIR, cwd.parent / REPO_DIR]
# A directory counts as the repo root as soon as it contains a `src` entry.
root = next(filter(lambda path: (path / 'src').exists(), candidates), None)

if root is None:
    # Nothing matched: fall back to ./REPO_DIR and clone only when that path is absent.
    root = cwd / REPO_DIR
    if not root.exists():
        subprocess.run(['git', 'clone', REPO_URL, str(root)], check=True)

# Switch the working directory to the repo root unless we are already there.
if pathlib.Path.cwd() != root:
    os.chdir(root)

# Make the working directory importable so `src....` imports resolve.
root_str = str(pathlib.Path.cwd())
if root_str not in sys.path:
    sys.path.insert(0, root_str)
print('PWD:', root_str)


In [None]:
import json
import torch
from types import SimpleNamespace
from src.models.resnet_bk import LanguageModel as ResNetBK


Estimators

In [None]:
from src.models.resnet_bk import LanguageModel as ResNetBK

def param_count_resnetbk(vocab_size, d_model, n_layers, n_seq):
    """Count the parameters of a ResNetBK language model without allocating its weights.

    The model is instantiated under ``torch.device("meta")`` so that counting the
    parameters of a very large configuration does not materialize the weights in
    RAM; ``numel()`` only needs each parameter's shape.

    Args:
        vocab_size: Forwarded to the ``ResNetBK`` constructor.
        d_model: Forwarded to the ``ResNetBK`` constructor.
        n_layers: Forwarded to the ``ResNetBK`` constructor.
        n_seq: Forwarded to the ``ResNetBK`` constructor.

    Returns:
        Total number of elements summed over ``model.parameters()``.

    Note:
        The remaining constructor options are fixed here: ``num_experts=4``,
        ``top_k=1``, ``dropout_p=0.1``, ``use_scattering_router=False`` and
        ``use_birman_schwinger=False``.
    """
    # Meta tensors carry shape/dtype metadata only, so construction needs no real memory.
    with torch.device("meta"):
        model = ResNetBK(
            vocab_size=vocab_size,
            d_model=d_model,
            n_layers=n_layers,
            n_seq=n_seq,
            num_experts=4,
            top_k=1,
            dropout_p=0.1,
            use_scattering_router=False,
            use_birman_schwinger=False,
        )
    return sum(p.numel() for p in model.parameters())

def estimate_activation_memory(batch_size, seq_length, d_model, bytes_per_elem=2):
    """Return the byte size of a ``batch_size * seq_length * d_model`` block of elements.

    Args:
        batch_size: Number of sequences.
        seq_length: Number of positions per sequence.
        d_model: Number of elements per position.
        bytes_per_elem: Storage size of one element in bytes (default 2).

    Returns:
        ``batch_size * seq_length * d_model * bytes_per_elem``.
    """
    element_count = batch_size * seq_length * d_model
    return element_count * bytes_per_elem

def estimate_total_memory(params, bytes_per_param=2, activation_bytes=0):
    """Return parameter storage plus activation storage, in bytes.

    Args:
        params: Number of parameters.
        bytes_per_param: Storage size of one parameter in bytes (default 2).
        activation_bytes: Extra bytes to add on top of the parameter storage (default 0).

    Returns:
        ``params * bytes_per_param + activation_bytes``.
    """
    weight_bytes = params * bytes_per_param
    return weight_bytes + activation_bytes

Compute example sizes

In [None]:
# Model sizes to evaluate, smallest to largest; all share one vocabulary size.
vocab_size = 50000
configs = [
    dict(name="baseline_small", d_model=256, n_layers=6, n_seq=2048),
    dict(name="mid", d_model=512, n_layers=16, n_seq=8192),
    dict(name="target_10b", d_model=2048, n_layers=48, n_seq=32768),
]

# One record per configuration: parameter count, activation bytes, and their combined total.
estimates = []
for cfg in configs:
    width, depth, context = cfg["d_model"], cfg["n_layers"], cfg["n_seq"]
    params = param_count_resnetbk(vocab_size, width, depth, context)
    # Batch of one sequence with 2-byte elements.
    act_mem = estimate_activation_memory(batch_size=1, seq_length=context, d_model=width, bytes_per_elem=2)
    # 2 bytes per parameter, plus the activation bytes computed above.
    total_mem = estimate_total_memory(params, bytes_per_param=2, activation_bytes=act_mem)
    estimates.append(dict(config=cfg, params=params, activation_bytes=act_mem, total_bytes_fp16=total_mem))

print(json.dumps(estimates, indent=2))

Meta-device shape check

In [None]:
from src.models.resnet_bk import LanguageModel as ResNetBK

# Constructor arguments for the 10B-target model, spelled out once and splatted below.
cfg = SimpleNamespace(
    vocab_size=50000,
    d_model=2048,
    n_layers=48,
    n_seq=32768,
    num_experts=4,
    top_k=1,
    dropout_p=0.1,
    use_scattering_router=False,
    use_birman_schwinger=False,
)

# Build the model with "meta" as the default device for tensors created during construction.
with torch.device("meta"):
    model_meta = ResNetBK(**vars(cfg))
    print("Meta model constructed (no real memory)")

What would be needed on Colab

In [None]:
# Summary of what a 10B-scale run would need, printed as JSON.
requirements = dict(
    mixed_precision="fp16 or bf16 mandatory",
    activation_checkpointing=True,
    gradient_accumulation=True,
    offload="ZeRO/FSDP-style sharding not available on stock Colab",
    multi_gpu="Colab free tier is single T4; 10B would need multi-GPU or heavy offload",
    long_context="1M tokens would require chunking/state saving; not feasible on single T4",
)
print(json.dumps(requirements, indent=2))