This notebook demonstrates the approach to download the SmolLM2 model of 135M parameters and reverse Engineer to create the model from scratch and train for a custom data.

In [8]:
# ============================================================
#   SmolLM2-135M â€” Reverse Engineering / Architecture Dump
#   Goal: Inspect config, tokenizer, weights, and derive
#         all architecture details for re-implementation.
# ============================================================

!pip install transformers safetensors huggingface-hub accelerate -q
import json, torch
from huggingface_hub import hf_hub_download
from transformers import AutoModelForCausalLM, AutoTokenizer
from safetensors.torch import load_file
from pprint import pprint

In [9]:
REPO = "HuggingFaceTB/SmolLM2-135M"

print("==== Downloading model files ====")
cfg_path = hf_hub_download(REPO, "config.json")
tok_path = hf_hub_download(REPO, "tokenizer.json")
weights_path = hf_hub_download(REPO, "model.safetensors")

print("Paths:")
print("config.json:", cfg_path)
print("tokenizer.json:", tok_path)
print("model.safetensors:", weights_path)

==== Downloading model files ====
Paths:
config.json: /root/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M/snapshots/93efa2f097d58c2a74874c7e644dbc9b0cee75a2/config.json
tokenizer.json: /root/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M/snapshots/93efa2f097d58c2a74874c7e644dbc9b0cee75a2/tokenizer.json
model.safetensors: /root/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM2-135M/snapshots/93efa2f097d58c2a74874c7e644dbc9b0cee75a2/model.safetensors


In [None]:
# 1. Load CONFIG
print("\n======================== CONFIG.JSON ========================")
config = json.load(open(cfg_path))
pprint(config)


{'architectures': ['LlamaForCausalLM'],
 'attention_bias': False,
 'attention_dropout': 0.0,
 'bos_token_id': 0,
 'eos_token_id': 0,
 'hidden_act': 'silu',
 'hidden_size': 576,
 'initializer_range': 0.041666666666666664,
 'intermediate_size': 1536,
 'is_llama_config': True,
 'max_position_embeddings': 8192,
 'model_type': 'llama',
 'num_attention_heads': 9,
 'num_hidden_layers': 30,
 'num_key_value_heads': 3,
 'pretraining_tp': 1,
 'rms_norm_eps': 1e-05,
 'rope_interleaved': False,
 'rope_scaling': None,
 'rope_theta': 100000,
 'tie_word_embeddings': True,
 'torch_dtype': 'bfloat16',
 'transformers_version': '4.40.1',
 'use_cache': True,
 'vocab_size': 49152}


In [11]:
# Extract important config values
d_model       = config.get("hidden_size")
n_layers      = config.get("num_hidden_layers")
n_heads       = config.get("num_attention_heads")
n_kv_heads    = config.get("num_key_value_heads")
intermediate  = config.get("intermediate_size")
vocab_size    = config.get("vocab_size")
max_pos       = config.get("max_position_embeddings")
rope_theta    = config.get("rope_theta")
rope_scaling  = config.get("rope_scaling")
rope_inter    = config.get("rope_interleaved")
norm_eps      = config.get("rms_norm_eps")
activation    = config.get("hidden_act")

print("\n=== Extracted Architecture Summary ===")
print(f"Model dimension (d_model):          {d_model}")
print(f"Number of layers:                    {n_layers}")
print(f"Attention heads:                     {n_heads}")
print(f"Key/Value heads:                     {n_kv_heads}")
print(f"Intermediate (FFN hidden):           {intermediate}")
print(f"Vocabulary size:                     {vocab_size}")
print(f"Max positions:                       {max_pos}")
print(f"Activation:                           {activation}")
print(f"RMSNorm eps:                          {norm_eps}")
print(f"RoPE theta:                           {rope_theta}")
print(f"RoPE scaling:                         {rope_scaling}")
print(f"RoPE interleaved:                     {rope_inter}")

# Derived shapes
print("\n=== Derived Attention Dimensions ===")
head_dim = d_model // n_heads
print(f"Head dimension:                      {head_dim}")
print(f"QKV total dim:                       {d_model + d_model + (head_dim * n_kv_heads)} (subject to weight inspection)")



=== Extracted Architecture Summary ===
Model dimension (d_model):          576
Number of layers:                    30
Attention heads:                     9
Key/Value heads:                     3
Intermediate (FFN hidden):           1536
Vocabulary size:                     49152
Max positions:                       8192
Activation:                           silu
RMSNorm eps:                          1e-05
RoPE theta:                           100000
RoPE scaling:                         None
RoPE interleaved:                     False

=== Derived Attention Dimensions ===
Head dimension:                      64
QKV total dim:                       1344 (subject to weight inspection)


In [None]:
# 2. Load TOKENIZER
print("\n======================== TOKENIZER ==========================")
tokenizer = AutoTokenizer.from_pretrained(REPO)
print("Tokenizer vocab size:", len(tokenizer))
print("Tokenizer special tokens:", tokenizer.special_tokens_map)


Tokenizer vocab size: 49152
Tokenizer special tokens: {'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<|im_start|>', '<|im_end|>', '<repo_name>', '<reponame>', '<file_sep>', '<filename>', '<gh_stars>', '<issue_start>', '<issue_comment>', '<issue_closed>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<jupyter_script>', '<empty_output>']}


In [None]:
import re
from collections import defaultdict

# 3. Inspect WEIGHTS (fixed)
print("\n======================== WEIGHT KEYS ========================")
state = load_file(weights_path)   # assume this returns your checkpoint dict
all_keys = list(state.keys())

print("Total tensors:", len(all_keys))
print("\nFirst 300 keys:")
for k in all_keys[:300]:
    v = state[k]
    shape = getattr(v, "shape", None)
    print(k, shape)

# Group weights by layer (robustly find the token 'layers' then next part)
print("\n======================== LAYER GROUPING ========================")
layers = defaultdict(list)
pat = re.compile(r'layers\.(\d+)')  # fallback regex

for k in all_keys:
    if "layers." in k:
        parts = k.split('.')
        layer_id = None
        # try to find the 'layers' token and take the next element
        try:
            idx = parts.index('layers')
            if idx + 1 < len(parts):
                layer_id = parts[idx + 1]
        except ValueError:
            layer_id = None

        # fallback to regex if the above didn't find a numeric id
        if (layer_id is None) or (not str(layer_id).isdigit()):
            m = pat.search(k)
            if m:
                layer_id = m.group(1)

        # as final fallback, keep a sentinel group
        if not layer_id:
            layer_id = "__unknown__"

        layers[str(layer_id)].append(k)

print("Detected layer groups:", len(layers))
numeric_layer_ids = sorted([lid for lid in layers.keys() if lid.isdigit()], key=lambda x: int(x))
non_numeric_ids = sorted([lid for lid in layers.keys() if not lid.isdigit()])
print("Numeric layer count:", len(numeric_layer_ids))
print("Sample layer IDs:", numeric_layer_ids[:10], " ...")
if non_numeric_ids:
    print("Non-numeric groups (sample):", non_numeric_ids[:10])

# Print structure of each layer (numeric layers first)
print("\n======================== LAYER STRUCTURE DUMP ========================")
to_show = numeric_layer_ids[:5] if numeric_layer_ids else list(layers.keys())[:5]
for lid in to_show:
    print(f"\n----- Layer {lid} -----")
    for k in sorted(layers[lid]):
        v = state.get(k)
        shape = getattr(v, "shape", None)
        dtype = getattr(v, "dtype", None)
        print(k, "->", shape, dtype)
print("\n(Full layer dump printed for first 5 numeric layers. Modify slice to print all.)")



Total tensors: 272

First 300 keys:
model.embed_tokens.weight torch.Size([49152, 576])
model.layers.0.input_layernorm.weight torch.Size([576])
model.layers.0.mlp.down_proj.weight torch.Size([576, 1536])
model.layers.0.mlp.gate_proj.weight torch.Size([1536, 576])
model.layers.0.mlp.up_proj.weight torch.Size([1536, 576])
model.layers.0.post_attention_layernorm.weight torch.Size([576])
model.layers.0.self_attn.k_proj.weight torch.Size([192, 576])
model.layers.0.self_attn.o_proj.weight torch.Size([576, 576])
model.layers.0.self_attn.q_proj.weight torch.Size([576, 576])
model.layers.0.self_attn.v_proj.weight torch.Size([192, 576])
model.layers.1.input_layernorm.weight torch.Size([576])
model.layers.1.mlp.down_proj.weight torch.Size([576, 1536])
model.layers.1.mlp.gate_proj.weight torch.Size([1536, 576])
model.layers.1.mlp.up_proj.weight torch.Size([1536, 576])
model.layers.1.post_attention_layernorm.weight torch.Size([576])
model.layers.1.self_attn.k_proj.weight torch.Size([192, 576])
mode

In [None]:
# 4. Load HF model and print its architecture summary
print("\n======================== TRANSFORMERS MODEL SUMMARY ========================")
model = AutoModelForCausalLM.from_pretrained(
    REPO,
    torch_dtype=torch.bfloat16,
    device_map="cpu"
)

print(model)

# Print submodules with depth
print("\n======================== MODULE TREE ========================")
for name, module in model.named_modules():
    print(name, ":", module.__class__.__name__)

print("\nDone. You now have:")
print(" - All config")
print(" - Tokenizer details")
print(" - Weight key â†’ shape mapping")
print(" - First 5 layer structure")
print(" - Full transformers architecture tree")
print("we can use these to write an identical model.py implementation.")


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 576)
    (layers): ModuleList(
      (0-29): 30 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=576, out_features=576, bias=False)
          (k_proj): Linear(in_features=576, out_features=192, bias=False)
          (v_proj): Linear(in_features=576, out_features=192, bias=False)
          (o_proj): Linear(in_features=576, out_features=576, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
          (up_proj): Linear(in_features=576, out_features=1536, bias=False)
          (down_proj): Linear(in_features=1536, out_features=576, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((576,), eps=1e-05)
    (rotary_emb): Ll

From above all the info, here is the analysis:

ðŸ”¹Architectural Hyperparameters
===============================
hidden_size = 576
num_hidden_layers = 30
num_attention_heads = 9
num_key_value_heads = 3
intermediate_size = 1536
vocab_size = 49152
rms_norm_eps = 1e-6
activation = silu
rope_theta
rope_scaling
rope_interleaved


ðŸ”¹Weight Shapes
=============
model.layers.0.self_attn.q_proj.weight  (576, 576)
model.layers.0.self_attn.k_proj.weight  (576, 192)
model.layers.0.self_attn.v_proj.weight  (576, 192)
model.layers.0.self_attn.o_proj.weight  (576, 576)
model.layers.0.mlp.gate_proj.weight     (1536, 576)

These shapes tell you:
head_dim = 576 / 9 = 64
kv_dim = 3 Ã— 64 = 192
MLP dimension matches config (1536)
RoPE used instead of positional embeddings
RMSNorm everywhere

ðŸ”¹Layer structure
=================
model.embed_tokens
model.layers.0.attn_norm
model.layers.0.attn
model.layers.0.ffn_norm
model.layers.0.mlp
model.norm
model.lm_head