# Load Bielik-1.5B-v3.0-Instruct with Layer Names

This notebook loads the Bielik-1.5B-v3.0-Instruct model from HuggingFace and displays all available layer names.

In [20]:
from pathlib import Path
from mi_crow.language_model.language_model import LanguageModel
from mi_crow.store.local_store import LocalStore

# HuggingFace model identifier passed to LanguageModel.from_huggingface in the loading cell.
MODEL_ID = "speakleash/Bielik-1.5B-v3.0-Instruct"
# Base path handed to LocalStore; a relative path, so it resolves against the current working directory.
STORE_DIR = Path("./store")

In [21]:
# Announce which checkpoint is about to be loaded.
print("üì• Loading Bielik model from HuggingFace...")
print("Model: {}\n".format(MODEL_ID))

# Build the local store first, then hand it to the language-model wrapper.
store = LocalStore(base_path=STORE_DIR)
lm = LanguageModel.from_huggingface(MODEL_ID, store=store)

# Post-load summary: device, model id, and the store's base path as reported by the wrapper.
load_summary = [
    "‚úÖ Model loaded successfully!",
    f"üì± Device: {lm.context.device}",
    f"üÜî Model ID: {lm.model_id}",
    f"üìÅ Store location: {lm.context.store.base_path}",
]
print("\n".join(load_summary))
print()

üì• Loading Bielik model from HuggingFace...
Model: speakleash/Bielik-1.5B-v3.0-Instruct

‚úÖ Model loaded successfully!
üì± Device: cpu
üÜî Model ID: speakleash_Bielik-1.5B-v3.0-Instruct
üìÅ Store location: store



In [22]:
# Names of all layers exposed by the wrapper; reused by the cells below.
layer_names = lm.layers.get_layer_names()

separator = "=" * 80
print(f"üîç Found {len(layer_names)} layers in the model\n")
print(separator, "All Layer Names:", separator, sep="\n")
# One row per layer: right-aligned running index followed by the name.
for position, layer_name in enumerate(layer_names):
    print("{:4d}: {}".format(position, layer_name))
print(separator)

üîç Found 422 layers in the model

All Layer Names:
   0: llamaforcausallm_model
   1: llamaforcausallm_model_embed_tokens
   2: llamaforcausallm_model_layers
   3: llamaforcausallm_model_layers_0
   4: llamaforcausallm_model_layers_0_self_attn
   5: llamaforcausallm_model_layers_0_self_attn_q_proj
   6: llamaforcausallm_model_layers_0_self_attn_k_proj
   7: llamaforcausallm_model_layers_0_self_attn_v_proj
   8: llamaforcausallm_model_layers_0_self_attn_o_proj
   9: llamaforcausallm_model_layers_0_mlp
  10: llamaforcausallm_model_layers_0_mlp_gate_proj
  11: llamaforcausallm_model_layers_0_mlp_up_proj
  12: llamaforcausallm_model_layers_0_mlp_down_proj
  13: llamaforcausallm_model_layers_0_mlp_act_fn
  14: llamaforcausallm_model_layers_0_input_layernorm
  15: llamaforcausallm_model_layers_0_post_attention_layernorm
  16: llamaforcausallm_model_layers_1
  17: llamaforcausallm_model_layers_1_self_attn
  18: llamaforcausallm_model_layers_1_self_attn_q_proj
  19: llamaforcausallm_model_la

In [23]:
# Headline figures; fall back to 'N/A' when no layer names were returned.
if layer_names:
    first_layer, last_layer = layer_names[0], layer_names[-1]
else:
    first_layer = last_layer = 'N/A'

print("\nüìä Layer Information Summary:")
print(f"   Total layers: {len(layer_names)}")
print(f"   First layer: {first_layer}")
print(f"   Last layer: {last_layer}")

# Case-insensitive substring match: a name qualifies if it contains any marker.
markers = ('transformer', 'layer', 'h_')
transformer_layers = [
    candidate
    for candidate in layer_names
    if any(marker in candidate.lower() for marker in markers)
]

# Show a short preview and summarise the remainder instead of listing everything.
preview_size = 10
if transformer_layers:
    remaining = len(transformer_layers) - preview_size
    print(f"\nüéØ Found {len(transformer_layers)} transformer-related layers:")
    for shown in transformer_layers[:preview_size]:
        print(f"   - {shown}")
    if remaining > 0:
        print(f"   ... and {remaining} more")


üìä Layer Information Summary:
   Total layers: 422
   First layer: llamaforcausallm_model
   Last layer: llamaforcausallm_lm_head

üéØ Found 417 transformer-related layers:
   - llamaforcausallm_model_layers
   - llamaforcausallm_model_layers_0
   - llamaforcausallm_model_layers_0_self_attn
   - llamaforcausallm_model_layers_0_self_attn_q_proj
   - llamaforcausallm_model_layers_0_self_attn_k_proj
   - llamaforcausallm_model_layers_0_self_attn_v_proj
   - llamaforcausallm_model_layers_0_self_attn_o_proj
   - llamaforcausallm_model_layers_0_mlp
   - llamaforcausallm_model_layers_0_mlp_gate_proj
   - llamaforcausallm_model_layers_0_mlp_up_proj
   ... and 407 more


In [24]:
# Print each layer name with its weight shape ("No weight" where there is none), as seen in the output below.
lm.layers.print_layer_names()

llamaforcausallm_model: No weight
llamaforcausallm_model_embed_tokens: torch.Size([32000, 1536])
llamaforcausallm_model_layers: No weight
llamaforcausallm_model_layers_0: No weight
llamaforcausallm_model_layers_0_self_attn: No weight
llamaforcausallm_model_layers_0_self_attn_q_proj: torch.Size([1536, 1536])
llamaforcausallm_model_layers_0_self_attn_k_proj: torch.Size([256, 1536])
llamaforcausallm_model_layers_0_self_attn_v_proj: torch.Size([256, 1536])
llamaforcausallm_model_layers_0_self_attn_o_proj: torch.Size([1536, 1536])
llamaforcausallm_model_layers_0_mlp: No weight
llamaforcausallm_model_layers_0_mlp_gate_proj: torch.Size([8960, 1536])
llamaforcausallm_model_layers_0_mlp_up_proj: torch.Size([8960, 1536])
llamaforcausallm_model_layers_0_mlp_down_proj: torch.Size([1536, 8960])
llamaforcausallm_model_layers_0_mlp_act_fn: No weight
llamaforcausallm_model_layers_0_input_layernorm: torch.Size([1536])
llamaforcausallm_model_layers_0_post_attention_layernorm: torch.Size([1536])
llamafor

In [25]:
# Closing pointer for the reader: which variable holds the names, plus a sample entry.
sample_name = layer_names[0] if layer_names else 'N/A'
print("\nüíæ Layer names saved to variable 'layer_names'")
print("   Access with: layer_names")
print("   Example: layer_names[0] = '{}'".format(sample_name))
print("\n‚úÖ Model ready for use!")


üíæ Layer names saved to variable 'layer_names'
   Access with: layer_names
   Example: layer_names[0] = 'llamaforcausallm_model'

‚úÖ Model ready for use!


## Filter: Post-Attention LayerNorm Layers

Filter and display only the `_post_attention_layernorm` layers (resid_mid - residual stream after attention, before MLP).

In [26]:
# Keep only the names that contain the post-attention layernorm suffix.
post_attention_layers = [
    candidate for candidate in layer_names if "_post_attention_layernorm" in candidate
]

divider = "=" * 80
print(f"üéØ Found {len(post_attention_layers)} post-attention layernorm layers:\n")
print(divider)
for position, layer_name in enumerate(post_attention_layers):
    # Look the module up by name and report its weight shape when it has a weight.
    module = lm.layers.name_to_layer[layer_name]
    weight = getattr(module, 'weight', None)
    if weight is None:
        weight_info = 'No weight'
    else:
        weight_info = weight.shape
    print(f"{position:3d}: {layer_name}")
    print(f"     Weight shape: {weight_info}")
print(divider)

üéØ Found 32 post-attention layernorm layers:

  0: llamaforcausallm_model_layers_0_post_attention_layernorm
     Weight shape: torch.Size([1536])
  1: llamaforcausallm_model_layers_1_post_attention_layernorm
     Weight shape: torch.Size([1536])
  2: llamaforcausallm_model_layers_2_post_attention_layernorm
     Weight shape: torch.Size([1536])
  3: llamaforcausallm_model_layers_3_post_attention_layernorm
     Weight shape: torch.Size([1536])
  4: llamaforcausallm_model_layers_4_post_attention_layernorm
     Weight shape: torch.Size([1536])
  5: llamaforcausallm_model_layers_5_post_attention_layernorm
     Weight shape: torch.Size([1536])
  6: llamaforcausallm_model_layers_6_post_attention_layernorm
     Weight shape: torch.Size([1536])
  7: llamaforcausallm_model_layers_7_post_attention_layernorm
     Weight shape: torch.Size([1536])
  8: llamaforcausallm_model_layers_8_post_attention_layernorm
     Weight shape: torch.Size([1536])
  9: llamaforcausallm_model_layers_9_post_attention_

In [27]:
# Sample entries for the summary; 'N/A' stands in when the filtered list is empty.
if post_attention_layers:
    first_example, last_example = post_attention_layers[0], post_attention_layers[-1]
else:
    first_example = last_example = 'N/A'

print("\nüíæ Post-attention layernorm layer names saved to variable 'post_attention_layers'")
print(f"   Total count: {len(post_attention_layers)}")
for shown_index, example in ((0, first_example), (-1, last_example)):
    print(f"   Example: post_attention_layers[{shown_index}] = '{example}'")


üíæ Post-attention layernorm layer names saved to variable 'post_attention_layers'
   Total count: 32
   Example: post_attention_layers[0] = 'llamaforcausallm_model_layers_0_post_attention_layernorm'
   Example: post_attention_layers[-1] = 'llamaforcausallm_model_layers_31_post_attention_layernorm'


## Layer Selection Recommendations (Based on Research)

Based on recent SAE research (LLaMA-Scope, Kissane et al., Olson et al.):

**Key Findings:**
- **Residual stream (post-MLP) is optimal** - highest explained variance, lowest ΔLM loss
- **Mid-to-upper layers (15-24) are best** - balance expressivity and abstraction
- **Avoid very early layers (0-6)** - too little structure
- **Avoid very late layers (30-31)** - layer 31 showed anomalous behavior
- **Validated layers**: 7, 15, 23 showed robust performance

**For Bielik-1.5B (32 layers):**
- **Recommended range**: Layers 15-24 (mid-to-upper third)
- **Also validated**: Layers 7, 15, 23
- **Avoid**: Very early (0-6) and very late (30-31)

In [28]:
import re

def extract_layer_number(layer_name):
    """Return the numeric layer index embedded in a post-attention layernorm name.

    The name must contain ``_layers_<digits>_post_attention_layernorm``;
    the digits are returned as an ``int``. Returns ``None`` when the
    pattern is not found anywhere in ``layer_name``.
    """
    found = re.search(r'_layers_(\d+)_post_attention_layernorm', layer_name)
    if found is None:
        return None
    return int(found.group(1))

# Index the post-attention layernorm names by the layer number parsed from each name;
# names the helper cannot parse are skipped.
layer_numbers = {}
for layer_name in post_attention_layers:
    parsed = extract_layer_number(layer_name)
    if parsed is None:
        continue
    layer_numbers[parsed] = layer_name

total_layers = len(post_attention_layers)
rule = "=" * 80
print(f"üìä Layer Analysis for Bielik-1.5B ({total_layers} layers total)\n")
print(rule)

# Bucket the available layer numbers using the ranges from the recommendations section.
recommended_layers = [n for n in layer_numbers if 15 <= n <= 24]
validated_layers = [7, 15, 23]
avoid_early = [n for n in layer_numbers if 0 <= n <= 6]
avoid_late = [n for n in layer_numbers if 30 <= n <= 31]

# Each entry: (heading, layer numbers to list, whether a "Total" footer follows).
report_sections = [
    ("‚úÖ RECOMMENDED (Mid-to-upper layers, 15-24):", recommended_layers, True),
    (
        "‚úÖ VALIDATED (Tested in research, layers 7, 15, 23):",
        # Validated numbers are fixed, so list only those present in this model.
        [n for n in validated_layers if n in layer_numbers],
        False,
    ),
    ("‚ö†Ô∏è  AVOID - Very Early (0-6, too little structure):", avoid_early, True),
    ("‚ö†Ô∏è  AVOID - Very Late (30-31, anomalous behavior):", avoid_late, True),
]
for heading, numbers, with_total in report_sections:
    print(heading)
    for n in sorted(numbers):
        print(f"   Layer {n:2d}: {layer_numbers[n]}")
    if with_total:
        print(f"   Total: {len(numbers)} layers\n")
    else:
        print()

print(rule)
print("\nüí° RECOMMENDATION: Use layers 15-24 for best results")
print(f"   Example: {layer_numbers.get(16, 'N/A')}")
print("   (Layer 16 is in the middle of the recommended range)")

üìä Layer Analysis for Bielik-1.5B (32 layers total)

‚úÖ RECOMMENDED (Mid-to-upper layers, 15-24):
   Layer 15: llamaforcausallm_model_layers_15_post_attention_layernorm
   Layer 16: llamaforcausallm_model_layers_16_post_attention_layernorm
   Layer 17: llamaforcausallm_model_layers_17_post_attention_layernorm
   Layer 18: llamaforcausallm_model_layers_18_post_attention_layernorm
   Layer 19: llamaforcausallm_model_layers_19_post_attention_layernorm
   Layer 20: llamaforcausallm_model_layers_20_post_attention_layernorm
   Layer 21: llamaforcausallm_model_layers_21_post_attention_layernorm
   Layer 22: llamaforcausallm_model_layers_22_post_attention_layernorm
   Layer 23: llamaforcausallm_model_layers_23_post_attention_layernorm
   Layer 24: llamaforcausallm_model_layers_24_post_attention_layernorm
   Total: 10 layers

‚úÖ VALIDATED (Tested in research, layers 7, 15, 23):
   Layer  7: llamaforcausallm_model_layers_7_post_attention_layernorm
   Layer 15: llamaforcausallm_model_layers_1

In [29]:
# Resolve the numeric selections into concrete layer names, ordered by layer number.
recommended_layer_names = [layer_numbers[num] for num in sorted(recommended_layers)]
# The validated numbers are fixed, so keep only those that exist in this model.
validated_layer_names = [layer_numbers[num] for num in sorted(validated_layers) if num in layer_numbers]

print("üíæ Recommended layer names saved to variables:")
print(f"   'recommended_layer_names' = {len(recommended_layer_names)} layers (15-24)")
print(f"   'validated_layer_names' = {len(validated_layer_names)} layers (7, 15, 23)")
print()
print("üìù Quick access examples:")
if recommended_layer_names:
    middle_index = len(recommended_layer_names) // 2
    print(f"   Recommended: recommended_layer_names[0] = '{recommended_layer_names[0]}'")
    print(f"   Middle layer: recommended_layer_names[{middle_index}] = '{recommended_layer_names[middle_index]}'")
if validated_layer_names:
    # Prefer the second entry, falling back to the first when only one exists.
    # The printed index is the one actually displayed, so label and value always agree.
    validated_index = 1 if len(validated_layer_names) > 1 else 0
    print(f"   Validated: validated_layer_names[{validated_index}] = '{validated_layer_names[validated_index]}'")

üíæ Recommended layer names saved to variables:
   'recommended_layer_names' = 10 layers (15-24)
   'validated_layer_names' = 3 layers (7, 15, 23)

üìù Quick access examples:
   Recommended: recommended_layer_names[0] = 'llamaforcausallm_model_layers_15_post_attention_layernorm'
   Middle layer: recommended_layer_names[5] = 'llamaforcausallm_model_layers_20_post_attention_layernorm'
   Validated: validated_layer_names[1] = 'llamaforcausallm_model_layers_15_post_attention_layernorm'
