## DeepSeek-Coder

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Name the Hub checkpoint once so the weights and the tokenizer are always
# requested from the same repository.
DEEPSEEK_CHECKPOINT = "deepseek-ai/deepseek-coder-6.7b-instruct"

deepseek_model = AutoModelForCausalLM.from_pretrained(DEEPSEEK_CHECKPOINT)
deepseek_tokenizer = AutoTokenizer.from_pretrained(DEEPSEEK_CHECKPOINT)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.54s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Show which concrete class AutoModelForCausalLM.from_pretrained returned for this checkpoint.
type(deepseek_model)

transformers.models.llama.modeling_llama.LlamaForCausalLM

#### Finding Model Info

In [3]:
# Display the configuration object attached to the loaded model.
deepseek_model.config

LlamaConfig {
  "_name_or_path": "deepseek-ai/deepseek-coder-6.7b-instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 32013,
  "eos_token_id": 32021,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 16384,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": {
    "factor": 4.0,
    "type": "linear"
  },
  "rope_theta": 100000,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.4",
  "use_cache": true,
  "vocab_size": 32256
}

In [3]:
# Count the entries in the model's layer list (model.layers).
len(deepseek_model.model.layers)

32

In [5]:
# Walk the layer list in order and print each block's module tree.
decoder_stack = deepseek_model.model.layers
for decoder_block in decoder_stack:
    print(decoder_block)

LlamaDecoderLayer(
  (self_attn): LlamaSdpaAttention(
    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (rotary_emb): LlamaLinearScalingRotaryEmbedding()
  )
  (mlp): LlamaMLP(
    (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
    (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
    (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
    (act_fn): SiLU()
  )
  (input_layernorm): LlamaRMSNorm()
  (post_attention_layernorm): LlamaRMSNorm()
)
LlamaDecoderLayer(
  (self_attn): LlamaSdpaAttention(
    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (v_proj): Linear(in_features=4096, out_features=4096, bias=Fals

#### Training Specific Layers/Parameters

In [None]:
# Two freezing recipes, applied one after the other. Parameters are selected
# purely by substring match on their qualified names.

# Recipe 1: switch off gradients for every parameter whose name contains "layers.0".
first_layer_params = (
    tensor
    for qualified_name, tensor in deepseek_model.named_parameters()
    if "layers.0" in qualified_name
)
for tensor in first_layer_params:
    tensor.requires_grad = False

# Recipe 2: switch off gradients everywhere except parameters whose name contains "lm_head".
for qualified_name, tensor in deepseek_model.named_parameters():
    if "lm_head" in qualified_name:
        continue  # leave the output head trainable
    tensor.requires_grad = False

In [18]:
# List the qualified name of every registered parameter; the tensors
# themselves are not needed here.
for param_name, _tensor in deepseek_model.named_parameters():
    print(param_name)

model.embed_tokens.weight
model.layers.0.self_attn.q_proj.weight
model.layers.0.self_attn.k_proj.weight
model.layers.0.self_attn.v_proj.weight
model.layers.0.self_attn.o_proj.weight
model.layers.0.mlp.gate_proj.weight
model.layers.0.mlp.up_proj.weight
model.layers.0.mlp.down_proj.weight
model.layers.0.input_layernorm.weight
model.layers.0.post_attention_layernorm.weight
model.layers.1.self_attn.q_proj.weight
model.layers.1.self_attn.k_proj.weight
model.layers.1.self_attn.v_proj.weight
model.layers.1.self_attn.o_proj.weight
model.layers.1.mlp.gate_proj.weight
model.layers.1.mlp.up_proj.weight
model.layers.1.mlp.down_proj.weight
model.layers.1.input_layernorm.weight
model.layers.1.post_attention_layernorm.weight
model.layers.2.self_attn.q_proj.weight
model.layers.2.self_attn.k_proj.weight
model.layers.2.self_attn.v_proj.weight
model.layers.2.self_attn.o_proj.weight
model.layers.2.mlp.gate_proj.weight
model.layers.2.mlp.up_proj.weight
model.layers.2.mlp.down_proj.weight
model.layers.2.inp

#### Using TorchViz (Attentions and Hidden States)

In [None]:
# Install torchviz for the visualisation cells below.
# NOTE(review): the version is unpinned, and rendering graphs to PNG presumably
# also needs the Graphviz `dot` executable on PATH — confirm before sharing.
%pip install torchviz

In [None]:
from torchviz import make_dot
import torch

# Render the autograd graph behind the hidden states and attention weights of
# the DeepSeek-Coder model loaded at the top of this notebook.
#
# The forward pass must run with gradients enabled (no torch.no_grad()) and
# with trainable parameters, otherwise there is no graph for make_dot to draw.

# Tokenize a small code snippet.
text = "def hello_world():\n\tprint('Hello World!')"
inputs = deepseek_tokenizer(text, return_tensors="pt").to(deepseek_model.device)

# Extract token IDs
token_ids = inputs["input_ids"]

# Forward pass. Hidden states and attentions are only returned when they are
# requested explicitly; without these flags the attributes are None.
output = deepseek_model(
    token_ids,
    attention_mask=inputs["attention_mask"],
    output_hidden_states=True,
    output_attentions=True,
)

hidden_states = output.hidden_states  # tuple: embedding output, then one entry per layer
attentions = output.attentions        # tuple: one entry per layer

# Build the name -> parameter mapping once; it labels the leaf nodes in both graphs.
named_params = dict(deepseek_model.named_parameters())

# Visualize hidden states. Index 0 is the embedding output, so index 1 is the
# output of the first decoder layer.
g = make_dot(hidden_states[1], params=named_params)
g.render("hidden_states", format="png")

# Visualize attentions (optional): attention weights of the first layer.
if attentions is not None:
    g = make_dot(attentions[0], params=named_params)
    g.render("attentions", format="png")



In [None]:
import torch
from torchviz import make_dot  # imported here so this cell does not rely on an earlier cell

# Part 1 generates text from a prompt with the DeepSeek-Coder model.
# Part 2 runs a short forward pass and renders the autograd graph of its
# hidden states and attention weights.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
deepseek_model = deepseek_model.to(device)

# Tokenize the prompt. A single prompt needs no padding, and every input
# tensor has to live on the same device as the model.
text = "Write a hello world program in python"
inputs = deepseek_tokenizer(text, return_tensors="pt", truncation=True, max_length=128).to(device)

# Generate text
generate_ids = deepseek_model.generate(
    inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=512,
)

# Decode generated text
generated_text = deepseek_tokenizer.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(generated_text)

# To visualize the model's architecture, pass an input through it and then
# visualize the output.
sample = deepseek_tokenizer("This is a sample input", return_tensors="pt", truncation=True).to(device)
input_ids = sample.input_ids
attention_mask = sample.attention_mask

# Run with gradients enabled: under torch.no_grad() no autograd graph is
# recorded and make_dot has nothing to draw. Hidden states and attentions
# must also be requested explicitly, otherwise both attributes are None.
output = deepseek_model(
    input_ids,
    attention_mask=attention_mask,
    output_hidden_states=True,
    output_attentions=True,
)

# Access hidden states and attentions (if available)
hidden_states = output.hidden_states
attentions = output.attentions

named_params = dict(deepseek_model.named_parameters())

# Visualize hidden states. Index 0 is the embedding output, so index 1 is the
# output of the first decoder layer.
if hidden_states is not None:
    g = make_dot(hidden_states[1], params=named_params)
    g.render("hidden_states", format="png")

# Visualize attentions (optional)
if attentions is not None:
    g = make_dot(attentions[0], params=named_params)
    g.render("attentions", format="png")

## CodeLlama

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Name the Hub checkpoint once so the weights and the tokenizer are always
# requested from the same repository.
CODELLAMA_CHECKPOINT = "codellama/CodeLlama-7b-Instruct-hf"

codellama_model = AutoModelForCausalLM.from_pretrained(CODELLAMA_CHECKPOINT)
codellama_tokenizer = AutoTokenizer.from_pretrained(CODELLAMA_CHECKPOINT)

Loading checkpoint shards: 100%|██████████| 2/2 [00:28<00:00, 14.20s/it]


In [6]:
# Display the configuration object attached to the loaded CodeLlama model.
codellama_model.config

LlamaConfig {
  "_name_or_path": "codellama/CodeLlama-7b-Instruct-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 16384,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.4",
  "use_cache": true,
  "vocab_size": 32016
}

In [14]:
# Walk the layer list in order and print each block's module tree.
decoder_stack = codellama_model.model.layers
for decoder_block in decoder_stack:
    print(decoder_block)

LlamaDecoderLayer(
  (self_attn): LlamaSdpaAttention(
    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (rotary_emb): LlamaRotaryEmbedding()
  )
  (mlp): LlamaMLP(
    (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
    (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
    (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
    (act_fn): SiLU()
  )
  (input_layernorm): LlamaRMSNorm()
  (post_attention_layernorm): LlamaRMSNorm()
)
LlamaDecoderLayer(
  (self_attn): LlamaSdpaAttention(
    (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
    (o_pro

## StarCoder2

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Name the Hub checkpoint once so the weights and the tokenizer are always
# requested from the same repository.
STARCODER_CHECKPOINT = "bigcode/starcoder2-7b"

starcoder_model = AutoModelForCausalLM.from_pretrained(STARCODER_CHECKPOINT)
starcoder_tokenizer = AutoTokenizer.from_pretrained(STARCODER_CHECKPOINT)

Loading checkpoint shards: 100%|██████████| 3/3 [00:28<00:00,  9.64s/it]


In [9]:
# Display the configuration object attached to the loaded StarCoder2 model.
starcoder_model.config

Starcoder2Config {
  "_name_or_path": "bigcode/starcoder2-7b",
  "activation_function": "gelu",
  "architectures": [
    "Starcoder2ForCausalLM"
  ],
  "attention_dropout": 0.1,
  "attention_softmax_in_fp32": true,
  "bos_token_id": 0,
  "embedding_dropout": 0.1,
  "eos_token_id": 0,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_size": 4608,
  "initializer_range": 0.018042,
  "intermediate_size": 18432,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 16384,
  "mlp_type": "default",
  "model_type": "starcoder2",
  "norm_epsilon": 1e-05,
  "norm_type": "layer_norm",
  "num_attention_heads": 36,
  "num_hidden_layers": 32,
  "num_key_value_heads": 4,
  "residual_dropout": 0.1,
  "rope_theta": 1000000,
  "scale_attention_softmax_in_fp32": true,
  "scale_attn_weights": true,
  "sliding_window": 4096,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.4",
  "use_bias": true,
  "use_cache": true,
  "vocab_size": 49152
}

In [13]:
# Walk the layer list in order and print each block's module tree.
decoder_stack = starcoder_model.model.layers
for decoder_block in decoder_stack:
    print(decoder_block)

Starcoder2DecoderLayer(
  (self_attn): Starcoder2SdpaAttention(
    (q_proj): Linear(in_features=4608, out_features=4608, bias=True)
    (k_proj): Linear(in_features=4608, out_features=512, bias=True)
    (v_proj): Linear(in_features=4608, out_features=512, bias=True)
    (o_proj): Linear(in_features=4608, out_features=4608, bias=True)
    (rotary_emb): Starcoder2RotaryEmbedding()
  )
  (mlp): Starcoder2MLP(
    (c_fc): Linear(in_features=4608, out_features=18432, bias=True)
    (c_proj): Linear(in_features=18432, out_features=4608, bias=True)
    (act): PytorchGELUTanh()
  )
  (input_layernorm): LayerNorm((4608,), eps=1e-05, elementwise_affine=True)
  (post_attention_layernorm): LayerNorm((4608,), eps=1e-05, elementwise_affine=True)
)
Starcoder2DecoderLayer(
  (self_attn): Starcoder2SdpaAttention(
    (q_proj): Linear(in_features=4608, out_features=4608, bias=True)
    (k_proj): Linear(in_features=4608, out_features=512, bias=True)
    (v_proj): Linear(in_features=4608, out_features=5

In [12]:
# Show which concrete class AutoModelForCausalLM.from_pretrained returned for this checkpoint.
type(starcoder_model)

transformers.models.starcoder2.modeling_starcoder2.Starcoder2ForCausalLM