# Lesson 4: Preparing your model for training

In [1]:
# Ignore insignificant warnings (ex: deprecation warnings)
import warnings
warnings.filterwarnings('ignore')

# Set a seed value for reproducibility
import torch

def fix_torch_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

fix_torch_seed()

## 1. Model configuration

You'll configure models based on Meta's Llama family of models. The transformers library has several tools for working with these models, which you can read about [here](https://huggingface.co/docs/transformers/main/en/model_doc/qwen2).

Start by creating a `Qwen2Config` object to configure the architecture of the model:

In [2]:
from transformers import Qwen2Config, LlamaConfig

llama = LlamaConfig()
config = Qwen2Config()

print(config)


Qwen2Config {
  "attention_dropout": 0.0,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 22016,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "transformers_version": "4.49.0",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}



In [3]:
print(llama)

LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.49.0",
  "use_cache": true,
  "vocab_size": 32000
}



In [4]:
config.num_hidden_layers = 12       # reduced from 32 to 12
config.hidden_size = 1024           # reduced 1/4 from 4096 to 1024
config.intermediate_size = 4096     # reduced 1/3 from 22016 to 4096 (dimension of MLP representations)
config.num_key_value_heads = 8      # reduced 1/4 from 32 to 8 (defaults to num_attention_heads=32)
config.torch_dtype = "bfloat16"     # for half-precision training
config.use_cache = False            # 'True' is incompitable w/ gradient checkpointing
config.model_type
print(config)

Qwen2Config {
  "attention_dropout": 0.0,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 32,
  "num_hidden_layers": 12,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.49.0",
  "use_cache": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}



## 2. Weight initialization

In the next sections, you'll explore four different ways to initialize the weights of a model for training:
1. Random weight initialization
2. Using an existing model for continued pre-training
3. Downscaling an existing model
4. Upscaling an existing model

### Random weight initialization

Randomly initializing model weights sets all weights to values from a truncated normal distribution with mean 0 and standard deviation of 0.02. Values beyond  2-sigma from the mean are set to 0.

In [5]:
from transformers import Qwen2ForCausalLM
model = Qwen2ForCausalLM(config)
print(model)


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-11): 12 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (k_proj): Linear(in_features=1024, out_features=256, bias=True)
          (v_proj): Linear(in_features=1024, out_features=256, bias=True)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1024,), eps=1e-06)
    (rotary_emb): Qw

In [6]:
def print_nparams(model):
    '''Calculate the total number of model parameters'''
    nparams = sum(p.numel() for p in model.parameters())
    print(f"Total number of parameters is: {nparams}")

print_nparams(model) # 493661184 -> 493M

Total number of parameters is: 493661184


Take a look at a sample of the weights in a single layer:

In [7]:
layer_name = "model.layers.0.self_attn.q_proj.weight"

for name, param in model.named_parameters():
    if name == layer_name:
        print(f"First 30 weights of layer '{layer_name}': ")
        print(param.data.view(-1)[:30])
        break

First 30 weights of layer 'model.layers.0.self_attn.q_proj.weight': 
tensor([ 1.9314e-02, -1.2728e-02, -3.3356e-02, -5.4111e-03, -2.0671e-05,
         7.3875e-03, -1.1259e-02,  8.7001e-03,  2.5883e-02, -1.0809e-02,
        -8.6876e-04, -1.3179e-02, -4.6141e-03,  2.0350e-02, -3.2663e-02,
         2.6889e-02, -2.3778e-02, -1.4012e-02, -3.5504e-02,  6.4026e-03,
         1.5647e-02,  6.7447e-03,  1.2999e-03, -4.6823e-03,  6.5911e-04,
        -5.3404e-03, -1.8712e-02,  3.2474e-02,  1.0103e-02, -7.9912e-03])


Try using the model for inference:

In [8]:
# Load a tokenizer from Qwen2
from transformers import Qwen2Tokenizer
tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")

from transformers import TextStreamer

prompt = "I am an AI Developer. I love"

inputs = tokenizer(prompt, return_tensors='pt')

streamer = TextStreamer(
    tokenizer=tokenizer,
    skip_prompt=True,
    skip_special_tokens=True
)

output = model.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,
    max_new_tokens=128,
    do_sample=False
)

 _('设置了 _('设置了 _('设置了 _('在传奇失落CBS设置了 @$ 이제 @$🌌 @$🌌 @$🌌 @$🌌 @$ @$ @$ @$ @$.DropDownList casualty @$ @$ @$.DropDownList casualty @$.DropDownList casualty @$.DropDownList casualty @$ manners истории истории истории истории истории истории истории истории истории истории истории истории истории истории истории истории истории истории историиynchronized grossoutcome grossoutcomeMitchoutcomeMitchoutcomeMitchoutcomeMitchoutcomeMitchoutcomeMitchoutcomeMitchoutcomeMitchoutcomeMitchoutcomeMitchoutcomeMitchoutcome истории истории истории историиoutcomeoutcomeoutcomeoutcomeoutcomeoutcomeoutcomeoutcomeoutcomeoutcomeoutcomeoutcomeoutcomeoutcomeoutcomeoutcomeoutcomeoutcomeoutcomeoutcomeoutcome圄outcome Detected Detected Detected Detectedoutcome Detected Detected Detected Detectedoutcome Detectedoutcomeoutcomeoutcome


Remove the model from memory to avoid crashing the kernel:

In [9]:
# NOTE: We're running large models in a limited environment. Run me if you encounter any memory issues.
import gc
del model
del streamer
del output
gc.collect()

66

## 3. Reuse general pretrained model weights

If you load an existing model, you can use it as is to continue pretraining on new data.

In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name_or_path = "upstage/TinySolar-248m-4k"
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="cpu",
    torch_dtype=torch.bfloat16,    
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

print_nparams(model) #  248013824 => 248M

Total number of parameters is: 248013824


### Downscaling from a general pretrained model

Here you'll downscale the HuggingFaceTB/SmolLM2-360M-Instruct model from a 12 layer model to a 10 layer model.

In [11]:
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=256, bias=False)
          (v_proj): Linear(in_features=1024, out_features=256, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
      )
    )
    (norm): LlamaRMSNorm((1024,), eps=1e-06)
    (rotary_emb): 

Remove the middle two layers (layers 15 and 16) and update the configuration:

In [12]:
from transformers import AutoConfig
layers = model.model.layers
model.model.layers = layers[:5] + layers[-5:]

config = AutoConfig.from_pretrained(
    model_name_or_path,    
    num_hidden_layers=len(model.model.layers),
)
model.config = config

print_nparams(model)  # 217601024 => 217M

Total number of parameters is: 217601024


Clear the memory to avoid crashing the kernel:

In [13]:
# NOTE: We're running large models in a limited environment. Run me if you encounter any memory issues.
import gc
del model
gc.collect()

53

### Depth Upscaling from a general pretrained model

Here you are going to upscale the HuggingFaceTB/SmolLM2-360M-Instruct model from 12 layers to 16 layers. Here are the steps you'll take:
1. Configure a 16 layer model and initialize it with random weights
2. Load the 12 layer HuggingFaceTB/SmolLM2-360M-Instruct model into memory
3. Copy the bottom 8 and top 8 layers from the 12 layer model and use them to overwrite the random weights of the 16 layer model
4. Copy over the embedding and classifying layers to replace the randomly initialized counterparts in the 16 layer model

In [14]:
# config = LlamaConfig(
#     num_hidden_layers=16,  # We want our model to have 16 final layers
#     hidden_size=1024,
#     intermediate_size=4096,
#     num_attention_heads=32,
#     num_key_value_heads=8,
#     torch_dtype="bfloat16",
#     use_cache=False 
# )
# print(config)

In [15]:
config = Qwen2Config(
    num_hidden_layers=16,  # We want our model to have 16 final layers
    hidden_size=1024,
    intermediate_size=4096,
    num_attention_heads=32,
    num_key_value_heads=8,
    torch_dtype="bfloat16",
    use_cache=False 
)
print(config)

Qwen2Config {
  "attention_dropout": 0.0,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.49.0",
  "use_cache": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}



In [16]:

model = Qwen2ForCausalLM(config)
model = model.to(dtype=torch.bfloat16) # convert to bfloat16
print_nparams(model) # 554492928 -> 554M

Total number of parameters is: 554492928


In [17]:
model_name_or_path = "upstage/TinySolar-248m-4k"
pretrained_model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="cpu",
    torch_dtype=torch.bfloat16,    
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

print_nparams(pretrained_model) #  248013824 => 248M

Total number of parameters is: 248013824


In [18]:
from copy import deepcopy

model.model.layers = deepcopy(pretrained_model.model.layers[:-4]) \
    + deepcopy(pretrained_model.model.layers[4:])

model.model.embed_tokens = deepcopy(pretrained_model.model.embed_tokens)

model.lm_head = deepcopy(pretrained_model.lm_head)

print(model.config)

Qwen2Config {
  "_attn_implementation_autoset": true,
  "attention_dropout": 0.0,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.49.0",
  "use_cache": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}



In [None]:
print_nparams(model) # 308839424 -> 308M

Total number of parameters is: 308839424


In [20]:
# Load a tokenizer from Qwen2
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

from transformers import TextStreamer

prompt = "I am an AI Developer. I love"

inputs = tokenizer(prompt, return_tensors='pt')

streamer = TextStreamer(
    tokenizer=tokenizer,
    skip_prompt=True,
    skip_special_tokens=True
)

output = model.generate(
    **inputs,
    streamer=streamer,
    use_cache=True,
    max_new_tokens=128,
    do_sample=False,

)

the way you can use the data to make a better and more efficient use of your time.

A: I've been using the data-mining-in-the-future-of-the-difference-between-the-difference-between-the-difference-between-the-difference-between-the-difference-between-the-difference-between-the-difference-between-the-difference-and-the-the-difference-between-the-difference-and-the-the-difference-between-the-difference-


### Save the model to disk

Note the new model name here which reflects the 308 million parameters of the new, upscaled model. 

In [22]:
model.save_pretrained('./data/TinySolar-308m-4k-init')