In [1]:
import os
# Process-level settings applied before any deep-learning library is imported:
# an empty CUDA_VISIBLE_DEVICES is meant to expose no GPU (so the work below
# stays on CPU), and TF_FORCE_GPU_ALLOW_GROWTH is a TensorFlow memory option
# that presumably only matters if TensorFlow ends up being imported.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '',
    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
})

In [2]:
from transformers import LlamaConfig, LlamaForCausalLM, AutoTokenizer

In [4]:
# Start from the published model's configuration and override only the depth:
# the truncated model gets 10 decoder layers. The source model has 22 layers,
# so despite the name this is slightly less than half of the original stack.
half_config = LlamaConfig.from_pretrained('mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3', 
                                             num_hidden_layers = 10)
# Bare expression so the notebook displays the resulting configuration.
half_config

LlamaConfig {
  "_name_or_path": "fpf-1.1b-instructions-16k-call-v2/checkpoint-9000",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 32768,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 10,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.36.2",
  "use_cache": true,
  "vocab_size": 32000
}

In [5]:
# Build the 10-layer model from the configuration alone; no pretrained
# checkpoint is loaded here, the weights are copied in by the cells below.
small = LlamaForCausalLM(half_config)

In [6]:
# Total parameter count of the truncated model: the sum of element counts
# over every parameter tensor.
sum(p.numel() for p in small.parameters())

571516928

In [7]:
# Load the full pretrained model; it is the source of the weights that are
# copied into `small` further down.
model = LlamaForCausalLM.from_pretrained('mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3')

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [21]:
# Display the source model's module tree to inspect its layer layout.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

In [9]:
# List the indices 0..21 of the source model's 22 decoder layers, presumably
# as a visual aid for choosing which layers to keep.
list(range(22))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [12]:
import torch.nn as nn

def copy_layers(src_layers, dest_layers, layers_to_copy):
    """Load the weights of selected source layers into the destination layers.

    Args:
        src_layers: indexable container of modules to take weights from.
        dest_layers: ``nn.ModuleList`` that receives the weights, in order.
        layers_to_copy: iterable of indices into ``src_layers``; the i-th
            selected layer is loaded into the i-th destination layer.

    Raises:
        AssertionError: if the number of selected layers differs from the
            number of destination layers.
    """
    # Gather the chosen layers in a temporary ModuleList so that their
    # state-dict keys ("0.*", "1.*", ...) line up with those of dest_layers.
    selected = nn.ModuleList()
    for index in layers_to_copy:
        selected.append(src_layers[index])

    dest_count = len(dest_layers)
    selected_count = len(selected)
    assert dest_count == selected_count, f"{dest_count} != {selected_count}"

    dest_layers.load_state_dict(selected.state_dict())

# Keep the first five (0-4) and the last five (17-21) of the source model's
# 22 decoder layers; the middle layers 5-16 are dropped.
layers_to_copy = [0,1,2,3,4,17,18,19,20,21]

# Load the selected source layers' weights into the 10-layer model.
copy_layers(model.model.layers, small.model.layers, layers_to_copy)

In [13]:
# Copy the token embedding weights from the source model.
small.model.embed_tokens.load_state_dict(model.model.embed_tokens.state_dict())

<All keys matched successfully>

In [14]:
# Copy the final normalization layer's weights from the source model.
small.model.norm.load_state_dict(model.model.norm.state_dict())

<All keys matched successfully>

In [15]:
# Copy the language-model head weights from the source model.
small.lm_head.load_state_dict(model.lm_head.state_dict())

<All keys matched successfully>

In [19]:
import torch
# Cast the model to bfloat16 before it is uploaded. Module.to(dtype) converts
# only floating-point parameters and buffers, whereas Module.type(dtype) casts
# every tensor the module holds (integer buffers included), so .to() is the
# safer, idiomatic way to change precision.
small = small.to(torch.bfloat16)

In [20]:
# Upload the truncated model to the Hugging Face Hub repository.
small.push_to_hub('huseinzol05/570M-tinyllama')

model.safetensors:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/huseinzol05/570M-tinyllama/commit/c7cf99f56206b8f4a9693baf325506597a3ed36c', commit_message='Upload LlamaForCausalLM', commit_description='', oid='c7cf99f56206b8f4a9693baf325506597a3ed36c', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
# Load the source model's tokenizer; it is reused unchanged for the truncated
# model (the vocabulary size is the same in both configurations).
tokenizer = AutoTokenizer.from_pretrained('mesolitica/malaysian-tinyllama-1.1b-16k-instructions-v3')

tokenizer_config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

In [23]:
# Upload the tokenizer to the same Hub repository as the truncated model.
tokenizer.push_to_hub('huseinzol05/570M-tinyllama')

CommitInfo(commit_url='https://huggingface.co/huseinzol05/570M-tinyllama/commit/4487f41db06a786f861381468c41695b1a5cf56a', commit_message='Upload tokenizer', commit_description='', oid='4487f41db06a786f861381468c41695b1a5cf56a', pr_url=None, pr_revision=None, pr_num=None)