In [1]:
import os
# Must run before any CUDA-aware library is imported.
# An empty CUDA_VISIBLE_DEVICES hides every GPU, so the weight surgery in this
# notebook stays on the CPU.
# NOTE(review): TF_FORCE_GPU_ALLOW_GROWTH looks like a TensorFlow setting; no
# TensorFlow usage is visible in this notebook — confirm it is still needed.
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '',
    'TF_FORCE_GPU_ALLOW_GROWTH': 'true',
})

In [2]:
from transformers import LlamaConfig, LlamaForCausalLM, AutoTokenizer

In [3]:
# Start from the source checkpoint's config and override only the depth, so
# the truncated model keeps every other hyper-parameter of the original.
# NOTE(review): the name says "half", but the override keeps 9 layers —
# confirm against the source model's layer count.
half_config = LlamaConfig.from_pretrained('mesolitica/llama-13b-hf-32768-fpf', 
                                             num_hidden_layers = 9)
# Bare expression: displays the resolved config as the cell output.
half_config

LlamaConfig {
  "_name_or_path": "fpf-13b-32k/checkpoint-7650",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "initializer_range": 0.02,
  "intermediate_size": 13824,
  "max_position_embeddings": 32768,
  "model_type": "llama",
  "num_attention_heads": 40,
  "num_hidden_layers": 9,
  "num_key_value_heads": 40,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.35.2",
  "use_cache": true,
  "vocab_size": 32000
}

In [4]:
# Build the truncated architecture from the config alone (no checkpoint is
# loaded here); later cells overwrite its decoder layers, embeddings, final
# norm and LM head with weights taken from `model`.
small = LlamaForCausalLM(half_config)

In [5]:
# Load the full source checkpoint; it is only used as the donor of weights
# for `small` in the cells below.
model = LlamaForCausalLM.from_pretrained('mesolitica/llama-13b-hf-32768-fpf')

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
import torch.nn as nn

def copy_layers(src_layers, dest_layers, layers_to_copy):
    """Load the weights of selected source layers into ``dest_layers``.

    Args:
        src_layers: Indexable container of modules to copy from.
        dest_layers: ``nn.ModuleList`` that receives the weights; it must hold
            exactly as many modules as there are indices in ``layers_to_copy``.
        layers_to_copy: Iterable of indices into ``src_layers``. The i-th
            index supplies the weights for ``dest_layers[i]``.

    Raises:
        AssertionError: If the number of selected layers differs from
            ``len(dest_layers)``.
    """
    # Wrap the picked layers in a ModuleList so their state-dict keys are
    # renumbered 0..k-1, matching the key layout of ``dest_layers``.
    selected = nn.ModuleList()
    for index in layers_to_copy:
        selected.append(src_layers[index])

    n_dest, n_selected = len(dest_layers), len(selected)
    assert n_dest == n_selected, f"{n_dest} != {n_selected}"

    dest_layers.load_state_dict(selected.state_dict())

# Take the first N decoder layers of the donor, where N is however many layers
# `small` was built with. Deriving the indices from the destination keeps this
# cell in step with `num_hidden_layers` instead of repeating the depth as a
# hand-written list.
layers_to_copy = list(range(len(small.model.layers)))

copy_layers(model.model.layers, small.model.layers, layers_to_copy)

In [7]:
# Copy the token-embedding weights from the donor model into `small`.
small.model.embed_tokens.load_state_dict(model.model.embed_tokens.state_dict())

<All keys matched successfully>

In [8]:
# Copy the donor's `model.norm` weights into `small`.
small.model.norm.load_state_dict(model.model.norm.state_dict())

<All keys matched successfully>

In [9]:
# Copy the donor's LM-head weights into `small`; it is copied on its own
# because the embedding copy above does not touch `lm_head`.
small.lm_head.load_state_dict(model.lm_head.state_dict())

<All keys matched successfully>

In [11]:
# Total number of parameter elements in the truncated model (displayed as the
# cell output).
sum(p.numel() for p in small.parameters())

3182525440

In [14]:
import torch

# Display-only expression: echoes the dtype object and has no side effects.
# NOTE(review): looks like a leftover from interactive exploration — safe to
# delete if so.
torch.bfloat16

torch.bfloat16

In [15]:
# Cast the truncated model to bfloat16 before upload.
# `Module.to(dtype)` converts only floating-point parameters and buffers,
# whereas `Module.type(dtype)` would convert every buffer regardless of its
# original dtype. The conversion happens in place and returns the same module,
# so re-running this cell is harmless.
small = small.to(torch.bfloat16)

In [16]:
# Load the tokenizer from the same repo id as the donor model; the next cell
# uploads it next to the truncated weights.
tokenizer = AutoTokenizer.from_pretrained('mesolitica/llama-13b-hf-32768-fpf')

tokenizer_config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [17]:
# Upload the tokenizer files to the Hub. The namespace is given as part of the
# repo id because the separate `organization=` keyword is deprecated; the
# target repository is unchanged (huseinzol05/llama-3.2b-hf-32768).
tokenizer.push_to_hub('huseinzol05/llama-3.2b-hf-32768')



tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/huseinzol05/llama-3.2b-hf-32768/commit/de660ceaad3cc6f9613156ca57bae6edc5461384', commit_message='Upload tokenizer', commit_description='', oid='de660ceaad3cc6f9613156ca57bae6edc5461384', pr_url=None, pr_revision=None, pr_num=None)

In [18]:
# Upload the truncated model as safetensors shards. The namespace is given as
# part of the repo id because the separate `organization=` keyword is
# deprecated; the target repository is unchanged
# (huseinzol05/llama-3.2b-hf-32768).
small.push_to_hub('huseinzol05/llama-3.2b-hf-32768', safe_serialization=True)

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.39G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/huseinzol05/llama-3.2b-hf-32768/commit/71e759804927662962bd14fe3fe40dee1fdeb99e', commit_message='Upload LlamaForCausalLM', commit_description='', oid='71e759804927662962bd14fe3fe40dee1fdeb99e', pr_url=None, pr_revision=None, pr_num=None)