In [1]:
!pip install -q huggingface_hub
import torch
import os
from huggingface_hub import snapshot_download

version = 15 # 15, 42, or 110
output_folder = f"llama2.c-stories{version}M"

snapshot_download(
		repo_id="meta-llama/Llama-2-7b-chat-hf",
		allow_patterns=["*.json", "tokenizer.model"],
		ignore_patterns=["model.safetensors.index.json", 'pytorch_model.bin.index.json'],
		local_dir=output_folder,
		local_dir_use_symlinks=False
)
!ls $output_folder

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading (…)d7450235/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

config.json             special_tokens_map.json tokenizer.model
generation_config.json  tokenizer.json          tokenizer_config.json


In [2]:
data = torch.load(f"stories{version}M.pt")

if data['config'].get('vocab_source', 'llama2') != 'llama2':
	raise NotImplementedError("No custom vocab support yet")

renamed_data = {}
for k, v in data["model"].items():
    k = k.replace("layers", "model.layers")
    k = k.replace(".feed_forward.w1", ".mlp.gate_proj")
    k = k.replace(".feed_forward.w2", ".mlp.down_proj")
    k = k.replace(".feed_forward.w3", ".mlp.up_proj")
    k = k.replace(".attention.wq", ".self_attn.q_proj")
    k = k.replace(".attention.wk", ".self_attn.k_proj")
    k = k.replace(".attention.wv", ".self_attn.v_proj")
    k = k.replace(".attention.wo", ".self_attn.o_proj")
    k = k.replace("norm.weight", "model.norm.weight")
    k = k.replace("output.weight", "lm_head.weight")
    k = k.replace("ffn_model.norm", "post_attention_layernorm")
    k = k.replace("attention_model.norm", "input_layernorm")
    k = k.replace("tok_embeddings.weight", "model.embed_tokens.weight")
    renamed_data[k] = v

list(renamed_data.keys())[:5]

['model.embed_tokens.weight',
 'model.layers.0.self_attn.q_proj.weight',
 'model.layers.0.self_attn.k_proj.weight',
 'model.layers.0.self_attn.v_proj.weight',
 'model.layers.0.self_attn.o_proj.weight']

In [3]:
os.makedirs(output_folder, exist_ok=True)
torch.save(renamed_data, f"{output_folder}/pytorch_model.bin")

In [4]:
from model import ModelArgs
from dataclasses import asdict

args = ModelArgs(**data['model_args'])
args = asdict(args)
args

{'dim': 288,
 'n_layers': 6,
 'n_heads': 6,
 'n_kv_heads': 6,
 'vocab_size': 32000,
 'hidden_dim': 768,
 'multiple_of': 32,
 'norm_eps': 1e-05,
 'max_seq_len': 256,
 'dropout': 0.0}

In [5]:
args_map = {
    'dim': 'hidden_size',
    'n_heads': 'num_attention_heads',
    'n_layers': 'num_hidden_layers',
    'max_seq_len': 'max_position_embeddings',
    'vocab_size': 'vocab_size',
    'hidden_dim': 'intermediate_size',
    'n_kv_heads': 'num_key_value_heads',
    'norm_eps': 'rms_norm_eps',
}

renamed_args = { 'tie_word_embeddings': True }
for k,v in args_map.items():
    renamed_args[v] = args.get(k)

renamed_args

{'tie_word_embeddings': True,
 'hidden_size': 288,
 'num_attention_heads': 6,
 'num_hidden_layers': 6,
 'max_position_embeddings': 256,
 'vocab_size': 32000,
 'intermediate_size': 768,
 'num_key_value_heads': 6,
 'rms_norm_eps': 1e-05}

In [6]:
!pip install -q transformers
from transformers import LlamaConfig

config = LlamaConfig(**renamed_args)
config.save_pretrained(output_folder)
config

LlamaConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 288,
  "initializer_range": 0.02,
  "intermediate_size": 768,
  "max_position_embeddings": 256,
  "model_type": "llama",
  "num_attention_heads": 6,
  "num_hidden_layers": 6,
  "num_key_value_heads": 6,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "tie_word_embeddings": true,
  "transformers_version": "4.32.1",
  "use_cache": true,
  "vocab_size": 32000
}

In [7]:
import json

tokenizer_config = {
  "bos_token": {
    "__type": "AddedToken",
    "content": "<s>",
    "lstrip": False,
    "normalized": True,
    "rstrip": False,
    "single_word": False
  },
  "clean_up_tokenization_spaces": False,
  "eos_token": {
    "__type": "AddedToken",
    "content": "</s>",
    "lstrip": False,
    "normalized": True,
    "rstrip": False,
    "single_word": False
  },
  "model_max_length": args['max_seq_len'],
  "pad_token": None,
  "sp_model_kwargs": {},
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": {
    "__type": "AddedToken",
    "content": "<unk>",
    "lstrip": False,
    "normalized": True,
    "rstrip": False,
    "single_word": False
  },
  "use_default_system_prompt": True
}

with open(f"{output_folder}/tokenizer_config.json", 'w') as f:
		json.dump(tokenizer_config, f, indent=2)


In [8]:
gen_config_path = f"{output_folder}/generation_config.json"

# Assert `generation_config.json` in folder
assert os.path.exists(gen_config_path)

# Overwrite`max_length` to 256
with open(gen_config_path, "r") as f:
		config = json.load(f)
		config["max_length"] = 256
		config['temperature'] = 0.0001
		config['top_p'] = 1
with open(gen_config_path, "w") as f:
		json.dump(config, f, indent=2)

In [9]:
!cp tokenizer.model $output_folder/tokenizer.model

In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer

output_folder = 'Xenova/llama2.c-stories15M'

tokenizer = AutoTokenizer.from_pretrained(output_folder) 
model = AutoModelForCausalLM.from_pretrained(output_folder)

inputs = tokenizer.encode("Lily was", return_tensors="pt")
outputs = model.generate(inputs, max_length=256)
print(tokenizer.decode(outputs[0]))
# <s> Once upon a time, there was a big bear named Benny was a very big bear.

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> Lily was a squir toy of a girl who liked toy. She was a girl. She was her name her favorite friend. She was a boy. She was three-ar. She had a bit. She was aunt, too. She was called Lily, butter. She was a little girl who was three-d-die was three.
One day, and she was Lily. She was called her mommy. She was called her age.
One day, and she was called her first. She was called Lily'dusted. She was a bitter. She was a little girl.
One day. She was a bit-dies. She was a little Lily. She was three years old.
One day, and she was a bit- a bit-Mine and she was a bit. She was three years old. She was a bit. She was a bit-Lola.
One day, and she was a bitter. She was a bit, three-eyed upstie. She was a bitter. She was a bit-f, she was a little girl.
One day, and she was a bit. She was a bit.
