In [1]:
import pickle
from accelerate import init_empty_weights

from transformers import LlamaConfig, LlamaForCausalLM
from utils.funs import create_dict
from utils.scores import get_scores

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Results container and the pickle location it is saved to later in the
# notebook (`dir` is the path handed to open() in the save cell).
# NOTE(review): `dir` shadows the Python builtin of the same name; it is kept
# because the save cell refers to it by this name.
# NOTE(review): presumably create_dict reloads 'LLAMA.pkl' when it already
# exists -- confirm in utils.funs.
models, dir = create_dict('language-models', 'LLAMA.pkl')

# Fragments of the parameter names passed to get_scores: a layer prefix and
# the suffixes of the query / key projection weights.
# NOTE(review): presumably get_scores inserts the layer index between the
# prefix and each suffix -- confirm in utils.scores.
path = [
    "model.layers.",
    ".self_attn.q_proj.weight",
    ".self_attn.k_proj.weight",
]

In [None]:
# Score every LLaMA checkpoint listed below and accumulate the results in
# `models`.
#
# Each entry is (model_name, download_model, attn_type):
#   * model_name     -- Hugging Face Hub repository id of the checkpoint.
#   * download_model -- forwarded to get_scores. When False, the model is
#                       built under init_empty_weights(), so no parameter
#                       memory is allocated for the large checkpoints; when
#                       True, the model is instantiated normally.
#                       NOTE(review): presumably get_scores fetches the
#                       pretrained weights itself in both cases -- confirm in
#                       utils.scores.
#   * attn_type      -- forwarded to get_scores unchanged.
#
# Layer count, hidden size and head count differ between checkpoints; they are
# not repeated here because the per-checkpoint `config` loaded below is what
# gets passed to get_scores.
LLAMA_CHECKPOINTS = [
    # LLaMA 2 (base)
    ("meta-llama/Llama-2-7b-hf",       False, "LLAMA"),
    ("meta-llama/Llama-2-13b-hf",      False, "LLAMA"),
    ("meta-llama/Llama-2-70b-hf",      False, 'grouped-attention'),
    # LLaMA 2 (chat)
    ("meta-llama/Llama-2-7b-chat-hf",  False, "LLAMA"),
    ("meta-llama/Llama-2-13b-chat-hf", False, "LLAMA"),
    # LLaMA 3
    ("meta-llama/Meta-Llama-3-8B",     True,  'grouped-attention'),
    ("meta-llama/Meta-Llama-3-70B",    False, 'grouped-attention'),
    # LLaMA 3.1
    ("meta-llama/Llama-3.1-8B",        False, 'grouped-attention'),
    ("meta-llama/Llama-3.1-70B",       False, 'grouped-attention'),
    ("meta-llama/Llama-3.1-405B",      False, 'grouped-attention'),
    # LLaMA 3.2
    ("meta-llama/Llama-3.2-1B",        True,  'grouped-attention'),
    ("meta-llama/Llama-3.2-3B",        False, 'grouped-attention'),
    # LLaMA 3.3 (currently disabled)
    # ("meta-llama/Llama-3.3-70B-Instruct", False, 'grouped-attention'),
]

for model_name, download_model, attn_type in LLAMA_CHECKPOINTS:
    config = LlamaConfig.from_pretrained(model_name)
    if download_model:
        # Regular instantiation: parameters are really allocated.
        model = LlamaForCausalLM(config)
    else:
        # Skeleton only: parameters are created without allocating storage.
        with init_empty_weights():
            model = LlamaForCausalLM(config)
    models = get_scores(models,
                        model_name, model, config,
                        path, download_model = download_model,
                        attn_type = attn_type)

In [12]:
# Save: serialise the accumulated `models` object with pickle to the path
# stored in `dir`, overwriting any existing file (binary write mode).
with open(dir, mode='wb') as pickle_file:
    pickle.dump(models, pickle_file)

In [4]:
# Display the entry stored for the Llama 3 8B checkpoint. As rendered below,
# it is a list whose first element is the LlamaConfig, followed by a NumPy
# array (presumably the scores computed by get_scores -- confirm).
models['meta-llama/Meta-Llama-3-8B']

[LlamaConfig {
   "architectures": [
     "LlamaForCausalLM"
   ],
   "attention_bias": false,
   "attention_dropout": 0.0,
   "bos_token_id": 128000,
   "eos_token_id": 128001,
   "head_dim": 128,
   "hidden_act": "silu",
   "hidden_size": 4096,
   "initializer_range": 0.02,
   "intermediate_size": 14336,
   "max_position_embeddings": 8192,
   "mlp_bias": false,
   "model_type": "llama",
   "num_attention_heads": 32,
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
   "pretraining_tp": 1,
   "rms_norm_eps": 1e-05,
   "rope_scaling": null,
   "rope_theta": 500000.0,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.48.0",
   "use_cache": true,
   "vocab_size": 128256
 },
 array([0.5001514 , 0.50005603, 0.5000791 , 0.50004071, 0.50014716,
        0.50026643, 0.4999637 , 0.50008237, 0.49981049, 0.50026822,
        0.49986517, 0.50003099, 0.50017875, 0.49992201, 0.50004041,
        0.4999899 , 0.50005066, 0.50001794, 0.49991831, 0.5001480