In [1]:
import torch
import re

def load_ensemble(filepaths, map_location=None):
    """Load the Bayesian-layer parameters from each checkpoint in ``filepaths``.

    For every checkpoint this reads ``['state_dict']['bayesian_layer'].params``,
    drops entries whose tensor has no elements, and strips every
    ``model.layers.<N>.`` prefix from the remaining keys.

    Note: because the layer index is removed, two keys that differ only in
    their layer index collapse to the same name and the later one wins.

    Args:
        filepaths: Iterable of checkpoint paths accepted by ``torch.load``.
        map_location: Forwarded to ``torch.load``. The default ``None`` keeps
            each tensor on the device it was saved from; pass ``"cpu"`` to
            load the ensemble on a machine without that device.

    Returns:
        A list with one ``{name: tensor}`` dict per checkpoint, in input order.
    """
    # Compile once; the same pattern is applied to every key of every checkpoint.
    layer_prefix = re.compile(r'model\.layers\.\d+\.')

    def load_from_checkpoint(filepath):
        # NOTE(review): torch.load unpickles the file, so only point this at
        # checkpoints from a trusted source.
        checkpoint = torch.load(filepath, map_location=map_location)
        layer_params = checkpoint['state_dict']['bayesian_layer'].params
        return {
            layer_prefix.sub('', name): tensor
            for name, tensor in layer_params.items()
            if tensor.numel() > 0  # skip empty placeholder tensors
        }

    return [load_from_checkpoint(filepath) for filepath in filepaths]

# Load the ensemble members from the epoch-0 and epoch-9 checkpoints under logs/bayes.
parameters = load_ensemble(
    [
        'logs/bayes/checkpoints/epoch=0-step=300.ckpt',
        'logs/bayes/checkpoints/epoch=9-step=3000.ckpt',
    ]
)

In [2]:
# Echo the loaded ensemble: a list holding one {name: tensor} dict per checkpoint.
# NOTE(review): this prints every tensor in full; showing only names and shapes
# would keep the notebook output readable.
parameters

[{'self_attn.q_proj.weight': tensor([[ 8.3618e-03, -3.4790e-03,  7.8735e-03,  ..., -1.0548e-03,
           -1.6842e-03, -1.7662e-03],
          [ 3.4180e-03, -1.7166e-03,  1.3428e-02,  ..., -1.5991e-02,
           -3.9062e-03,  1.0443e-03],
          [ 7.3395e-03,  1.2207e-02, -1.0376e-02,  ...,  8.3745e-05,
           -2.6733e-02,  3.2501e-03],
          ...,
          [-6.5498e-03,  1.6846e-02,  1.4221e-02,  ..., -1.0132e-02,
           -1.6602e-02,  2.4048e-02],
          [-1.7944e-02, -2.4902e-02,  1.0803e-02,  ..., -3.4424e-02,
            2.3766e-03, -2.2217e-02],
          [-3.6640e-03, -1.7090e-02,  1.1414e-02,  ...,  2.1152e-03,
           -5.5552e-04, -5.3406e-04]], device='cuda:0', dtype=torch.float16,
         requires_grad=True),
  'self_attn.k_proj.weight': tensor([[ 0.0126, -0.0272,  0.0303,  ...,  0.0481, -0.0056,  0.0067],
          [-0.0266, -0.0155, -0.0435,  ..., -0.0216, -0.0515,  0.0091],
          [ 0.0107,  0.0065,  0.0159,  ..., -0.0048, -0.0022, -0.0027],
    

In [3]:
%load_ext autoreload
%autoreload 2

from llama3.modules.bayesllama_copy import BayesLlamaForCausalLM

# Build the model from the "Meta-Llama-3-8B" weights and move it to the GPU.
bayes = BayesLlamaForCausalLM.from_pretrained("Meta-Llama-3-8B").to("cuda")
# NOTE(review): the load log lists the `model.bayesian_layers.*` weights as newly
# initialized; presumably this call overwrites them with the ensemble parameters
# loaded from the checkpoints — confirm in `load_bayesian_layers`.
bayes.load_bayesian_layers(parameters)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.10it/s]
Some weights of BayesLlamaForCausalLM were not initialized from the model checkpoint at Meta-Llama-3-8B and are newly initialized: ['model.bayesian_layers.0.input_layernorm.weight', 'model.bayesian_layers.0.mlp.down_proj.weight', 'model.bayesian_layers.0.mlp.gate_proj.weight', 'model.bayesian_layers.0.mlp.up_proj.weight', 'model.bayesian_layers.0.post_attention_layernorm.weight', 'model.bayesian_layers.0.self_attn.k_proj.weight', 'model.bayesian_layers.0.self_attn.o_proj.weight', 'model.bayesian_layers.0.self_attn.q_proj.weight', 'model.bayesian_layers.0.self_attn.v_proj.weight', 'model.bayesian_layers.1.input_layernorm.weight', 'model.bayesian_layers.1.mlp.down_proj.weight', 'model.bayesian_layers.1.mlp.gate_proj.weight', 'model.bayesian_layers.1.mlp.up_proj.weight', 'model.bayesian_layers.1.post_attention_layernorm.weight', 'model.bayesian_layers.1.self_attn.k_pr

In [4]:
from transformers import AutoTokenizer
# Tokenizer loaded from the same "Meta-Llama-3-8B" identifier as the model.
tokenizer = AutoTokenizer.from_pretrained("Meta-Llama-3-8B")

# Keep the raw text under its own name so that `inputs` only ever refers to the
# token-id tensor (previously the name was bound to a str, then to a tensor).
prompt = "This is a test"
inputs = tokenizer(prompt, return_tensors="pt").input_ids

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Run a forward pass of the model on the tokenized prompt, moved to the GPU first.
# `return_dict=True` presumably yields a ModelOutput-style object (Hugging Face
# convention) rather than a tuple — confirm against BayesLlamaForCausalLM.forward.
# NOTE(review): this runs with autograd enabled; if `out` is only inspected and
# never back-propagated, wrapping the call in `torch.no_grad()` would save memory.
out = bayes(inputs.to("cuda"), return_dict=True)