In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch.nn as nn
import numpy as np
from LoraHelpers import LoraModule, convert_model_to_lora_model, change_lora_alpha

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
tokenizer = AutoTokenizer.from_pretrained("llama-3.2-1b-inst")
model = AutoModelForCausalLM.from_pretrained("llama-3.2-1b-inst")

In [41]:
s = "hello, hello!\n\nI"
ts = torch.tensor(tokenizer([s])["input_ids"])
model.forward(ts).logits[0]

tensor([[ 2.8333,  3.5809,  7.0268,  ..., -1.2470, -1.2469, -1.2468],
        [14.2424,  4.3746,  4.5395,  ..., -1.9085, -1.9087, -1.9092],
        [ 8.1138, -0.6880,  0.9197,  ..., -0.8815, -0.8822, -0.8828],
        [18.7669,  7.0025,  4.4708,  ..., -0.8601, -0.8603, -0.8607],
        [ 6.7835, 11.4848,  9.7997,  ..., -1.0438, -1.0439, -1.0437],
        [ 8.8547, 10.7931,  5.9058,  ..., -0.3585, -0.3590, -0.3580]],
       grad_fn=<SelectBackward0>)

In [22]:
tokenizer("hello"+tokenizer.eos_token, tokenizer.eos_token)

{'input_ids': [128000, 15339, 128009, 128000, 128009], 'attention_mask': [1, 1, 1, 1, 1]}

In [42]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [43]:
convert_model_to_lora_model(model)

In [44]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): LoraModule(
            (original_module): Linear(in_features=2048, out_features=2048, bias=False)
            (lora_module): Sequential(
              (0): Linear(in_features=2048, out_features=8, bias=False)
              (1): Linear(in_features=8, out_features=2048, bias=False)
            )
          )
          (k_proj): LoraModule(
            (original_module): Linear(in_features=2048, out_features=512, bias=False)
            (lora_module): Sequential(
              (0): Linear(in_features=2048, out_features=8, bias=False)
              (1): Linear(in_features=8, out_features=512, bias=False)
            )
          )
          (v_proj): LoraModule(
            (original_module): Linear(in_features=2048, out_features=512, bias=False)
            (lora_module): Sequential(

In [6]:
model.model.layers[0].self_attn.q_proj.alpha

1.0

In [11]:
change_lora_alpha(model, 16.)

In [12]:
model.model.layers[0].self_attn.q_proj.alpha

16.0

In [6]:
for name, module in model.named_parameters():
    module.requires_grad = False
    if module.ndim==1:
        module.data = module.data.to(torch.float32)
        continue
    names = name.split('.')[:-1]
    module_pointer = model
    module_pointer_parent = None
    for layer in names:
        module_pointer_parent = module_pointer
        module_pointer = getattr(module_pointer, layer)
    if type(module_pointer)==nn.modules.linear.Linear:
        lora_module = LoraModule(module_pointer)
        setattr(module_pointer_parent, names[-1], lora_module)
        

    
    # print(type(module_pointer), name)

False

In [45]:
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)

model.layers.0.self_attn.q_proj.lora_module.0.weight
model.layers.0.self_attn.q_proj.lora_module.1.weight
model.layers.0.self_attn.k_proj.lora_module.0.weight
model.layers.0.self_attn.k_proj.lora_module.1.weight
model.layers.0.self_attn.v_proj.lora_module.0.weight
model.layers.0.self_attn.v_proj.lora_module.1.weight
model.layers.0.self_attn.o_proj.lora_module.0.weight
model.layers.0.self_attn.o_proj.lora_module.1.weight
model.layers.0.mlp.gate_proj.lora_module.0.weight
model.layers.0.mlp.gate_proj.lora_module.1.weight
model.layers.0.mlp.up_proj.lora_module.0.weight
model.layers.0.mlp.up_proj.lora_module.1.weight
model.layers.0.mlp.down_proj.lora_module.0.weight
model.layers.0.mlp.down_proj.lora_module.1.weight
model.layers.1.self_attn.q_proj.lora_module.0.weight
model.layers.1.self_attn.q_proj.lora_module.1.weight
model.layers.1.self_attn.k_proj.lora_module.0.weight
model.layers.1.self_attn.k_proj.lora_module.1.weight
model.layers.1.self_attn.v_proj.lora_module.0.weight
model.layers.1.

In [14]:
tokenizer("Hello")

{'input_ids': [128000, 9906], 'attention_mask': [1, 1]}

In [13]:
a = model.generate(torch.tensor(tokenizer("Hello")["input_ids"]).unsqueeze(0))[0]
"".join([tokenizer.decode(token) for token in a])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


'<|begin_of_text|>Helloendetuptordoustopl RTS+="usto sync FocusSuffixaby evacuate afterwards afterward surely Caryucks'

In [14]:
change_lora_alpha(model, 0.)

In [15]:
a = model.generate(torch.tensor(tokenizer("Hello")["input_ids"]).unsqueeze(0))[0]
"".join([tokenizer.decode(token) for token in a])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


"<|begin_of_text|>Hello, I'm excited to be a part of this community. I'm looking for a new"