In [1]:
import torch
import datasets

In [2]:
# Relation names used as classification targets. The position of a name in
# this tuple is its integer class id, so both lookup tables are derived from
# the same ordering and cannot drift apart.
_RELATIONS = ('xIntent', 'xReact', 'xAttr', 'xEffect', 'xWant', 'xNeed')

# name -> class id
label2id = {name: idx for idx, name in enumerate(_RELATIONS)}
# class id -> name
id2label = {idx: name for idx, name in enumerate(_RELATIONS)}

In [3]:
from datasets import load_dataset

# Load the "allenai/soda" dataset. The result is indexed by split name in the
# cells below (e.g. ds['train']).
ds = load_dataset("allenai/soda")

In [4]:
# Start from every column of the train split, then take the two columns we
# want to keep out of the drop list. `.remove` raises ValueError if an
# expected column is missing, which surfaces schema changes immediately.
unused = list(ds['train'].features)
for kept_column in ("relation", "narrative"):
    unused.remove(kept_column)

# Drop everything else from all splits at once.
lim_ds = ds.remove_columns(unused)


In [5]:
# Encode the string "relation" column as an integer class label.
# Class names are listed in id order (taken from id2label) so the integers
# produced by the cast agree with the label2id / id2label tables given to the model.
relation_names = [id2label[i] for i in range(len(id2label))]

cast_ds = lim_ds.cast(datasets.Features(
	{
		# ClassLabel derives each integer id from the name's position in `names`;
		# its `id` field is not a list of class indices, so none is passed.
		"relation": datasets.ClassLabel(names=relation_names),
		"narrative": datasets.Value(dtype='string')
	}
))

# Sanity check: the dataset encoding and the model's label map must agree.
assert cast_ds['train'].features['relation'].str2int('xWant') == label2id['xWant']

# Rename to the column names used for model inputs.
# NOTE(review): 'input_ids' still holds the raw narrative strings at this point,
# not token ids — presumably tokenized in a later step; confirm.
ren_ds = cast_ds.rename_columns({'relation': 'labels', 'narrative': 'input_ids'})
ren_ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids'],
        num_rows: 1191582
    })
    validation: Dataset({
        features: ['labels', 'input_ids'],
        num_rows: 146346
    })
    test: Dataset({
        features: ['labels', 'input_ids'],
        num_rows: 148968
    })
})

In [6]:
from transformers import LlamaForCausalLM, AutoTokenizer, LlamaForSequenceClassification

# Single source of truth for the checkpoint, shared by the model and tokenizer
# so the two cannot end up pointing at different repositories.
MODEL_ID = "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf"

# _attn_implementation: flash_attention_2
# Load the checkpoint with a sequence-classification head sized by the label maps.
# The head (`score`) is not part of the checkpoint and is newly initialized.
model = LlamaForSequenceClassification.from_pretrained(
    MODEL_ID,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
    torch_dtype="auto",
    device_map="auto",
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Sequence classification pools the last non-padding token of each row, which
# requires a pad token id once the batch size exceeds 1. Only fill it in when
# the checkpoint did not already define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# Switch to training mode; as the last expression this also displays the module tree.
model.train()

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf and are newly initialized: ['score.codebooks', 'score.codes', 'score.scales']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): QuantizedLinear()
          (k_proj): QuantizedLinear()
          (v_proj): QuantizedLinear()
          (o_proj): QuantizedLinear()
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): QuantizedLinear()
          (up_proj): QuantizedLinear()
          (down_proj): QuantizedLinear()
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (score): QuantizedLinear()
)

In [7]:
from peft import AdaLoraConfig, get_peft_model, IA3Model, IA3Config, LoraConfig

# Attention projection layers of each LlamaDecoderLayer that receive adapters.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]

# Alternative adapter configuration (not applied below): AdaLoRA.
ada_lora_config = AdaLoraConfig(
    r=8,
    init_r=12,
    tinit=200,
    tfinal=1000,
    deltaT=10,
    target_modules=list(attention_projections),
    modules_to_save=["score"],
)

# Adapter configuration actually applied below: plain LoRA on the attention projections.
# NOTE(review): unlike ada_lora_config, this does not list "score" in
# modules_to_save, so only the LoRA matrices are trainable — confirm that
# leaving the classification head frozen is intended.
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=list(attention_projections),
    lora_dropout=0.01,
)

# Alternative adapter configuration (not applied below): IA3.
# Module names must match this model's layers: the Llama blocks expose
# k_proj / v_proj (attention) and down_proj (MLP), not key / value / dense.
# peft_type is set by IA3Config itself, so it is not passed explicitly.
ia3_config = IA3Config(
    target_modules=["k_proj", "v_proj", "down_proj"],
    feedforward_modules=["down_proj"],
)

# Wrap the base model with the LoRA adapters and report how much is trainable.
peft_model = get_peft_model(model, lora_config)
peft_model.print_trainable_parameters()

trainable params: 2,252,800 || all params: 270,651,910 || trainable%: 0.8323606509926348


In [8]:
# Display the wrapped module tree to check which layers received LoRA adapters.
peft_model

PeftModel(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.AqlmLoraLinear(
                (base_layer): QuantizedLinear()
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.01, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.AqlmLoraLinear(
                (base_layer): QuantizedLinear()
                (lora_drop

In [9]:
# Smoke test: a single forward pass with a label, to confirm the adapted model
# produces a loss and one logit per class.
# Use the device the model actually lives on instead of assuming CUDA, since
# the model was placed with device_map="auto".
smoke_device = next(peft_model.parameters()).device

# The tokenizer's __call__ performs tokenization, id conversion and
# special-token insertion in one step and returns a batched LongTensor.
smoke_batch = tokenizer("I want pumpkin pie!!!!", return_tensors="pt").to(smoke_device)

peft_model(
	input_ids=smoke_batch["input_ids"],
	labels=torch.LongTensor([1]).to(smoke_device),  # class id 1 == 'xReact'
)

SequenceClassifierOutputWithPast(loss=tensor(1.7920, device='cuda:0', dtype=torch.float16,
       grad_fn=<NllLossBackward0>), logits=tensor([[0., 0., 0., 0., 0., -0.]], device='cuda:0', dtype=torch.float16,
       grad_fn=<IndexBackward0>), past_key_values=((tensor([[[[ 1.9211e-02, -3.7048e-02,  1.1246e-02,  ..., -1.2733e-02,
            4.2558e-04, -9.5596e-03],
          [ 4.1895e-01,  3.0469e-01,  1.9055e-01,  ...,  8.6609e-02,
            4.5105e-02,  7.9407e-02],
          [ 5.8008e-01,  1.6565e-01,  2.0581e-01,  ..., -8.3160e-03,
            3.1555e-02,  1.5640e-03],
          ...,
          [-1.6504e-01, -1.0706e-01,  6.7444e-03,  ..., -1.5823e-02,
            3.3173e-02, -5.1422e-03],
          [-1.7212e-02, -1.2427e-01,  4.0649e-02,  ...,  3.5461e-02,
            6.1401e-02,  3.2104e-02],
          [ 3.3057e-01,  1.3086e-01, -1.5552e-01,  ...,  3.5492e-02,
            6.1401e-02,  3.2104e-02]],

         [[-2.0203e-01, -1.9006e-01, -2.8854e-02,  ..., -2.7069e-02,
            