In [64]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np


In [65]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-0.6B"

id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
    id2label=id2label,
    label2id=label2id
)

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# 如果tokenizer没有pad_token，设置tokenizer的pad_token为`[pad]`
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # 调整模型词向量层的大小,使其匹配tokenizer的词汇表大小。
    # 当我们自定义添加了特殊词汇(如本例中的pad_token)到tokenizer后,这些词还没有对应的词向量。
    # 为了让模型可以正确处理这些词汇,需要扩大模型原始的词向量层,使其大小匹配tokenizer词汇表的大小。
    # resize_token_embeddings就是用来调整词向量层大小的方法。len(tokenizer)可以获取tokenizer的词汇表大小。
    model.resize_token_embeddings(len(tokenizer))


In [None]:
dataset = load_dataset("glue", "sst2")


Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 271634.92 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 278546.93 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 305754.09 examples/s]


In [79]:
def tokenize_function(examples):
    # 提取文本
    text = examples["sentence"]

    # 从左边截取，最大长度512
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )
    
    return tokenized_inputs


In [80]:
tokenized_dataset = dataset.map(tokenize_function, batched=True, batch_size=16)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]


KeyError: 'sentence'

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}


In [None]:
text_list = ["a feel-good picture in the best sense of the term .",  # positive
             "resourceful and ingenious entertainment .",   # positive
             "it 's just incredibly dull .",   # negative
             "the movie 's biggest offense is its complete and utter lack of tension .",  # negative
             "impresses you with its open-endedness and surprises .",   # positive
             "unless you are in dire need of a diesel fix , there is no real reason to see it ."  # negative
]
print("Untrained model generations:")
print("----------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to(model.device)
    output = model.generate(inputs, max_new_tokens=10)
    generated = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"Input: {text}")
    print(f"Generated: {generated}")

Untrained model generations:
----------------------------
Input: a feel-good picture in the best sense of the term .
Generated: a feel-good picture in the best sense of the term . To describe a feeling that is positive and uplifting,
Input: resourceful and ingenious entertainment .
Generated: resourceful and ingenious entertainment . . . . . . . . . . .
Input: it 's just incredibly dull .
Generated: it 's just incredibly dull .', 'the problem is that i can't find
Input: the movie 's biggest offense is its complete and utter lack of tension .
Generated: the movie 's biggest offense is its complete and utter lack of tension . ' 

This is a movie that has a complete
Input: impresses you with its open-endedness and surprises .
Generated: impresses you with its open-endedness and surprises . . . . . . . . . . .
Input: unless you are in dire need of a diesel fix , there is no real reason to see it .
Generated: unless you are in dire need of a diesel fix , there is no real reason to see it . 

In [None]:
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=['q_proj', 'v_proj']
)


In [None]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


trainable params: 573,440 || all params: 596,623,360 || trainable%: 0.0961


In [None]:
lr = 1e-3
batch_size = 16
num_epochs = 5


In [None]:
training_args = TrainingArguments(
    output_dir= './checkpoints/automodel_finetune' + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to=[]  # Disable wandb and other reporting integrations
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()

ValueError: Expected input batch_size (368) to match target batch_size (16).