In [14]:
!pip install transformers datasets peft



In [109]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader
import numpy as np
from peft import LoraConfig, TaskType, get_peft_model, IA3Config
# 加載數據集
dataset_name = "stanfordnlp/imdb"
imdb_dataset = load_dataset(dataset_name)

# 縮短文本以便於快速訓練
def truncate(example):
    return {
        'text': " ".join(example['text'].split()[:50]),
        'label': example['label']
    }

# 取出部分樣本進行訓練和驗證
small_imdb_dataset = DatasetDict(
    train=imdb_dataset['train'].shuffle(seed=1111).select(range(1024)).map(truncate),
    val=imdb_dataset['train'].shuffle(seed=1111).select(range(1024, 1280)).map(truncate),
)

# 準備數據集
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
small_tokenized_dataset = small_imdb_dataset.map(
    lambda example: tokenizer(example['text'], padding=True, truncation=True),
    batched=True,
    batch_size=16
)

small_tokenized_dataset = small_tokenized_dataset.remove_columns(["text"])
small_tokenized_dataset = small_tokenized_dataset.rename_column("label", "labels")
small_tokenized_dataset.set_format("torch")

train_dataloader = DataLoader(small_tokenized_dataset['train'], batch_size=16)
eval_dataloader = DataLoader(small_tokenized_dataset['val'], batch_size=16)


In [141]:
# 設置LoRA配置並添加到模型中
model_id = "distilbert-base-cased"
model = DistilBertForSequenceClassification.from_pretrained(model_id, num_labels=2)

lora_config1 = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="SEQ_CLS",
    target_modules=["q_lin", "k_lin", "v_lin", "out_lin"]
)

lora_config2 = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.2,
    r=128,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_lin", "k_lin", "v_lin", "out_lin"]
)

lora_config3 = LoraConfig(
    lora_alpha=8,
    lora_dropout=0.15,
    r=32,
    bias="all",
    task_type=TaskType.SEQ_CLS,
    target_modules=["q_lin", "k_lin", "v_lin", "out_lin", "ffn_lin1", "ffn_lin2"]
)

peft_config = lora_config3

model.add_adapter(peft_config, adapter_name="config")
model.enable_adapters()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [142]:
# 訓練參數設置
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    weight_decay=0.01,
)

def compute_metrics(eval_pred):
    """Called at the end of validation. Gives accuracy"""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # calculates the accuracy
    return {"accuracy": np.mean(predictions == labels)}

# 使用Trainer進行訓練
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_tokenized_dataset['train'],
    eval_dataset=small_tokenized_dataset['val'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# 評估模型
results = trainer.evaluate()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.699947,0.480469
2,No log,0.697876,0.480469
3,No log,0.696139,0.480469
4,No log,0.69464,0.480469
5,No log,0.693636,0.480469
6,No log,0.692693,0.480469
7,No log,0.691916,0.480469
8,No log,0.691321,0.488281
9,No log,0.690985,0.492188
10,No log,0.690868,0.492188


In [143]:
# 保存模型
model.save_pretrained("save_model")
tokenizer.save_pretrained("save_tokenizer")

('save_tokenizer\\tokenizer_config.json',
 'save_tokenizer\\special_tokens_map.json',
 'save_tokenizer\\vocab.txt',
 'save_tokenizer\\added_tokens.json',
 'save_tokenizer\\tokenizer.json')

In [144]:
# 測試保存的模型
from transformers import AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained("save_model")
tokenizer = AutoTokenizer.from_pretrained("save_tokenizer")

test_str_pos = "you are excellent"
test_str_neg = "you are a loser"
model_inputs = tokenizer(test_str_pos, return_tensors="pt")
prediction = torch.argmax(model(**model_inputs).logits)
print(["NEGATIVE", "POSITIVE"][prediction])

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NEGATIVE
