In [None]:
"""peft_ln_tuning_clm.py"""

import os

import mindspore

from mindnlp.dataset import load_dataset
from mindnlp.engine import Trainer, TrainingArguments
from mindnlp.peft import LNTuningConfig, PeftModel, TaskType, get_peft_model
from mindnlp.transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Route Hugging Face hub downloads through the hf-mirror.com mirror.
# NOTE(review): this runs after `mindnlp` has already been imported — confirm the
# library reads HF_ENDPOINT lazily, otherwise the assignment comes too late.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
# Optional local proxy; uncomment when downloads have to go through it.
# os.environ["HTTP_PROXY"] = "http://127.0.0.1:7890"
# os.environ["HTTPS_PROXY"] = "http://127.0.0.1:7890"

# Select the GPU backend for MindSpore.
mindspore.set_context(device_target="GPU")

In [None]:
# --- Base checkpoint and data source -----------------------------------------
model_name = "bigscience/bloomz-560m"
dataset_name, field = "ought/raft", "twitter_complaints"  # `field` is passed as the 2nd load_dataset argument

# --- Dataset columns and class names -----------------------------------------
text_column = "Tweet text"
label_column = "text_label"
# Presumably indexed by the integer value of the dataset's "Label" column — verify.
classes = ["Unlabeled", "complaint", "no complaint"]

# --- Tokenization and optimisation settings ----------------------------------
max_length = 64  # every tokenizer call pads/truncates to this many tokens
batch_size = 12  # used for both the train and the eval batch size
num_epochs = 8
lr = 0.01

In [None]:
# Tokenizer that belongs to the base checkpoint; reused for preprocessing,
# for building the generation prompt and for decoding the generated ids.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the configured subset of the dataset.
# Columns consumed by the preprocessing step: 'Tweet text', 'ID', 'Label'.
dataset = load_dataset(dataset_name, field)

# LN Tuning adapter configuration for a causal language modelling task.
peft_config = LNTuningConfig(task_type=TaskType.CAUSAL_LM)

In [None]:
def preprocess_fn(texts, ids, labels):
    """Tokenize one dataset row into model inputs and target token ids.

    Args:
        texts: Value of the tweet-text column; interpolated into the prompt.
        ids: Value of the ID column. Accepted because it is one of the mapped
            input columns, but not used.
        labels: Value of the label column; interpolated into the prompt and,
            converted with ``str()``, tokenized as the target.

    Returns:
        A 3-tuple ``(input_ids, attention_mask, target_input_ids)``. Both
        tokenizer calls pad and truncate to ``max_length``.
    """
    # NOTE(review): the prompt contains the label value itself — confirm that
    # exposing the target inside the model input is intended.
    prompt = f"{text_column} : {texts} Label : {labels} :"

    # Length handling shared by the prompt and the target encoding.
    fixed_length = dict(max_length=max_length, padding="max_length", truncation=True)

    encoded_prompt = tokenizer(prompt, **fixed_length)
    # The target is encoded without special tokens.
    # NOTE(review): padding ids in the target are returned as-is (not masked) —
    # verify this is what the loss computation expects.
    encoded_target = tokenizer(str(labels), add_special_tokens=False, **fixed_length)

    return (
        encoded_prompt["input_ids"],
        encoded_prompt["attention_mask"],
        encoded_target["input_ids"],
    )


# Raw dataset columns handed to preprocess_fn (in this order) and the names
# given to the three values it returns. Defined once so the train and test
# splits cannot drift apart.
# The second output column must be spelled "attention_mask" — the same key the
# tokenizer emits and the keyword the model accepts. The earlier misspelling
# "attenion_mask" produced a column that matches no model input.
map_input_columns = ["Tweet text", "ID", "Label"]
map_output_columns = ["input_ids", "attention_mask", "labels"]

# Tokenized training split.
train_set = dataset["train"].map(
    preprocess_fn,
    input_columns=map_input_columns,
    output_columns=map_output_columns,
)
# Tokenized test split (used as the evaluation set by the trainer).
test_set = dataset["test"].map(
    preprocess_fn,
    input_columns=map_input_columns,
    output_columns=map_output_columns,
)

In [None]:
# Load the pretrained base model, wrap it with the LN Tuning adapter and report
# how many parameters remain trainable.
base_model = AutoModelForCausalLM.from_pretrained(model_name)
model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()

# Evaluation and checkpoint saving are both configured to run once per epoch;
# trainer outputs go to ./output.
training_args = TrainingArguments(
    output_dir="./output",
    learning_rate=lr,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
)
trainer.train()

# Save the tuned model under a directory named after the base model, the PEFT
# type and the task type. `model_name` contains a "/", so the path is nested.
adapter_dir = f"{model_name}_{peft_config.peft_type}_{peft_config.task_type}"
model.save_pretrained(adapter_dir)

In [None]:
# The name is built with a ".ckpt" suffix that is stripped again before loading,
# leaving the same "<model>_<peft type>_<task type>" string that was passed to
# save_pretrained after training.
ckpt = f"{model_name}_{peft_config.peft_type}_{peft_config.task_type}.ckpt"
saved_dir = ckpt[: -len(".ckpt")]

# Fresh base model with the saved adapter loaded on top.
new_model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(model_name),
    saved_dir,
)

# NOTE(review): the raw tweet is used as the prompt here, whereas training
# wrapped the text in a "Tweet text : ... Label : ..." template — confirm.
text = "@greateranglia Ok thanks..."
out = tokenizer(
    text,
    return_tensors="ms",
    padding="max_length",
    truncation=True,
    max_length=max_length,
)
input_ids, attention_mask = out["input_ids"], out["attention_mask"]

# NOTE(review): eos_token_id=3 is a hard-coded token id — verify it is the
# intended stop token for this tokenizer.
generation_kwargs = {"max_new_tokens": 10, "eos_token_id": 3}
outputs = new_model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    **generation_kwargs,
)

# Show the raw generated ids, then the decoded text without special tokens.
print(outputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))