In [1]:
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import mindspore
from tqdm import tqdm
from mindnlp.transformers import AutoModelForSeq2SeqLM,AutoTokenizer
import numpy as np

from mindnlp.peft import AdaLoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model
from mindspore.dataset import GeneratorDataset
from mindnlp.common.optimization import get_linear_schedule_with_warmup
from mindnlp.core import value_and_grad
# Remove an inherited RANK_TABLE_FILE setting, if any.
# `pop` with a default is a no-op when the variable is absent. The previous code checked
# "RANK_TABLE_FILE" but deleted the misspelled "RANL_TABLE_FILE", which raised KeyError
# whenever the variable was actually set.
# NOTE(review): presumably this keeps the run from being treated as a multi-device job — confirm.
os.environ.pop("RANK_TABLE_FILE", None)


os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Base checkpoint that is loaded and then wrapped with the PEFT adapter.
model_name_or_path = "facebook/bart-base"
# NOTE(review): the three names below are not referenced elsewhere in the visible notebook
# (the column names are hard-coded further down) — verify before relying on them.
checkpoint_name = "financial_sentiment_analysis_lora_v1.pt"
text_column = "sentence"
label_column = "text_label"

# Training hyperparameters.
max_length = 128   # padded/truncated length of the tokenized input sentence
lr = 1e-3          # learning rate passed to the optimizer
num_epochs = 8     # number of passes over the training set
batch_size = 8     # samples per batch for both training and evaluation


  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  from .autonotebook import tqdm as notebook_tqdm
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.297 seconds.
Prefix dict has been built successfully.


In [2]:
# AdaLoRA adapter configuration (the model itself is built in the next cell).
# NOTE(review): the grouping comments follow the PEFT AdaLoRA documentation — confirm
# against the installed mindnlp version.
peft_config = AdaLoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    # Rank budget: initial rank and target rank.
    init_r=12,
    target_r=8,
    # Budget schedule, expressed in training steps.
    tinit=200,
    tfinal=1000,
    deltaT=10,
    # Smoothing coefficients used by the importance estimate.
    beta1=0.85,
    beta2=0.85,
    # Standard LoRA scaling and dropout.
    lora_alpha=32,
    lora_dropout=0.1,
)

In [3]:
# Load the pretrained seq2seq checkpoint and wrap it with the AdaLoRA adapter in one step,
# then report how many parameters remain trainable.
model = get_peft_model(
    AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path),
    peft_config,
)
model.print_trainable_parameters()

BartForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`.`PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


[MS_ALLOC_CONF]Runtime config:  enable_vmm:True  vmm_align_size:2MB
trainable params: 2,434,176 || all params: 141,854,688 || trainable%: 1.715964438200308


In [4]:
# download dataset
from mindnlp.dataset import load_dataset

In [5]:
# Load the dataset
from mindnlp.transformers import AutoTokenizer
# Fix the MindSpore dataset seed so dataset-side randomness is repeatable between runs.
mindspore.dataset.config.set_seed(123)
dataset = load_dataset("financial_phrasebank", "sentences_allagree")
# Class names indexed by the integer label; displayed below as the cell output.
classes = dataset.source.ds.features["label"].names
classes

['negative', 'neutral', 'positive']

In [6]:
# Shuffle the dataset (shuffle argument 64), then split it 90% / 10% into training and validation subsets.
train_dataset, validation_dataset = dataset.shuffle(64).split([0.9, 0.1])



In [7]:
def add_text_label(sentence, label):
    """Return the row unchanged plus the class name that corresponds to `label`.

    Args:
        sentence: the sentence column value, passed through untouched.
        label: the integer class label; `.item()` is called on it to index `classes`.

    Returns:
        A `(sentence, label, text_label)` tuple, where `text_label` is the entry of the
        module-level `classes` list at position `label`.
    """
    label_name = classes[label.item()]
    return sentence, label, label_name


# Add the human-readable `text_label` column to both splits.
train_dataset = train_dataset.map(
    add_text_label,
    ['sentence', 'label'],
    ['sentence', 'label', 'text_label'],
)
validation_dataset = validation_dataset.map(
    add_text_label,
    ['sentence', 'label'],
    ['sentence', 'label', 'text_label'],
)

In [8]:
# Peek at one sample to check that the `text_label` column was added.
next(train_dataset.create_dict_iterator())

{'sentence': Tensor(shape=[], dtype=String, value= 'The gross area of the Innova 2 project will be about 10,000 sq m ( 107,600 sq ft ) .'),
 'label': Tensor(shape=[], dtype=Int64, value= 1),
 'text_label': Tensor(shape=[], dtype=String, value= 'neutral')}

In [9]:
# Tokenization: load the tokenizer that belongs to the base model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)



In [10]:
import numpy as np
from mindnlp.dataset import BaseMapFunction
from threading import Lock
lock = Lock()

class MapFunc(BaseMapFunction):
    """Per-sample map function that turns (sentence, label, text_label) into model inputs.

    Relies on the module-level `tokenizer`, `lock` and `max_length`.
    """

    def __call__(self, sentence, label, text_label):
        """Tokenize one sample.

        Args:
            sentence: input text, padded/truncated to `max_length` tokens.
            label: integer class label; accepted to match the input columns but not used.
            text_label: class name, tokenized to a fixed length of 3 as the target sequence.

        Returns:
            `(input_ids, attention_mask, labels)`, where padding positions in `labels`
            are replaced by -100.
        """
        # `with` releases the lock even if tokenization raises; the previous explicit
        # acquire()/release() pair left the lock held on error, blocking every later call.
        # NOTE(review): the lock presumably serializes tokenizer use across dataset worker
        # threads — confirm it is still required.
        with lock:
            model_inputs = tokenizer(sentence, max_length=max_length, padding="max_length", truncation=True)
            labels = tokenizer(text_label, max_length=3, padding="max_length", truncation=True)
        labels = labels['input_ids']
        # Replace pad token ids with -100 (presumably so the loss skips them — verify).
        labels = np.where(np.equal(labels, tokenizer.pad_token_id), -100, labels)
        return model_inputs['input_ids'], model_inputs['attention_mask'], labels


def get_dataset(dataset, tokenizer, shuffle=True):
    """Tokenize a dataset with `MapFunc`, optionally shuffle it, and batch it.

    Args:
        dataset: dataset providing the 'sentence', 'label' and 'text_label' columns.
        tokenizer: accepted for call-site compatibility; the body does not use it
            (`MapFunc` reads the module-level `tokenizer`).
        shuffle: when true, shuffle (argument 64) before batching.

    Returns:
        The dataset with columns 'input_ids', 'attention_mask' and 'labels',
        batched with the module-level `batch_size`.
    """
    source_columns = ['sentence', 'label', 'text_label']
    target_columns = ['input_ids', 'attention_mask', 'labels']

    tokenize = MapFunc(source_columns, target_columns)
    processed = dataset.map(tokenize, source_columns, target_columns)
    if shuffle:
        processed = processed.shuffle(64)
    return processed.batch(batch_size)

# Tokenize and batch both splits; only the training split is shuffled.
# NOTE(review): `train_dataset` is rebound to its tokenized form here, so re-running this
# cell on its own would pass already-tokenized columns back into `get_dataset` — re-run
# the earlier cells first.
train_dataset = get_dataset(train_dataset, tokenizer)
eval_dataset = get_dataset(validation_dataset, tokenizer, shuffle=False)

In [11]:
# Peek at one tokenized batch to check the column names and shapes.
next(train_dataset.create_dict_iterator())

{'input_ids': Tensor(shape=[8, 128], dtype=Int64, value=
 [[    0,   133,  4200 ...     1,     1,     1],
  [    0, 20839,    42 ...     1,     1,     1],
  [    0,   133,  2771 ...     1,     1,     1],
  ...
  [    0,   487, 17202 ...     1,     1,     1],
  [    0, 37591,  1633 ...     1,     1,     1],
  [    0,   133,  4939 ...     1,     1,     1]]),
 'attention_mask': Tensor(shape=[8, 128], dtype=Int64, value=
 [[1, 1, 1 ... 0, 0, 0],
  [1, 1, 1 ... 0, 0, 0],
  [1, 1, 1 ... 0, 0, 0],
  ...
  [1, 1, 1 ... 0, 0, 0],
  [1, 1, 1 ... 0, 0, 0],
  [1, 1, 1 ... 0, 0, 0]]),
 'labels': Tensor(shape=[8, 3], dtype=Int64, value=
 [[    0, 12516,     2],
  [    0, 12516,     2],
  [    0, 12516,     2],
  ...
  [    0, 12516,     2],
  [    0, 22173,     2],
  [    0, 12516,     2]])}

In [12]:
from mindnlp.core import optim

# AdamW over the model's trainable parameters only.
optimizer = optim.AdamW(model.trainable_params(), lr=lr)

# Linear learning-rate schedule without warmup, spanning every optimizer step of the run
# (batches per epoch times number of epochs).
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataset) * num_epochs),
)

In [13]:
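# NOTE(review): AdaLoRA's rank-budget schedule presumably needs the total number of
# training steps. The assignment below is disabled, as is the `update_and_allocate` call
# in the training loop — confirm whether both should be enabled together.
# model.base_model.peft_config["default"].total_step = len(train_dataset) * num_epochs
# model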

In [14]:
def forward_fn(**batch):
    """Run the model on one batch and return its loss.

    Args:
        **batch: keyword arguments forwarded unchanged to `model(...)`.

    Returns:
        The `loss` attribute of the model output.
    """
    return model(**batch).loss

# Differentiate `forward_fn` with respect to the trainable parameters.
# With has_aux=False the forward function returns only the loss.
# NOTE(review): attach_grads=True presumably stores the gradients on the parameters so that
# `optimizer.step()` can consume them — confirm against mindnlp.core.value_and_grad.
grad_fn = value_and_grad(
    forward_fn,
    model.trainable_params(),
    has_aux=False,
    attach_grads=True,
)

In [15]:
from mindspore import ops

global_step = 0
for epoch in range(num_epochs):
    # ---- Training pass ----
    model.set_train(True)
    total_loss = 0
    train_total_size = train_dataset.get_dataset_size()
    train_batches = tqdm(train_dataset.create_dict_iterator(), total=train_total_size)
    for batch in train_batches:
        optimizer.zero_grad()
        loss = grad_fn(**batch)
        optimizer.step()
        total_loss += loss.float()
        lr_scheduler.step()
        # The AdaLoRA budget update is left disabled, as in the original notebook:
        # model.base_model.update_and_allocate(global_step)

        global_step += 1

    # ---- Evaluation pass ----
    model.set_train(False)
    eval_loss = 0
    eval_preds = []  # rebuilt every epoch, so it holds the last epoch's predictions afterwards
    eval_total_size = eval_dataset.get_dataset_size()
    eval_batches = tqdm(eval_dataset.create_dict_iterator(), total=eval_total_size)
    for batch in eval_batches:
        with mindspore._no_grad():
            outputs = model(**batch)
        eval_loss += outputs.loss.float()
        # Pick the highest-scoring token at every position and decode it back to text.
        predicted_ids = ops.argmax(outputs.logits, -1).asnumpy()
        eval_preds.extend(tokenizer.batch_decode(predicted_ids, skip_special_tokens=True))

    # ---- Epoch summary: mean loss per batch and its exponential (perplexity) ----
    eval_epoch_loss = eval_loss / eval_total_size
    eval_ppl = ops.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / train_total_size
    train_ppl = ops.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

  0%|          | 0/255 [00:00<?, ?it/s]

-

100%|██████████| 255/255 [02:48<00:00,  1.51it/s]
  7%|▋         | 2/29 [00:01<00:17,  1.52it/s]

\

100%|██████████| 29/29 [00:06<00:00,  4.76it/s]


epoch=0: train_ppl=Tensor(shape=[], dtype=Float32, value= 3.05763) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 1.11764) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.15011) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.139858)


100%|██████████| 255/255 [02:50<00:00,  1.49it/s]
100%|██████████| 29/29 [00:07<00:00,  3.82it/s]


epoch=1: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.14799) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.138016) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.05501) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0535532)


100%|██████████| 255/255 [03:17<00:00,  1.29it/s]
100%|██████████| 29/29 [00:06<00:00,  4.17it/s]


epoch=2: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.1036) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0985806) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.04416) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0432154)


100%|██████████| 255/255 [02:58<00:00,  1.43it/s]
100%|██████████| 29/29 [00:07<00:00,  4.00it/s]


epoch=3: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.18729) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.171672) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.22589) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.203668)


100%|██████████| 255/255 [03:10<00:00,  1.34it/s]
100%|██████████| 29/29 [00:07<00:00,  3.75it/s]


epoch=4: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.20757) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.188607) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.12162) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.114774)


100%|██████████| 255/255 [03:17<00:00,  1.29it/s]
100%|██████████| 29/29 [00:07<00:00,  3.83it/s]


epoch=5: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.12998) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.122197) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.09629) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.091934)


100%|██████████| 255/255 [03:09<00:00,  1.35it/s]
100%|██████████| 29/29 [00:07<00:00,  3.98it/s]


epoch=6: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.09778) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.093289) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.08209) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0788912)


100%|██████████| 255/255 [03:18<00:00,  1.29it/s]
100%|██████████| 29/29 [00:07<00:00,  3.93it/s]

epoch=7: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.09743) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0929676) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.08905) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0853012)





In [16]:
# Pair each prediction from the last epoch with the label read back from the unbatched
# validation split.
# NOTE(review): this relies on `validation_dataset` yielding rows in the same order that
# `eval_dataset` did during evaluation — verify if the pipeline changes.
label_rows = validation_dataset.create_dict_iterator(output_numpy=True)
scored_pairs = [(pred, str(row['text_label'])) for pred, row in zip(eval_preds, label_rows)]

# Actual labels, kept for the side-by-side printout below.
ground_truth = [true for _, true in scored_pairs]

# A prediction counts as correct when it equals the label after stripping whitespace.
total = len(scored_pairs)
correct = sum(1 for pred, true in scored_pairs if pred.strip() == true.strip())

# Percentage of correct predictions.
accuracy = correct / total * 100

# Report the accuracy together with a sample of predictions and labels.
print(f"{accuracy=} % on the evaluation dataset")
print(f"{eval_preds[:10]=}")
print(f"{ground_truth[:10]=}")

accuracy=97.34513274336283 % on the evaluation dataset
eval_preds[:10]=['neutral', 'neutral', 'neutral', 'neutral', 'positive', 'positive', 'neutral', 'positive', 'positive', 'positive']
ground_truth[:10]=['neutral', 'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'positive', 'positive', 'positive']


In [17]:
# Save the model
# The output directory is named after the base model, the PEFT type and the task type.
# `model_name_or_path` contains a "/", so this resolves to a nested directory under ../../output/.
peft_model_id = f"../../output/{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"
model.save_pretrained(peft_model_id)

In [18]:
from mindnlp.transformers import AutoModelForSeq2SeqLM
from mindnlp.peft import PeftModel, PeftConfig

# Same directory the adapter was saved to in the previous cell.
peft_model_id = f"../../output/{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"

# The saved PEFT configuration records which base checkpoint the adapter belongs to.
config = PeftConfig.from_pretrained(peft_model_id)

# Rebuild the base model from that checkpoint, then attach the saved adapter to it.
model = PeftModel.from_pretrained(
    AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path),
    peft_model_id,
)

In [19]:
# Option 1: take an entry from the validation dataset.
# example = next(validation_dataset.create_dict_iterator(output_numpy=True)) # Get an example entry from the validation dataset
# print(example['sentence'])
# print(example['text_label'])

# Option 2: supply your own text.
example = {'sentence': 'Nvidia Tops $3 Trillion in Market Value, Leapfrogging Apple.'}

# Tokenize the input sentence into MindSpore tensors.
inputs = tokenizer(example['sentence'], return_tensors="ms")
print(inputs)

# Generate the label text with the reloaded model, in eval mode and without gradient tracking.
model.set_train(False)
with mindspore._no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)

# Show the raw generated token ids, then the decoded label text.
print(outputs)
decoded = tokenizer.batch_decode(outputs.asnumpy(), skip_special_tokens=True)
print(decoded)

{'input_ids': Tensor(shape=[1, 20], dtype=Int64, value=
[[    0,   487, 47435 ...  1257,     4,     2]]), 'attention_mask': Tensor(shape=[1, 20], dtype=Int64, value=
[[1, 1, 1 ... 1, 1, 1]])}
[[    2     0 22173     2]]
['positive']
