In [1]:
import os
import mindspore
from tqdm import tqdm
from mindnlp.transformers import BartTokenizer, AutoModelForSeq2SeqLM
import numpy as np

from mindnlp.peft import AdaLoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model
from mindspore.dataset import GeneratorDataset


os.environ["TOKENIZERS_PARALLELISM"] = "false"
model_name_or_path = "facebook/bart-base"
tokenizer_name_or_path = "facebook/bart-base"

checkpoint_name = "financial_sentiment_analysis_lora_v1.pt"
text_column = "sentence"
label_column = "text_label"
max_length = 128
lr = 1e-3
num_epochs = 8
batch_size = 8


  from .autonotebook import tqdm as notebook_tqdm
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.622 seconds.
Prefix dict has been built successfully.


In [2]:
# creating model
peft_config = AdaLoraConfig(
    init_r=12,
    target_r=8,
    beta1=0.85,
    beta2=0.85,
    tinit=200,
    tfinal=1000,
    deltaT=10,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
)


In [3]:

model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()



trainable params: 2,434,176 || all params: 141,854,688 || trainable%: 1.715964438200308


In [4]:
# download dataset
!wget https://hf-mirror.com/datasets/financial_phrasebank/resolve/main/data/FinancialPhraseBank-v1.0.zip
!unzip FinancialPhraseBank-v1.0.zip

--2024-04-26 15:55:22--  https://hf-mirror.com/datasets/financial_phrasebank/resolve/main/data/FinancialPhraseBank-v1.0.zip
Resolving hf-mirror.com (hf-mirror.com)... 153.121.57.40, 160.16.199.204, 133.242.169.68
Connecting to hf-mirror.com (hf-mirror.com)|153.121.57.40|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.hf-mirror.com/datasets/financial_phrasebank/0e1a06c4900fdae46091d031068601e3773ba067c7cecb5b0da1dcba5ce989a6?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27FinancialPhraseBank-v1.0.zip%3B+filename%3D%22FinancialPhraseBank-v1.0.zip%22%3B&response-content-type=application%2Fzip&Expires=1714376978&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcxNDM3Njk3OH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9kYXRhc2V0cy9maW5hbmNpYWxfcGhyYXNlYmFuay8wZTFhMDZjNDkwMGZkYWU0NjA5MWQwMzEwNjg2MDFlMzc3M2JhMDY3YzdjZWNiNWIwZGExZGNiYTVjZTk4OWE2P3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9z

In [5]:
def process_dataset(source,   batch_size=32, shuffle=False):

    column_names = ['input_ids', 'attention_mask','labels','text_labels']
    
    dataset = GeneratorDataset(source, column_names=column_names, shuffle=shuffle)
    # transforms

    dataset = dataset.batch(batch_size)

    return dataset

class MSDataset:
    def __init__(self, filepath,tokenizer,max_length):
        self.path = filepath
        self.sentences = []
        self.labels = []
        self.text_labels = []
        self._load()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def _load(self):
        label_mapping = {
            "negative": 0,
            "neutral": 1,
            "positive": 2
        }
        with open(self.path, encoding="iso-8859-1") as f:
            for line in f:
                sentence, label_text = line.strip().split("@")
                self.sentences.append(sentence)
                self.labels.append(label_mapping[label_text])
                self.text_labels.append(label_text)

    def __getitem__(self, index):
        sentence = self.sentences[index]
        text_labels = self.text_labels[index]
        model_inputs = self.tokenizer(sentence, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="np")
        labels = self.tokenizer(text_labels, max_length=3, padding="max_length", truncation=True, return_tensors="np")
        labels = labels["input_ids"]
        labels[labels == self.tokenizer.pad_token_id] = -100
        
        return model_inputs['input_ids'], model_inputs['attention_mask'], labels,self.text_labels[index]

    def __len__(self):
        return len(self.sentences)
    
tokenizer = BartTokenizer.from_pretrained(model_name_or_path)
dataset = process_dataset(MSDataset("./FinancialPhraseBank-v1.0/Sentences_AllAgree.txt",tokenizer,max_length),batch_size=batch_size)

train_dataset, eval_dataset = dataset.split([0.9, 0.1])



In [6]:
optimizer = mindspore.nn.AdamWeightDecay(model.trainable_params(), learning_rate=lr)


In [7]:
print(next(train_dataset.create_dict_iterator()))

{'input_ids': Tensor(shape=[8, 1, 128], dtype=Int64, value=
[[[    0,   597,  5246 ...     1,     1,     1]],
 [[    0,   597,  5246 ...     1,     1,     1]],
 [[    0,  1121,   644 ...     1,     1,     1]],
 ...
 [[    0, 20420,  1295 ...     1,     1,     1]],
 [[    0, 20420,  1295 ...     1,     1,     1]],
 [[    0, 20420,  1295 ...     1,     1,     1]]]), 'attention_mask': Tensor(shape=[8, 1, 128], dtype=Int64, value=
[[[1, 1, 1 ... 0, 0, 0]],
 [[1, 1, 1 ... 0, 0, 0]],
 [[1, 1, 1 ... 0, 0, 0]],
 ...
 [[1, 1, 1 ... 0, 0, 0]],
 [[1, 1, 1 ... 0, 0, 0]],
 [[1, 1, 1 ... 0, 0, 0]]]), 'labels': Tensor(shape=[8, 1, 3], dtype=Int64, value=
[[[    0, 33407,     2]],
 [[    0, 33407,     2]],
 [[    0, 33407,     2]],
 ...
 [[    0, 33407,     2]],
 [[    0, 33407,     2]],
 [[    0, 33407,     2]]]), 'text_labels': Tensor(shape=[8], dtype=String, value= ['negative', 'negative', 'negative', 'negative', 'negative', 'negative',
 'negative', 'negative'])}


In [8]:
model.base_model.peft_config["default"].total_step = len(train_dataset) * num_epochs
from mindspore import Tensor

num_batches = len(train_dataset)
num_batches_eval = len(eval_dataset)
                       
def forward_fn(input_ids,attention_mask,labels ):
        output = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return output.loss, output.logits
    
grad_fn = mindspore.value_and_grad(
        forward_fn, None,model.trainable_params(), has_aux=True,return_ids=True
    )
global_step = 0
for epoch in range(num_epochs):
    model.set_train(True)
    total_loss, total_step = 0, 0
    correct = 0
    total = 0
    with tqdm(total=num_batches) as t:
        for step, (input_ids,attention_mask,labels,_) in enumerate(train_dataset):
            input_ids  = input_ids.squeeze(axis=1)
            labels  = labels.squeeze(axis=1)
            attention_mask = attention_mask.squeeze(axis=1)
            (loss, logits), grad = grad_fn(input_ids,attention_mask,labels)
            gradient = [g for _, g in grad]
            gradient = tuple(gradient)
            optimizer(gradient)
            model.base_model.update_and_allocate(global_step, grad)
            total_loss += loss.asnumpy()
            total_step += 1
            global_step += 1
            curr_loss = total_loss / total_step
            t.set_postfix({'train-loss': f'{curr_loss:.2f}'})
            t.update(1)
    model.set_train(False)
    eval_loss = 0
    total_step = 0
    eval_preds = []
    text_labels = []
    with tqdm(total=num_batches_eval) as t:
        for step, (input_ids,attention_mask,labels,text) in enumerate(eval_dataset):
            input_ids = input_ids.squeeze(axis=1)
            labels = labels.squeeze(axis=1)
            attention_mask = attention_mask.squeeze(axis=1)
            outputs = model(input_ids=input_ids,attention_mask=attention_mask,labels=labels)
            loss = outputs.loss
            eval_loss += loss.asnumpy()
            total_step += 1           
            eval_loss = total_loss / total_step
            eval_preds.extend(
                tokenizer.batch_decode(np.argmax(outputs.logits.asnumpy(), -1), skip_special_tokens=True)
            )
            text_str = str(text.asnumpy())
            text_str = text_str.replace("[", "").replace("]", "").replace(",", "").replace("'", "")
            labels = text_str.split(" ")
            text_labels.extend(labels)
            t.set_postfix({'eval-loss': f'{eval_loss:.2f}'})
            t.update(1)
    for pred, text_label in zip(eval_preds, text_labels):
        if pred.strip() == text_label.strip():
            correct += 1
        total += 1
    accuracy = correct / total * 100
    print(f"{accuracy=} % on the evaluation dataset")
    eval_epoch_loss = eval_loss / eval_dataset.get_dataset_size()
    eval_ppl = np.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / train_dataset.get_dataset_size()
    train_ppl = np.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|██████████| 255/255 [02:26<00:00,  1.74it/s, train-loss=0.62]
100%|██████████| 28/28 [00:05<00:00,  5.38it/s, eval-loss=5.66] 


accuracy=93.75 % on the evaluation dataset
epoch=0: train_ppl=1.8609879621920542 train_epoch_loss=0.6211075091771051 eval_ppl=1.2238705103251357 eval_epoch_loss=0.20201838627571658


100%|██████████| 255/255 [02:25<00:00,  1.75it/s, train-loss=0.13]
100%|██████████| 28/28 [00:05<00:00,  4.79it/s, eval-loss=1.16]


accuracy=97.32142857142857 % on the evaluation dataset
epoch=1: train_ppl=1.1363381974778828 train_epoch_loss=0.12781098503984656 eval_ppl=1.042447355381869 eval_epoch_loss=0.041571174981072545


100%|██████████| 255/255 [02:24<00:00,  1.76it/s, train-loss=0.09]
100%|██████████| 28/28 [00:05<00:00,  5.36it/s, eval-loss=0.86]


accuracy=98.21428571428571 % on the evaluation dataset
epoch=2: train_ppl=1.0987274775015643 train_epoch_loss=0.09415267151506508 eval_ppl=1.031097363727399 eval_epoch_loss=0.030623636781047955


100%|██████████| 255/255 [02:24<00:00,  1.77it/s, train-loss=0.07]
100%|██████████| 28/28 [00:05<00:00,  4.83it/s, eval-loss=0.67]


accuracy=98.21428571428571 % on the evaluation dataset
epoch=3: train_ppl=1.0760787627259034 train_epoch_loss=0.0733236586200256 eval_ppl=1.0241355532391463 eval_epoch_loss=0.023848894066462407


100%|██████████| 255/255 [02:35<00:00,  1.64it/s, train-loss=0.07]
100%|██████████| 28/28 [00:05<00:00,  5.00it/s, eval-loss=0.60]


accuracy=99.10714285714286 % on the evaluation dataset
epoch=4: train_ppl=1.06777834246325 train_epoch_loss=0.06558017448759547 eval_ppl=1.0215594030444257 eval_epoch_loss=0.0213302863448174


100%|██████████| 255/255 [02:32<00:00,  1.67it/s, train-loss=0.08]
100%|██████████| 28/28 [00:05<00:00,  4.97it/s, eval-loss=0.74]


accuracy=94.19642857142857 % on the evaluation dataset
epoch=5: train_ppl=1.0851041400075039 train_epoch_loss=0.08167596396022275 eval_ppl=1.0269215330557264 eval_epoch_loss=0.026565523992164285


100%|██████████| 255/255 [02:37<00:00,  1.62it/s, train-loss=0.04]
100%|██████████| 28/28 [00:05<00:00,  5.27it/s, eval-loss=0.32]


accuracy=97.32142857142857 % on the evaluation dataset
epoch=6: train_ppl=1.0361444693679083 train_epoch_loss=0.03550658331197851 eval_ppl=1.0116156410396517 eval_epoch_loss=0.011548697378258315


100%|██████████| 255/255 [02:34<00:00,  1.65it/s, train-loss=0.06]
100%|██████████| 28/28 [00:04<00:00,  5.60it/s, eval-loss=0.55]

accuracy=97.32142857142857 % on the evaluation dataset
epoch=7: train_ppl=1.0618449770364131 train_epoch_loss=0.06000793950595692 eval_ppl=1.0197096077598435 eval_epoch_loss=0.019517888487269153





In [9]:
peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"
model.save_pretrained(peft_model_id)

In [10]:
ckpt = f"{peft_model_id}/adapter_model.ckpt"


In [11]:
peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"

new_config = PeftConfig.from_pretrained(peft_model_id)
new_model = AutoModelForSeq2SeqLM.from_pretrained(new_config.base_model_name_or_path)
new_model = PeftModel.from_pretrained(new_model, peft_model_id)



In [13]:
eval_preds = []
text_labels = []
for step, (input_ids,attention_mask,labels,text) in enumerate(eval_dataset):
            input_ids = input_ids.squeeze(axis=1)
            labels = labels.squeeze(axis=1)
            attention_mask = attention_mask.squeeze(axis=1)
            outputs = model(input_ids=input_ids,attention_mask=attention_mask,labels=labels)
            eval_preds.extend(
                tokenizer.batch_decode(np.argmax(outputs.logits.asnumpy(), -1), skip_special_tokens=True)
            )
            text_str = str(text.asnumpy())
            text_str = text_str.replace("[", "").replace("]", "").replace(",", "").replace("'", "").replace("\n", "")
            labels = text_str.split(" ")
            text_labels.extend(labels)
for pred, text_label in zip(eval_preds, text_labels):
    print(f"{pred=} {text_label=}")

pred='neutral' text_label='neutral'
pred='neutral' text_label='neutral'
pred='neutral' text_label='neutral'
pred='neutral' text_label='neutral'
pred='neutral' text_label='neutral'
pred='neutral' text_label='neutral'
pred='neutral' text_label='neutral'
pred='neutral' text_label='neutral'
pred='negative' text_label='negative'
pred='negative' text_label='negative'
pred='negative' text_label='negative'
pred='negative' text_label='negative'
pred='negative' text_label='negative'
pred='negative' text_label='negative'
pred='negative' text_label='negative'
pred='negative' text_label='negative'
pred='positive' text_label='positive'
pred='positive' text_label='positive'
pred='positive' text_label='positive'
pred='positive' text_label='positive'
pred='positive' text_label='positive'
pred='negative' text_label='negative'
pred='positive' text_label='positive'
pred='positive' text_label='positive'
pred='neutral' text_label='neutral'
pred='neutral' text_label='neutral'
pred='neutral' text_label='neutr