In [None]:
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
import mindspore
import datasets
from tqdm import tqdm
from mindnlp.transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import numpy as np

from mindnlp.peft import AdaLoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model
from mindspore.dataset import GeneratorDataset


# Turn off the tokenizers library's internal parallelism via its environment switch.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# NOTE(review): `device` is not referenced by any other visible cell — confirm it is still needed.
device = "cuda"
# Hub identifier used for both the base model and the tokenizer.
model_name_or_path = "facebook/bart-base"
# NOTE(review): the visible cells load the tokenizer from `model_name_or_path`, not from this name.
tokenizer_name_or_path = "facebook/bart-base"

# NOTE(review): `checkpoint_name` and `label_column` are not referenced by any other visible cell.
checkpoint_name = "financial_sentiment_analysis_lora_v1.pt"
# Column holding the raw sentence in the HuggingFace-style dataset built for the inference demo.
text_column = "sentence"
label_column = "text_label"
# Token length every input sentence is padded / truncated to.
max_length = 128
# Optimizer learning rate.
lr = 1e-3
# Number of full passes over the training split.
num_epochs = 8
# Examples per batch for the batched dataset.
batch_size = 8


In [None]:
# Build the AdaLoRA configuration, wrap the pretrained seq2seq model with it,
# and load the matching tokenizer.
peft_config = AdaLoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    # Rank budget and its schedule (see the AdaLoRA config docs for exact semantics).
    init_r=12,
    target_r=8,
    tinit=200,
    tfinal=1000,
    deltaT=10,
    # Smoothing coefficients used by the rank allocator.
    beta1=0.85,
    beta2=0.85,
    # Standard LoRA scaling / regularisation knobs.
    lora_alpha=32,
    lora_dropout=0.1,
)

model = get_peft_model(
    AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path),
    peft_config,
)
model.print_trainable_parameters()

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)


In [None]:
def process_dataset(source, batch_size=32, shuffle=False):
    """Wrap a random-access ``source`` in a batched ``GeneratorDataset``.

    Args:
        source: Object handed to ``GeneratorDataset``; each item must yield the
            four columns ``input_ids``, ``attention_mask``, ``labels`` and
            ``text_labels`` in that order.
        batch_size: Number of items grouped into one batch.
        shuffle: Forwarded to ``GeneratorDataset`` unchanged.

    Returns:
        The dataset returned by ``GeneratorDataset(...).batch(batch_size)``.
    """
    columns = ['input_ids', 'attention_mask', 'labels', 'text_labels']
    return GeneratorDataset(source, column_names=columns, shuffle=shuffle).batch(batch_size)

class MSDataset:
    """Random-access source over a ``sentence@label`` text file.

    The file is read eagerly at construction time; tokenization happens lazily
    in ``__getitem__``. Each item is the 4-tuple
    ``(input_ids, attention_mask, labels, text_label)``.

    Args:
        filepath: Path to the text file (decoded as ISO-8859-1), one
            ``sentence@label`` record per line.
        tokenizer: Callable tokenizer exposing ``pad_token_id``; it is invoked
            with ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors="np"`` keyword arguments.
        max_length: Length each tokenized sentence is padded / truncated to.

    Raises:
        ValueError: If a non-blank line contains no ``@`` separator.
        KeyError: If a label is not one of ``negative``/``neutral``/``positive``.
    """

    # Text label -> integer class id.
    _LABEL_TO_ID = {
        "negative": 0,
        "neutral": 1,
        "positive": 2
    }

    def __init__(self, filepath,tokenizer,max_length):
        self.path = filepath
        # Set every attribute before loading so the instance is fully formed
        # even if `_load` is later extended to use them.
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.sentences = []
        self.labels = []
        self.text_labels = []
        self._load()

    def _load(self):
        """Parse the file into parallel sentence / label-id / label-text lists."""
        with open(self.path, encoding="iso-8859-1") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                # The label is always the last field; split from the right so a
                # sentence that itself contains "@" does not break unpacking.
                sentence, label_text = line.rsplit("@", 1)
                self.sentences.append(sentence)
                self.labels.append(self._LABEL_TO_ID[label_text])
                self.text_labels.append(label_text)

    def __getitem__(self, index):
        """Tokenize and return the example at ``index``.

        Returns:
            ``(input_ids, attention_mask, labels, text_label)`` where the first
            three are the arrays produced by the tokenizer and ``text_label``
            is the raw label string.
        """
        sentence = self.sentences[index]
        text_label = self.text_labels[index]
        model_inputs = self.tokenizer(sentence, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="np")
        labels = self.tokenizer(text_label, max_length=3, padding="max_length", truncation=True, return_tensors="np")
        labels = labels["input_ids"]
        # Replace padding ids with -100, the label value the loss is expected
        # to ignore (HF-style convention — confirm for the model in use).
        labels[labels == self.tokenizer.pad_token_id] = -100

        return model_inputs['input_ids'], model_inputs['attention_mask'], labels, text_label

    def __len__(self):
        """Number of loaded examples."""
        return len(self.sentences)
    
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# Read the phrase-bank file, batch it, then carve the batched dataset into
# 90% / 10% train / eval splits.
dataset = process_dataset(
    MSDataset(
        "/tmp/code/dataset/data/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt",
        tokenizer,
        max_length,
    ),
    batch_size=batch_size,
)

train_dataset, eval_dataset = dataset.split([0.9, 0.1])



In [None]:
# AdamWeightDecay over the model's trainable parameters only, using the
# learning rate `lr` from the configuration cell.
optimizer = mindspore.nn.AdamWeightDecay(model.trainable_params(), learning_rate=lr)


In [None]:
# Sanity check: pull the first training batch from a dict iterator and print it.
print(next(train_dataset.create_dict_iterator()))

In [None]:
# Tell the AdaLoRA config how many optimizer steps the whole run will take:
# batches per epoch times the number of epochs. The previous expression used an
# undefined `train_dataloader`; the batched training split is `train_dataset`.
model.base_model.peft_config["default"].total_step = train_dataset.get_dataset_size() * num_epochs


In [None]:
from mindspore import Tensor

# Batch counts; they size the tqdm progress bars in the train / eval loops.
num_batches = len(train_dataset)
num_batches_eval = len(eval_dataset)


def forward_fn(input_ids, attention_mask, labels):
    """Run the model on one batch and return ``(loss, logits)``.

    The loss comes first so that ``value_and_grad(..., has_aux=True)`` can
    treat the logits as an auxiliary output.
    """
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    return outputs.loss, outputs.logits


# Gradient function of `forward_fn` with respect to the optimizer's parameters.
grad_fn = mindspore.value_and_grad(forward_fn, None, optimizer.parameters, has_aux=True)
# Main fine-tuning loop: one training pass followed by one evaluation pass per
# epoch, then accuracy / loss / perplexity reporting.
global_step = 0
for epoch in range(num_epochs):
    # ---------------- training ----------------
    model.set_train(True)
    total_loss, total_step = 0, 0
    correct = 0
    total = 0
    with tqdm(total=num_batches) as t:
        for step, (input_ids, attention_mask, labels, _) in enumerate(train_dataset):
            # Each batched array carries a singleton axis 1; drop it before
            # feeding the model.
            input_ids = input_ids.squeeze(axis=1)
            labels = labels.squeeze(axis=1)
            attention_mask = attention_mask.squeeze(axis=1)
            (loss, logits), grad = grad_fn(input_ids, attention_mask, labels)
            optimizer(grad)
            # AdaLoRA hook, called once per optimizer step with the global step
            # counter and the gradients.
            model.base_model.update_and_allocate(global_step, grad)
            total_loss += loss.asnumpy()
            total_step += 1
            global_step += 1
            curr_loss = total_loss / total_step
            t.set_postfix({'train-loss': f'{curr_loss:.2f}'})
            t.update(1)

    # ---------------- evaluation ----------------
    model.set_train(False)
    eval_loss = 0
    eval_step = 0
    eval_preds = []
    text_labels = []
    with tqdm(total=num_batches_eval) as t:
        for step, (input_ids, attention_mask, labels, text) in enumerate(eval_dataset):
            input_ids = input_ids.squeeze(axis=1)
            labels = labels.squeeze(axis=1)
            attention_mask = attention_mask.squeeze(axis=1)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            eval_loss += loss.asnumpy()
            eval_step += 1
            # Running mean of the *evaluation* loss. The accumulator itself
            # must stay a sum so the per-epoch mean below is correct; the old
            # code overwrote it with the training loss total.
            curr_eval_loss = eval_loss / eval_step
            eval_preds.extend(
                tokenizer.batch_decode(np.argmax(outputs.logits.asnumpy(), -1), skip_special_tokens=True)
            )
            # Convert the string column element by element instead of parsing
            # the array's printed representation.
            batch_text_labels = [
                item.decode("utf-8") if isinstance(item, bytes) else str(item)
                for item in text.asnumpy().reshape(-1).tolist()
            ]
            text_labels.extend(batch_text_labels)
            t.set_postfix({'eval-loss': f'{curr_eval_loss:.2f}'})
            t.update(1)

    # ---------------- metrics ----------------
    for pred, text_label in zip(eval_preds, text_labels):
        if pred.strip() == text_label.strip():
            correct += 1
        total += 1
    # Guard against an empty evaluation split.
    accuracy = correct / total * 100 if total else 0.0
    print(f"{accuracy=} % on the evaluation dataset")
    eval_epoch_loss = eval_loss / eval_dataset.get_dataset_size()
    eval_ppl = np.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / train_dataset.get_dataset_size()
    train_ppl = np.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

In [None]:
# Output directory name derived from the base model name, PEFT type and task type.
# `model_name_or_path` contains a "/", so this resolves to a nested path.
peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"
# Persist the fine-tuned PEFT model there (presumably adapter weights + config — confirm).
model.save_pretrained(peft_model_id)

In [None]:
# Path of the adapter weights file inside the saved directory.
# NOTE(review): not referenced by any other visible cell; confirm the file name
# matches what `save_pretrained` actually writes.
ckpt = f"{peft_model_id}/adapter_model.bin"


In [None]:
# Reload from disk: read the saved PEFT config, load the base model it points
# to, then attach the saved adapter on top of it.
peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"

config = PeftConfig.from_pretrained(peft_model_id)
model = PeftModel.from_pretrained(
    AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path),
    peft_model_id,
)

In [None]:
# Index of the validation example used for the generation demo later in this cell.
i = 13
def load_dataset(filepath):
    """Read a ``sentence@label`` text file into a ``datasets.Dataset``.

    Args:
        filepath: Path to the text file (decoded as ISO-8859-1), one
            ``sentence@label`` record per line. Blank lines are skipped.

    Returns:
        A dataset with two string columns, ``"sentence"`` and ``"label"``.

    Raises:
        ValueError: If a non-blank line contains no ``@`` separator.
    """
    sentences = []
    labels = []
    with open(filepath, encoding="iso-8859-1") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # The label is the last field; split from the right so an "@"
            # inside the sentence does not break unpacking.
            sentence, label = line.rsplit("@", 1)
            sentences.append(sentence)
            labels.append(label)

    dataset = datasets.Dataset.from_dict({
        "sentence": sentences,
        "label": labels,
    })
    return dataset

# Re-read the corpus as a HuggingFace-style dataset, hold out 10% of it, and
# expose that hold-out under the "validation" key instead of "test".
dataset = load_dataset("/tmp/code/dataset/data/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt")
dataset = dataset.train_test_split(test_size=0.1)
dataset["validation"] = dataset.pop("test")

# Tokenize one held-out sentence and show both the raw text and the model inputs.
sample_sentence = dataset["validation"][text_column][i]
inputs = tokenizer(sample_sentence, return_tensors="ms")
print(sample_sentence)
print(inputs)

# Generate a short continuation and decode it back to text.
outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
print(outputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))