In [1]:
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
from mindnlp.peft import get_peft_config, get_peft_model, get_peft_model_state_dict, LoKrConfig, TaskType
import mindspore
import datasets
from mindnlp.transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
import numpy as np
from mindspore.dataset import NumpySlicesDataset, SequentialSampler
from mindspore.dataset import text, GeneratorDataset, transforms

model_name_or_path = "bigscience/mt0-large"
tokenizer_name_or_path = "bigscience/mt0-large"

text_column = "sentence"
label_column = "text_label"
max_length = 128
lr = 1e-4
num_epochs = 3
batch_size = 6

  from .autonotebook import tqdm as notebook_tqdm
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.489 seconds.
Prefix dict has been built successfully.


In [2]:
# creating model
peft_config = LoKrConfig(task_type=TaskType.SEQ_2_SEQ_LM, target_modules=["q", "v"],inference_mode=False,r=8,lora_alpha=32,rank_dropout=0.0,module_dropout=0.0,use_effective_conv2d=True,init_weights=True,)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)

In [3]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 221,184 || all params: 1,229,802,496 || trainable%: 0.017985326970746365


In [4]:
def process_dataset(source,   batch_size=32, shuffle=False):

    column_names = ['input_ids', 'attention_mask','labels','text_labels']
    
    dataset = GeneratorDataset(source, column_names=column_names, shuffle=shuffle)
    # transforms

    dataset = dataset.batch(batch_size)

    return dataset

class MSDataset:
    def __init__(self, filepath,tokenizer,max_length):
        self.path = filepath
        self.sentences = []
        self.labels = []
        self.text_labels = []
        self._load()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def _load(self):
        label_mapping = {
            "negative": 0,
            "neutral": 1,
            "positive": 2
        }
        with open(self.path, encoding="iso-8859-1") as f:
            for line in f:
                sentence, label_text = line.strip().split("@")
                self.sentences.append(sentence)
                self.labels.append(label_mapping[label_text])
                self.text_labels.append(label_text)

    def __getitem__(self, index):
        sentence = self.sentences[index]
        text_labels = self.text_labels[index]
        model_inputs = self.tokenizer(sentence, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="np")
        labels = self.tokenizer(text_labels, max_length=3, padding="max_length", truncation=True, return_tensors="np")
        labels = labels["input_ids"]
        labels[labels == self.tokenizer.pad_token_id] = -100
        
        return model_inputs['input_ids'], model_inputs['attention_mask'], labels,self.text_labels[index]

    def __len__(self):
        return len(self.sentences)
    
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
dataset = process_dataset(MSDataset("/root/test/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt",tokenizer,max_length),batch_size=batch_size)

train_dataset, eval_dataset = dataset.split([0.9, 0.1])



In [5]:
# optimizer and lr scheduler
optimizer = mindspore.nn.AdamWeightDecay(model.trainable_params(), learning_rate=lr)


In [6]:
print(next(train_dataset.create_dict_iterator()))

{'input_ids': Tensor(shape=[6, 1, 128], dtype=Int64, value=
[[[ 2894,   287,  5835 ...     0,     0,     0]],
 [[  259, 23846,   263 ...     0,     0,     0]],
 [[11055, 20672,   287 ...     0,     0,     0]],
 [[ 7468, 19934,  1532 ...     0,     0,     0]],
 [[ 7468, 19934,  1532 ...     0,     0,     0]],
 [[12781,  1503,   329 ...     0,     0,     0]]]), 'attention_mask': Tensor(shape=[6, 1, 128], dtype=Int64, value=
[[[1, 1, 1 ... 0, 0, 0]],
 [[1, 1, 1 ... 0, 0, 0]],
 [[1, 1, 1 ... 0, 0, 0]],
 [[1, 1, 1 ... 0, 0, 0]],
 [[1, 1, 1 ... 0, 0, 0]],
 [[1, 1, 1 ... 0, 0, 0]]]), 'labels': Tensor(shape=[6, 1, 3], dtype=Int64, value=
[[[59006,     1,  -100]],
 [[59006,     1,  -100]],
 [[59006,     1,  -100]],
 [[59006,     1,  -100]],
 [[59006,     1,  -100]],
 [[59006,     1,  -100]]]), 'text_labels': Tensor(shape=[6], dtype=String, value= ['neutral', 'neutral', 'neutral', 'neutral', 'neutral', 'neutral'])}


In [7]:
# training and evaluation
from mindspore import Tensor

num_batches = len(train_dataset)
num_batches_eval = len(eval_dataset)
                       
def forward_fn(input_ids,attention_mask,labels ):
        output = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return output.loss, output.logits
    
grad_fn = mindspore.value_and_grad(
        forward_fn, None,optimizer.parameters, has_aux=True
    )
for epoch in range(num_epochs):
    model.set_train(True)
    total_loss, total_step = 0, 0
    correct = 0
    total = 0
    with tqdm(total=num_batches) as t:
        for step, (input_ids,attention_mask,labels,_) in enumerate(train_dataset):
            input_ids  = input_ids.squeeze(axis=1)
            labels  = labels.squeeze(axis=1)
            attention_mask = attention_mask.squeeze(axis=1)
            (loss, logits), grad = grad_fn(input_ids,attention_mask,labels)
            optimizer(grad)
            total_loss += loss.asnumpy()
            total_step += 1
            curr_loss = total_loss / total_step
            t.set_postfix({'train-loss': f'{curr_loss:.2f}'})
            t.update(1)
    model.set_train(False)
    eval_loss = 0
    total_step = 0
    eval_preds = []
    text_labels = []
    with tqdm(total=num_batches_eval) as t:
        for step, (input_ids,attention_mask,labels,text) in enumerate(eval_dataset):
            input_ids = input_ids.squeeze(axis=1)
            labels = labels.squeeze(axis=1)
            attention_mask = attention_mask.squeeze(axis=1)
            outputs = model(input_ids=input_ids,attention_mask=attention_mask,labels=labels)
            loss = outputs.loss
            eval_loss += loss.asnumpy()
            total_step += 1           
            eval_loss = total_loss / total_step
            eval_preds.extend(
                tokenizer.batch_decode(np.argmax(outputs.logits.asnumpy(), -1), skip_special_tokens=True)
            )
            text_str = str(text.asnumpy())
            text_str = text_str.replace("[", "").replace("]", "").replace(",", "").replace("'", "")
            labels = text_str.split(" ")
            text_labels.extend(labels)
            t.set_postfix({'eval-loss': f'{eval_loss:.2f}'})
            t.update(1)
    for pred, text_label in zip(eval_preds, text_labels):
        if pred.strip() == text_label.strip():
            correct += 1
        total += 1
    accuracy = correct / total * 100
    print(f"{accuracy=} % on the evaluation dataset")
    eval_epoch_loss = eval_loss / eval_dataset.get_dataset_size()
    eval_ppl = np.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / train_dataset.get_dataset_size()
    train_ppl = np.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|██████████| 340/340 [09:28<00:00,  1.67s/it, train-loss=3.79]
100%|██████████| 38/38 [00:20<00:00,  1.82it/s, eval-loss=33.95] 


accuracy=26.785714285714285 % on the evaluation dataset
epoch=0: train_ppl=44.457796177776416 train_epoch_loss=3.7945403386564815 eval_ppl=2.443548598151249 eval_epoch_loss=0.8934513262764568


100%|██████████| 340/340 [09:42<00:00,  1.71s/it, train-loss=0.57]
100%|██████████| 38/38 [00:19<00:00,  1.95it/s, eval-loss=5.08] 


accuracy=81.14035087719299 % on the evaluation dataset
epoch=1: train_ppl=1.7650914597245262 train_epoch_loss=0.568202507583534 eval_ppl=1.1431496390290274 eval_epoch_loss=0.13378729402936396


100%|██████████| 340/340 [08:28<00:00,  1.49s/it, train-loss=0.31]
100%|██████████| 38/38 [00:18<00:00,  2.02it/s, eval-loss=2.80] 


accuracy=87.71929824561403 % on the evaluation dataset
epoch=2: train_ppl=1.367369146770264 train_epoch_loss=0.31288856281625 eval_ppl=1.0764534794629177 eval_epoch_loss=0.07367182226975415


In [8]:
# saving model
peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"
model.save_pretrained(peft_model_id)

In [9]:
ckpt = f"{peft_model_id}/adapter_model.ckpt"
!du -h $ckpt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2468.52s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


904K	bigscience/mt0-large_LOKR_SEQ_2_SEQ_LM/adapter_model.ckpt


In [5]:
from mindnlp.peft import PeftModel, PeftConfig
import logging
logging.getLogger('mindspore').setLevel(logging.ERROR)
peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"

config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(model, peft_model_id)



In [6]:
i = 13
def load_dataset(filepath):
    sentences = []
    labels = []
    with open(filepath, encoding="iso-8859-1") as f:
        for line in f:
            sentence, label = line.strip().split("@")
            sentences.append(sentence)
            labels.append(label)
    
    dataset = datasets.Dataset.from_dict({
        "sentence": sentences,
        "label": labels,
    })
    return dataset

dataset = load_dataset("/root/test/FinancialPhraseBank-v1.0/Sentences_AllAgree.txt")
dataset = dataset.train_test_split(test_size=0.1)
dataset["validation"] = dataset["test"]
del dataset["test"]

inputs = tokenizer(dataset["validation"][text_column][i], return_tensors="ms")
print(dataset["validation"][text_column][i])
print(inputs)

outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
print(outputs)
print(tokenizer.batch_decode(outputs ,skip_special_tokens=True))

The share capital of Alma Media Corporation (business ID 1944757-4)is EUR 45,031,513.80 and it is divided into 75,052,523 shares .
{'input_ids': Tensor(shape=[1, 35], dtype=Int64, value=
[[ 486, 8364, 8646 ...  259,  260,    1]]), 'attention_mask': Tensor(shape=[1, 35], dtype=Int64, value=
[[1, 1, 1 ... 1, 1, 1]])}




[[    0 59006     1]]
['neutral']
