In [1]:
import csv
import sys
import copy
import json
import os

class InputExample(object):
    """
    One raw example (train or test) for simple sequence classification.

    Args:
        guid: Unique id for the example.
        text_a: string. Untokenized text of the first sequence. It is the only
        sequence required for single-sequence tasks.
        text_b: (Optional) string. Untokenized text of the second sequence.
        Needed only for sequence-pair tasks.
        label: (Optional) string. Label of the example. Expected for train and
        dev examples, but not for test examples.
    """
    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

    def __repr__(self):
        # The printable form is simply the JSON rendering of the attributes.
        rendered = self.to_json_string()
        return str(rendered)

    def to_dict(self):
        """Returns a deep copy of the instance attributes as a dictionary."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Returns the attributes as key-sorted, 2-space indented JSON ending in a newline."""
        payload = self.to_dict()
        return json.dumps(payload, indent=2, sort_keys=True) + "\n"


class InputFeatures(object):
    """
    The model-ready features built for one example.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            Usually  ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
        label: Label corresponding to the input
        input_len: Value stored as ``input_len`` next to the other features.
    """

    def __init__(self, input_ids, attention_mask, token_type_ids, label,input_len):
        self.input_ids, self.attention_mask = input_ids, attention_mask
        self.token_type_ids = token_type_ids
        # Stored in this order so the attribute dictionary keeps its original layout.
        self.input_len, self.label = input_len, label

    def __repr__(self):
        # The printable form is simply the JSON rendering of the attributes.
        rendered = self.to_json_string()
        return str(rendered)

    def to_dict(self):
        """Returns a deep copy of the instance attributes as a dictionary."""
        return copy.deepcopy(vars(self))

    def to_json_string(self):
        """Returns the attributes as key-sorted, 2-space indented JSON ending in a newline."""
        payload = self.to_dict()
        return json.dumps(payload, indent=2, sort_keys=True) + "\n"


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the test set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file.

        Args:
            input_file: Path of the file to read.
            quotechar: Quote character handed to ``csv.reader``.

        Returns:
            A list with one list of column strings per row.
        """
        # "utf-8-sig" drops a leading byte-order mark if there is one;
        # newline="" leaves line-ending handling to the csv module.
        with open(input_file, "r", encoding="utf-8-sig", newline="") as f:
            return list(csv.reader(f, delimiter="\t", quotechar=quotechar))

    @classmethod
    def _read_txt(cls, input_file, delimiter="_!_", encoding="utf-8-sig"):
        """Reads a text file whose fields are separated by ``delimiter``.

        Args:
            input_file: Path of the file to read.
            delimiter: Field separator; defaults to ``"_!_"``.
            encoding: Text encoding of the file. The default matches ``_read_tsv``
                so the result does not depend on the platform's default encoding.

        Returns:
            A list with one list of field strings per line. Each line is
            stripped of surrounding whitespace before it is split.
        """
        with open(input_file, "r", encoding=encoding) as f:
            return [line.strip().split(delimiter) for line in f]
        

class MrpcProcessor(DataProcessor):
    """Processor for the MRPC data set (GLUE version)."""

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        """Builds the test examples from ``test.tsv`` inside ``data_dir``."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev and test sets.

        Args:
            lines: Rows as returned by ``_read_tsv``; the first row is skipped
                as a header.
            set_type: Split name, used as the prefix of each example's guid.

        Returns:
            A list of ``InputExample`` objects, one per non-header row.
        """
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            guid = "%s-%s" % (set_type, i)
            text_a = line[3]
            text_b = line[4]
            # NOTE(review): assumes test.tsv keeps the sentences in columns 3 and 4
            # and has no gold label in column 0 - confirm against the actual file.
            label = None if set_type == "test" else line[0]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples

def glue_convert_examples_to_features(examples, tokenizer,
                                      max_seq_length=512,
                                      task=None,
                                      label_list=None,
                                      output_mode=None):
    """
    Converts a sequence of ``InputExample`` objects into a list of ``InputFeatures``.

    Args:
        examples: Iterable of ``InputExample`` objects.
        tokenizer: Tokenizer applied to the example text. It must provide a writable
            ``return_token`` attribute, ``execute_py(text)`` (whose id output supports
            ``.tolist()``) and, for sequence pairs, ``tokenize(text)``.
        max_seq_length: Length every feature sequence is padded to.
        task: Optional GLUE task name. When given, ``glue_processors[task]`` and
            ``glue_output_modes[task]`` supply ``label_list`` / ``output_mode`` if
            those were not passed in.
        label_list: List of labels. Can be obtained from the processor using the
            ``processor.get_labels()`` method. Required for ``classification``.
        output_mode: String indicating the output mode. Either ``regression`` or
            ``classification``.

    Returns:
        A list with one ``InputFeatures`` per example. An example whose ``label`` is
        ``None`` (and for which ``None`` is not itself a listed label) gets ``label=None``.

    Raises:
        ValueError: ``output_mode`` is ``classification`` and no label list is available.
        KeyError: ``output_mode`` is neither supported value, or a label is missing
            from ``label_list``.
        AssertionError: A feature sequence does not have length ``max_seq_length``
            after padding.
    """
    import logging

    # No module-level `logger` is visible next to this function, so one is
    # resolved locally instead of relying on a global that may not exist.
    logger = logging.getLogger(__name__)

    if task is not None:
        # NOTE(review): `glue_processors` and `glue_output_modes` are not defined in
        # the visible code; this branch needs both in scope.
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    if label_list is None:
        if output_mode == "classification":
            raise ValueError("label_list is required when output_mode is 'classification'")
        # Regression does not look labels up, so an empty map is enough.
        label_map = {}
    else:
        label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))

        # First pass: ask the tokenizer for token strings.
        tokenizer.return_token = True
        tokens_a = tokenizer.execute_py(example.text_a)
        tokens_b = tokenizer.tokenize(example.text_b) if example.text_b else None
        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            # NOTE(review): `_truncate_seq_pair` is not defined in the visible code.
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        elif len(tokens_a) > max_seq_length - 2:
            # Account for [CLS] and [SEP] with "- 2"
            tokens_a = tokens_a[0:(max_seq_length - 2)]

        tokens = list(tokens_a)
        token_type_ids = [0] * len(tokens)
        if tokens_b:
            tokens.extend(tokens_b)
            tokens.append("[SEP]")
            # Segment 1 covers the second sequence and its closing [SEP].
            token_type_ids.extend([1] * (len(tokens_b) + 1))

        # Second pass: ask the tokenizer for vocabulary ids.
        # NOTE(review): `input_ids` come from `text_a` alone and are not truncated,
        # while `token_type_ids` is sized from the (possibly truncated) token lists,
        # including `text_b`. Whenever those lengths differ, the assertions below
        # fail - confirm the intended encoding for pairs and over-long inputs.
        tokenizer.return_token = False
        input_ids = tokenizer.execute_py(example.text_a).tolist()

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_len = len(input_ids)
        attention_mask = [1] * input_len

        # Zero-pad up to the sequence length.
        pad_count = max_seq_length - input_len
        if pad_count > 0:
            input_ids.extend([0] * pad_count)
            attention_mask.extend([0] * pad_count)
            token_type_ids.extend([0] * pad_count)

        assert len(input_ids) == max_seq_length
        assert len(attention_mask) == max_seq_length
        assert len(token_type_ids) == max_seq_length

        if output_mode == "classification":
            # Unlabeled (test) examples keep label None unless None is a listed label.
            if example.label is None:
                label_id = label_map.get(None)
            else:
                label_id = label_map[example.label]
        elif output_mode == "regression":
            label_id = None if example.label is None else float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s"%" ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            if label_id is None:
                # "%d" cannot format None, so unlabeled examples use "%s".
                logger.info("label: %s (id = %s)" % (example.label, label_id))
            else:
                logger.info("label: %s (id = %d)" % (example.label, label_id))
            logger.info("input length: %d" % (input_len))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label_id,
                          input_len=input_len))
    return features

In [None]:
def load_and_cache_examples(args, task, tokenizer, data_type='train'):
    """Loads one MRPC split and converts it into parallel feature lists.

    Despite its name, this function does not read or write any cache.

    Args:
        args: Object providing ``data_dir`` and ``max_seq_length`` (and
            ``model_type`` when ``task`` is an MNLI variant).
        task: Task name; only consulted for the MNLI/RoBERTa label swap.
        tokenizer: Tokenizer forwarded to ``glue_convert_examples_to_features``.
        data_type: ``'train'`` or ``'dev'``; any other value selects the test split.

    Returns:
        A tuple of five parallel lists:
        ``(input_ids, attention_mask, token_type_ids, input_len, label)``.
    """
    processor = MrpcProcessor()
    output_mode = "classification"
    # Load data features from dataset file
    label_list = processor.get_labels()
    if (task in ['mnli', 'mnli-mm'] and 'roberta' in args.model_type
            and len(label_list) > 2):
        # HACK(label indices are swapped in RoBERTa pretrained model)
        # The length guard keeps a two-label list (as MRPC returns) from
        # raising IndexError here.
        label_list[1], label_list[2] = label_list[2], label_list[1]

    if data_type == 'train':
        examples = processor.get_train_examples(args.data_dir)
    elif data_type == 'dev':
        examples = processor.get_dev_examples(args.data_dir)
    else:
        examples = processor.get_test_examples(args.data_dir)

    # The converter in this notebook is named `glue_convert_examples_to_features`.
    features = glue_convert_examples_to_features(examples,
                                                 tokenizer,
                                                 label_list=label_list,
                                                 max_seq_length=args.max_seq_length,
                                                 output_mode=output_mode)

    # Transpose the per-example features into one list per field.
    all_input_ids = [f.input_ids for f in features]
    all_attention_mask = [f.attention_mask for f in features]
    all_token_type_ids = [f.token_type_ids for f in features]
    all_lens = [f.input_len for f in features]
    all_labels = [f.label for f in features]

    return (all_input_ids, all_attention_mask, all_token_type_ids, all_lens, all_labels)


In [1]:
import argparse
import os

import mindspore
from mindspore.nn import AdamWeightDecay
from tqdm import tqdm
# from mindnlp.metrics import 

from mindnlp.peft import (
    get_peft_config,
    get_peft_model,
    LoraConfig,
    PeftType,
)


  from tqdm.autonotebook import tqdm


# Hyperparameters

In [3]:
# Batch size; forwarded to `process_dataset` in a later cell.
batch_size = 32
# Checkpoint name. NOTE(review): the loading calls in later cells pass the literal
# 'roberta-base' instead of this variable.
model_name_or_path = "roberta-base"
# Task identifier; not referenced by the cells visible here.
task = "mrpc"
# PEFT method selector; not referenced by the cells visible here.
peft_type = PeftType.LORA
# Target device name; not referenced by the cells visible here.
device = "GPU" # "cuda"
# Epoch count and learning rate; presumably for a training loop that is not
# part of the cells visible here.
num_epochs = 20
lr = 3e-4

## Prepare dataset

In [5]:
from mindspore.dataset import text 


from mindnlp import load_dataset, process
from mindnlp.transforms import RobertaTokenizer, PadTransform
from mindnlp.models import RobertaConfig, RobertaForSequenceClassification
from mindnlp.dataset import MRPC, MRPC_Process

# load tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# load dataset
# Dataset root: taken from the MRPC_ROOT environment variable when it is set,
# otherwise the author's local path, so the notebook is not tied to one machine.
MRPC_ROOT = os.environ.get("MRPC_ROOT", "/home/cjl/")
mrpc_train, mrpc_test = MRPC(root=MRPC_ROOT)

def look_ds(ds, ds_tag='train', num_rows=10):
    """Take a brief look at dataset.

    Prints the dataset length followed by its first ``num_rows`` rows.

    Args:
        ds: Dataset supporting ``len()`` and ``create_tuple_iterator()``; each
            row must unpack into three values (label, sentence1, sentence2).
        ds_tag: Name printed in front of the dataset length.
        num_rows: Number of leading rows to print.
    """
    from itertools import islice

    print(f"{ds_tag} dataset length: ", len(ds))
    # print(f"{ds_tag} dataset colums: ", ds.column_names)

    # islice stops pulling rows after `num_rows`, so the rest of the dataset is
    # never iterated just to be discarded.
    for label, sentence1, sentence2 in islice(ds.create_tuple_iterator(), num_rows):
        print(label, sentence1, sentence2)

# Preview the raw training split: its length and its leading rows.
look_ds(mrpc_train, 'train')
# look_ds(mrpc_test, 'test')

# Build a vocabulary from both sentence columns of the training split, with the
# special tokens "<pad>" and "<unk>" placed first (special_first=True).
vocab = text.Vocab.from_dataset(mrpc_train, columns=['sentence1', 'sentence2'], 
                                special_tokens=["<pad>","<unk>"], special_first=True)


train dataset length:  4076
1 Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence . Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .
0 Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion . Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 .
1 They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added . On June 10 , the ship 's owners had published an advertisement on the Internet , offering the explosives for sale .
0 Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A $ 4.56 , having earlier set a record high of A $ 4.57 . Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .
1 The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange . PG & E Corp. shares j

In [None]:

import numpy as np

def process_dataset(ds, ds_tag, tokenizer, vocab, col_names, batch_size=8, max_seq_len=512, shuffle=False):
    """Tokenize, merge, pad and batch a sentence-pair dataset.

    Args:
        ds: Dataset exposing ``map`` and ``batch``.
        ds_tag: Not used by this function.
        tokenizer: Operation mapped over each column listed in ``col_names``.
        vocab: Vocabulary whose ``tokens_to_ids('<pad>')`` supplies the pad value.
        col_names: The two columns that are tokenized and then concatenated
            into a single ``'merged'`` column.
        batch_size: Number of rows per batch.
        max_seq_len: Length handed to ``PadTransform``.
        shuffle: Not used by this function.

    Returns:
        The batched dataset; an incomplete final batch is dropped.
    """
    def _join_pair(first, second):
        # Concatenate the two tokenized sentences into one array.
        return np.concatenate((first, second))

    # Tokenize every requested column in turn.
    tokenized = ds
    for column in col_names:
        tokenized = tokenized.map(tokenizer, input_columns=column)

    # Fuse the two sentence columns into the single 'merged' column.
    merged = tokenized.map(_join_pair, input_columns=col_names, output_columns='merged')

    # Pad 'merged' using the id the vocabulary assigns to '<pad>'.
    padder = PadTransform(max_seq_len, vocab.tokens_to_ids('<pad>'), return_length=False)
    padded = merged.map(padder, input_columns='merged')

    return padded.batch(batch_size, drop_remainder=True)

# Give the processed pipeline its own name and leave the raw split bound to
# `mrpc_train`, so this cell can be re-run and later cells still find the
# 'sentence1' / 'sentence2' columns on `mrpc_train`.
mrpc_train_batched = process_dataset(mrpc_train, 'train', tokenizer, vocab, mrpc_train.column_names[1:], batch_size=8)

# Peek at the shapes of the first batch only.
# NOTE(review): the loop names assume each batch yields (input_ids, label) in
# that order - confirm against the dataset's column order.
for input_ids, label in mrpc_train_batched:
    print(input_ids.shape, label.shape)
    break

In [None]:
# mask ?

## Prepare model

In [None]:
"""
Here we define the class InputBuilder, a utility class that prepares the inputs of the RoBERTa model from the GLUE/MRPC dataset.
We also define a wrapper class for the GLUE/MRPC dataset.
"""
import torch


class InputBuilder:
    """Builds fixed-length model inputs from a sentence-pair example.

    Args:
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token`` and ``sep_token``. If it also exposes a non-``None``
            ``pad_token_id``, that id is used for padding; otherwise 0 is used.
        max_len: Length every ``input_ids`` / ``input_mask`` sequence is padded to.
    """

    def __init__(self, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len

    def truncate_pair_of_tokens(self, tokens_a, tokens_b):
        """Trims both token lists in place until they fit beside three special tokens.

        One token at a time is removed from the end of the longer list; when the
        lists are equally long, ``tokens_b`` is shortened.
        """
        budget = self.max_len - 3
        while len(tokens_a) + len(tokens_b) > budget:
            longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
            longer.pop()

    def build_features(self, example):
        """Converts one example into ``(input_ids, input_mask, label)`` tensors.

        Args:
            example: Mapping with the keys ``"sentence1"``, ``"sentence2"`` and ``"label"``.

        Returns:
            ``input_ids`` (int64) and ``input_mask`` (float), both padded to
            ``max_len``, and ``label`` (int64).
        """
        tokens_a = self.tokenizer.tokenize(example["sentence1"])
        tokens_b = self.tokenizer.tokenize(example["sentence2"])
        self.truncate_pair_of_tokens(tokens_a, tokens_b)

        # Layout: <cls> A <sep> B <sep>
        # NOTE(review): RoBERTa checkpoints are usually fed pairs with a doubled
        # separator between the sentences - confirm whether that layout is wanted.
        sep = self.tokenizer.sep_token
        tokens = [self.tokenizer.cls_token, *tokens_a, sep, *tokens_b, sep]

        input_ids = list(self.tokenizer.convert_tokens_to_ids(tokens))
        real_len = len(input_ids)

        # Pad with the tokenizer's own pad id where it exposes one; a literal 0
        # is not the pad id for every vocabulary.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = 0
        pad_count = max(self.max_len - real_len, 0)

        input_mask = [1] * real_len + [0] * pad_count
        input_ids = input_ids + [pad_id] * pad_count

        input_ids = torch.tensor(input_ids, dtype=torch.int64)
        input_mask = torch.tensor(input_mask, dtype=torch.float)
        label = torch.tensor(example["label"], dtype=torch.int64)
        return input_ids, input_mask, label


class GlueDataset(torch.utils.data.Dataset):
    """Dataset wrapper that turns each raw example into model inputs on access.

    Args:
        dataset: Indexable collection of raw examples supporting ``len()``.
        max_len: Sequence length handed to the ``InputBuilder``.
        tokenizer: Tokenizer handed to the ``InputBuilder``.
    """

    def __init__(self, dataset, max_len, tokenizer):
        self.dataset = dataset
        self.max_len = max_len
        self.input_builder = InputBuilder(tokenizer, max_len)

    def __len__(self):
        # Same number of items as the wrapped dataset.
        return len(self.dataset)

    def __getitem__(self, idx):
        # Look up the raw example and let the builder produce the three inputs.
        raw_example = self.dataset[idx]
        ids, mask, target = self.input_builder.build_features(raw_example)
        return ids, mask, target


In [None]:

# Load the pretrained 'roberta-base' tokenizer (rebinds the `tokenizer` name
# created in an earlier cell).
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Names of the two sentence columns to tokenize.
cols = ['sentence1', 'sentence2']
def process_dataset(dataset, tokenizer, column_names, batch_size, max_seq_len=512, shuffle=False):
    """Applies ``tokenizer`` to every column listed in ``column_names``.

    Args:
        dataset: Dataset exposing ``map``.
        tokenizer: Operation mapped over each listed column.
        column_names: Columns to tokenize, processed in the given order.
        batch_size: Accepted but not used by this function.
        max_seq_len: Accepted but not used by this function.
        shuffle: Accepted but not used by this function.

    Returns:
        The dataset after the tokenizer has been mapped over each column; the
        input dataset itself when ``column_names`` is empty.
    """
    tokenized = dataset
    # One map call per column.
    for column in column_names:
        tokenized = tokenized.map(tokenizer, input_columns=column)
    return tokenized

# Tokenize both sentence columns of `mrpc_train`. `batch_size` is forwarded, but
# the `process_dataset` defined in this cell does not use it.
ds = process_dataset(mrpc_train, tokenizer, column_names=cols, batch_size=batch_size)



## roberta-base model from pretrained

In [None]:

# Configure the model for two output labels.
model_config = RobertaConfig(num_labels=2)
# Load the pretrained 'roberta-base' checkpoint with that configuration.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=model_config )

# model

In [None]:
# LoRA configuration for sequence classification (r=8, lora_alpha=16,
# lora_dropout=0.1), created with inference_mode=False.
peft_config = LoraConfig(task_type="SEQ_CLS", inference_mode=False, r=8, lora_alpha=16, lora_dropout=0.1)
# Wrap the base model according to the LoRA configuration.
peft_model = get_peft_model(model, peft_config)
# model.print_train_parameters()