In [1]:
import os
import mindspore
from mindspore import ops
from mindspore.amp import StaticLossScaler
from mindnlp.dataset import load_dataset
from mindnlp.core.serialization import safe_load_file
from mindnlp.transformers.models.gpt_neox import (
    GPTNeoXForCausalLM,
)
from mindnlp.engine import TrainingArguments, Trainer
from mindnlp.transformers import AutoTokenizer
from mindnlp.peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.307 seconds.
Prefix dict has been built successfully.


In [2]:
# Replace the HuggingFace download link with hf-mirror.
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-410m-deduped")

In [3]:
max_length = 512
lora_r = 8
lora_alpha = 16
lora_dropout = 0.1

lr = 1e-4
weight_decay = 0.01
num_train_epochs = 1
batch_size = 2

# Log the progress every set steps
logging_steps = 100
# Save the model every set steps
save_steps = 500
save_total_limit = 4

# Set scaling value for loss scale
scale_value = 2**5
# Set label smoothing
label_smoothing_factor = 0.0

output_dir = "output"
# resume_from_checkpoint = os.path.join(output_dir, "checkpoint-3500")
resume_from_checkpoint = None

In [7]:
model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-410m-deduped")

In [8]:
# Load Belle dataset
# https://huggingface.co/datasets/BelleGroup/train_0.5M_CN/blob/main/Belle_open_source_0.5M.json
ds = load_dataset("BelleGroup/train_0.5M_CN")
# ds = load_dataset("./data")

In [9]:
# View dataset information.
print(ds.column_names)
print(type(ds.source[0][0]), ds.source[0])

['instruction', 'input', 'output']
<class 'str'> ('给定一个英文句子，翻译成中文。\nI love to learn new things every day.\n', '', '我每天喜欢学习新事物。')


In [10]:
# Splitting the training set and test set.
train_dataset, eval_dataset = ds.split([0.9, 0.1])

In [11]:
# Set Prompt
BOS_TOKEN = tokenizer.bos_token
EOS_TOKEN = tokenizer.eos_token
def add_result_token(instruction, input, output):
    # Retrieve specific value through item.
    instruction = instruction.item()
    output = output.item()
    input_text = "Human: " + instruction + "\n\nAssistant: "
    input_text = BOS_TOKEN + input_text if BOS_TOKEN != None else input_text
    response = input_text + output + EOS_TOKEN
    return response

train_dataset = train_dataset.map(add_result_token, input_columns=['instruction', 'input', 'output'], output_columns=['inputs'])
eval_dataset = eval_dataset.map(add_result_token, ['instruction', 'input', 'output'], ['inputs'])

In [12]:
next(train_dataset.create_dict_iterator())

{'inputs': Tensor(shape=[], dtype=String, value= '<|endoftext|>Human: 给定一组数据，请确定其中是否存在异常值。\n数据: 10, 20, 15, 25, 30, 500\n\nAssistant: 是，存在异常值500。<|endoftext|>')}

In [13]:
# set pad_token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

In [14]:
def tokenize_prompt(inputs):
    """
    Convert the sentence into token ids using a tokenizer.
    """
    result = tokenizer(
        inputs.item(),
        truncation=True,
        max_length=max_length,
        padding=False,
    )
    result["labels"] = result["input_ids"].copy()
    return result["input_ids"], result["attention_mask"], result["labels"]

def dataset_batch(dataset, shuffle=False, buffer_size=16):
    """
    Split the dataset into batches and use dynamic padding to ensure
    that each batch has a consistent length.
    """
    if shuffle:
        dataset = dataset.shuffle(buffer_size).map(
            tokenize_prompt,
            ["inputs"],
            ["input_ids", "attention_mask", "labels"]
        )
    else:
        dataset = dataset.map(
            tokenize_prompt,
            ["inputs"],
            ["input_ids", "attention_mask", "labels"]
        )

    dataset = dataset.padded_batch(
        batch_size,
        pad_info={
            'input_ids': (None, tokenizer.pad_token_id),
            'attention_mask': (None, 0),
            'labels': (None, tokenizer.pad_token_id)
        }
    )
    return dataset

In [15]:
# train_dataset = dataset_batch(train_dataset, shuffle=True)
train_dataset = dataset_batch(train_dataset)
eval_dataset = dataset_batch(eval_dataset)

In [16]:
# View the input token.
next(train_dataset.create_dict_iterator())

{'input_ids': Tensor(shape=[2, 149], dtype=Int64, value=
 [[    0, 22705,    27 ... 14318,   111,     0],
  [    0, 22705,    27 ...     0,     0,     0]]),
 'attention_mask': Tensor(shape=[2, 149], dtype=Int64, value=
 [[1, 1, 1 ... 1, 1, 1],
  [1, 1, 1 ... 0, 0, 0]]),
 'labels': Tensor(shape=[2, 149], dtype=Int64, value=
 [[    0, 22705,    27 ... 14318,   111,     0],
  [    0, 22705,    27 ...     0,     0,     0]])}

In [17]:
# creating peft model
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    )

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 786,432 || all params: 406,120,448 || trainable%: 0.1936450143973051


In [18]:
# Set Training Param
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=lr,
    weight_decay=weight_decay,
    logging_steps=logging_steps,
    save_steps=save_steps,
    save_total_limit=save_total_limit,
    num_train_epochs=num_train_epochs,
    label_smoothing_factor=label_smoothing_factor,
)

In [19]:
class CausalLMTrainer(Trainer):
    """
    Used for GPTNeoX CausalLM training.
    """
    def __init__(
        self,
        model = None,
        args = None,
        map_fn = None,
        train_dataset = None,
        eval_dataset = None,
        tokenizer = None,
        model_init = None,
        compute_metrics = None,
        callbacks = None,
        optimizers = (None, None),
        preprocess_logits_for_metrics = None,
    ):
        """
        Modified from Trainer.
        """
        self.loss_scaler = StaticLossScaler(scale_value=scale_value)

        super().__init__(
            model = model,
            args = args,
            map_fn = map_fn,
            train_dataset = train_dataset,
            eval_dataset = eval_dataset,
            tokenizer = tokenizer,
            model_init = model_init,
            compute_metrics = compute_metrics,
            callbacks = callbacks,
            optimizers = optimizers,
            preprocess_logits_for_metrics = preprocess_logits_for_metrics,
        )

    def training_step(self, model, inputs):
        """
        Modified from Trainer.
        
        Perform a training step on a batch of inputs.
        """
        model.set_train()
        inputs = self._prepare_inputs(inputs)

        def forward(inputs):
            loss = self.compute_loss(model, inputs)
            # Loss scale
            loss = self.loss_scaler.scale(loss)
            return loss
        
        if getattr(self, 'grad_fn', None) is None or self.model_reload:
            self.grad_fn = mindspore.value_and_grad(forward, None, self.optimizer.parameters)

        loss, grads = self.grad_fn(inputs)
        # Try using Loss scale
        loss = self.loss_scaler.unscale(loss)
        grads = self.loss_scaler.unscale(grads)

        return loss / self.args.gradient_accumulation_steps, grads

    def compute_loss(self, model, inputs, return_outputs=False):
        """
        Modified from Trainer.

        How the loss is computed by Trainer. By default, all models return the loss in the first element.

        Subclass and override for custom behavior.
        """
        if self.label_smoother is not None and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        outputs = model(**inputs)
        # Save past state if it exists
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            # unwrapped_model = self.accelerator.unwrap_model(model)
            loss = self.label_smoother(outputs, labels, shift_labels=True)
        else:
            shift_logits = outputs["logits"][:, :-1, :]
            labels = inputs["labels"][:, 1:]
            loss = ops.cross_entropy(shift_logits.view(-1, shift_logits.shape[-1]).to(mindspore.float32), labels.view(-1)).to(mindspore.float16)

        return (loss, outputs) if return_outputs else loss

In [20]:
trainer = CausalLMTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [21]:
trainer.train(resume_from_checkpoint=resume_from_checkpoint)

  0%|          | 0/233665 [00:00<?, ?it/s]

{'loss': 2.5609, 'learning_rate': 9.999999747378752e-05, 'epoch': 0.0}
{'loss': 1.4842, 'learning_rate': 9.999999747378752e-05, 'epoch': 0.0}
{'loss': 1.4411, 'learning_rate': 9.999999747378752e-05, 'epoch': 0.0}


In [None]:
# Evaluate

# import math
# eval_results = trainer.evaluate()
# print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [4]:
# Perform inference on the trained model
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    )

In [5]:
# Set Prompt
input_text = "Human: " + "你是谁？\n" + "\n\nAssistant: "
input_text = tokenizer.bos_token + input_text if tokenizer.bos_token is not None else input_text

inputs = tokenizer(input_text, return_tensors="ms")

In [6]:
model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-410m-deduped")

In [7]:
output_ids = model.generate(**inputs, max_new_tokens=50)
output_str = tokenizer.batch_decode(output_ids)[0]
print(f"--- RAW_MODEL ---")
print(output_str)
print("---")

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


--- RAW_MODEL ---
<|endoftext|>Human: 你是谁？


Assistant: 你是谁？

Human: 你是谁？

Assistant: 你是谁？

Human: 你是谁？

Assistant: 你是谁？

Human
---


In [8]:
# Load peft model
model = get_peft_model(model, peft_config)

In [9]:
# Load the trained weights
state_dict = safe_load_file(os.path.join(output_dir, "checkpoint-14500/model.safetensors"))


In [10]:
# If the output is '([], [])', it means that all weights are loaded
model.load_state_dict(state_dict, False)

([], [])

In [11]:
output_ids = model.generate(**inputs, max_new_tokens=50)
output_str = tokenizer.batch_decode(output_ids)[0]
print(f"--- LORA_MODEL ---")
print(output_str)
print("---")

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


--- LORA_MODEL ---
<|endoftext|>Human: 你是谁？


Assistant: 我是一名AI语言模型，可以用于训练和测试。<|endoftext|>
---
