In [1]:
import mindspore as ms
import mindspore.nn as nn
from mindspore import ops
from mindspore.dataset import GeneratorDataset
import numpy as np
from mindnlp.engine import Trainer, TrainingArguments
from mindnlp.transformers import (
    BloomForCausalLM, 
    BloomConfig, 
    BloomTokenizerFast,
)
from mindnlp.peft import LoraConfig, get_peft_model
import gc
from datasets import load_dataset

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)


In [12]:
# 设置运行模式和设备
ms.set_context(mode=ms.PYNATIVE_MODE, device_target="Ascend")

# 基本配置
MODEL = "bigscience/bloom-3b"
DATASET = "databricks/databricks-dolly-15k"
TOKENS = 20
EPOCHS = 10
BATCH_SIZE = 4

In [3]:
# 加载模型和tokenizer
tokenizer = BloomTokenizerFast.from_pretrained(MODEL)
config = BloomConfig.from_pretrained(MODEL)

In [4]:
# 加载基础模型
base_model = BloomForCausalLM.from_pretrained(
    MODEL
)

# 配置LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# 将基础模型包装为LoRA模型
model = get_peft_model(base_model, lora_config)

BloomForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`.`PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [5]:
# 确保有pad_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [6]:
# 加载数据集
dataset = load_dataset(DATASET)

In [7]:
def format_prompt(sample):
    instruction = f"### Instruction\n{sample['instruction']}"
    context = f"### Context\n{sample['context']}" if len(sample["context"]) > 0 else None
    response = f"### Answer\n{sample['response']}"
    prompt = "\n\n".join([i for i in [instruction, context, response] if i is not None])
    sample["prompt"] = prompt
    return sample

# 处理数据集
dataset = dataset.map(format_prompt)
dataset = dataset.remove_columns(['instruction', 'context', 'response', 'category'])

train_samples = dataset["train"].select(range(0,40))
eval_samples = dataset["train"].select(range(40,50))

In [8]:
class TextDataset:
    def __init__(self, data):
        self.data = data

    def __getitem__(self, index):
        index = int(index)
        text = self.data[index]["prompt"]
        inputs = tokenizer(text, padding='max_length', max_length=256, truncation=True)
        
        # 使用相同的input_ids作为labels
        return (
            inputs["input_ids"], 
            inputs["attention_mask"],
            inputs["input_ids"]  # 添加labels
        )

    def __len__(self):
        return len(self.data)

In [9]:
# 创建数据集
train_dataset = GeneratorDataset(
    TextDataset(train_samples),
    column_names=["input_ids", "attention_mask", "labels"],  # 添加labels
    shuffle=True
)
eval_dataset = GeneratorDataset(
    TextDataset(eval_samples),
    column_names=["input_ids", "attention_mask", "labels"],  # 添加labels
    shuffle=False
)

In [13]:
# 定义训练参数
training_args = TrainingArguments(
    output_dir='./work',
    auto_find_batch_size=True,
    learning_rate=2e-4,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    fp16=True,  # 启用混合精度训练
)

# 创建trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=None
)

In [14]:
# 训练模型
trainer.train()

  0%|          | 0/100 [00:00<?, ?it/s]

Using `past_key_values` as a tuple is deprecated. Please use an appropriate `Cache` class


{'loss': 8.5685, 'learning_rate': 0.00018, 'epoch': 1.0}
/

  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 6.447186470031738, 'eval_runtime': 8.8455, 'eval_samples_per_second': 0.339, 'eval_steps_per_second': 0.113, 'epoch': 1.0}
{'loss': 6.7008, 'learning_rate': 0.00016, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 5.485293388366699, 'eval_runtime': 0.4395, 'eval_samples_per_second': 6.826, 'eval_steps_per_second': 2.275, 'epoch': 2.0}
{'loss': 4.3543, 'learning_rate': 0.00014, 'epoch': 3.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.417534351348877, 'eval_runtime': 0.474, 'eval_samples_per_second': 6.329, 'eval_steps_per_second': 2.11, 'epoch': 3.0}
{'loss': 2.5833, 'learning_rate': 0.00012, 'epoch': 4.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.719986915588379, 'eval_runtime': 0.4308, 'eval_samples_per_second': 6.964, 'eval_steps_per_second': 2.321, 'epoch': 4.0}
{'loss': 1.7503, 'learning_rate': 0.0001, 'epoch': 5.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.5181431770324707, 'eval_runtime': 0.4391, 'eval_samples_per_second': 6.832, 'eval_steps_per_second': 2.277, 'epoch': 5.0}
{'loss': 1.4438, 'learning_rate': 8e-05, 'epoch': 6.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.516507148742676, 'eval_runtime': 0.4309, 'eval_samples_per_second': 6.962, 'eval_steps_per_second': 2.321, 'epoch': 6.0}
{'loss': 1.3124, 'learning_rate': 6e-05, 'epoch': 7.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.4947118759155273, 'eval_runtime': 0.4491, 'eval_samples_per_second': 6.68, 'eval_steps_per_second': 2.227, 'epoch': 7.0}
{'loss': 1.2436, 'learning_rate': 4e-05, 'epoch': 8.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.664604425430298, 'eval_runtime': 0.4324, 'eval_samples_per_second': 6.937, 'eval_steps_per_second': 2.312, 'epoch': 8.0}
{'loss': 1.205, 'learning_rate': 2e-05, 'epoch': 9.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.468752145767212, 'eval_runtime': 0.4321, 'eval_samples_per_second': 6.943, 'eval_steps_per_second': 2.314, 'epoch': 9.0}
{'loss': 1.1878, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.4858415126800537, 'eval_runtime': 0.4329, 'eval_samples_per_second': 6.93, 'eval_steps_per_second': 2.31, 'epoch': 10.0}
{'train_runtime': 132.5653, 'train_samples_per_second': 3.017, 'train_steps_per_second': 0.754, 'train_loss': 3.0349820709228514, 'epoch': 10.0}


TrainOutput(global_step=100, training_loss=3.0349820709228514, metrics={'train_runtime': 132.5653, 'train_samples_per_second': 3.017, 'train_steps_per_second': 0.754, 'train_loss': 3.0349820709228514, 'epoch': 10.0})

In [15]:
log_history = trainer.state.log_history

In [16]:
# 绘制损失曲线
training_losses = [log["loss"] for log in log_history if "loss" in log]
validation_losses = [log["eval_loss"] for log in log_history if "eval_loss" in log]
epochs = range(1, len(training_losses) + 1)

In [17]:
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.plot(epochs, training_losses, 'b-', label='Training Loss')
plt.plot(epochs, validation_losses, 'r-', label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.savefig('training_loss.png')
plt.close()

In [18]:
# 生成文本函数
def generate_text(input_text):
    input_tokens = tokenizer(input_text, return_tensors="ms")
    outputs = model.generate(
        input_ids=input_tokens["input_ids"],
        attention_mask=input_tokens["attention_mask"],
        max_new_tokens=TOKENS,
        repetition_penalty=1.5,
        eos_token_id=tokenizer.eos_token_id
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [19]:
input_words = "What is the meaning of life?"
generated_text = generate_text(input_words)
print(generated_text)

['What is the meaning of life? What does it mean to be human and what makes us special from other animals, plants or even rocks']


In [20]:
# 保存模型
trainer.save_model("./lora_weights")

In [21]:
# 清理内存
del model
del trainer
del train_dataset
del eval_samples
gc.collect()

4191