# 基于MindSpore通过GPT实现情感分类

设置pip源为清华源，或者其他任何一个可用的源

In [1]:
# Make the Tsinghua PyPI mirror the default pip index for this environment.
!pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

Writing to /home/mindspore/.config/pip/pip.conf


安装mindspore和mindnlp等包

In [2]:
%%capture captured_output
# 实验环境已经预装了mindspore==2.3.0，如需更换mindspore版本，可更改下面 MINDSPORE_VERSION 变量
!pip uninstall mindspore -y
!export MINDSPORE_VERSION=2.3.1
!pip install https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MINDSPORE_VERSION}/MindSpore/unified/aarch64/mindspore-${MINDSPORE_VERSION}-cp39-cp39-linux_aarch64.whl --trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com -i https://pypi.mirrors.ustc.edu.cn/simple

In [3]:
# Install the MindNLP daily build; once 0.4.0 is officially released, the plain
# `pip install mindnlp==0.4.0` line below can be used instead.
!pip install https://repo.mindspore.cn/mindspore-lab/mindnlp/newest/any/mindnlp-0.4.0-py3-none-any.whl
# !pip install mindnlp==0.4.0
!pip install jieba
# Point HF_ENDPOINT at the hf-mirror.com mirror for this kernel session.
%env HF_ENDPOINT=https://hf-mirror.com

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting mindnlp==0.4.0
  Using cached https://repo.mindspore.cn/mindspore-lab/mindnlp/newest/any/mindnlp-0.4.0-py3-none-any.whl (7.6 MB)
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
env: HF_ENDPOINT=https://hf-mirror.com


如果import出错，请检查mindnlp与mindspore的版本。本例在 Python 3.9 + MindSpore 2.3.1 + MindNLP 0.4.0（2024.8.16 的 daily 包）环境下可以完整运行。

In [4]:
import os

import mindspore
from mindspore.dataset import text, GeneratorDataset, transforms
from mindspore import nn

from mindnlp.dataset import load_dataset

from mindnlp.engine import Trainer

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.049 seconds.
Prefix dict has been built successfully.


载入与处理数据集

In [5]:
# Load the IMDB train and test splits; the result is indexed by split name below.
imdb_ds = load_dataset('imdb', split=['train', 'test'])
imdb_train, imdb_test = imdb_ds['train'], imdb_ds['test']

In [6]:
# Show the size reported for the training split before it is divided further.
imdb_train.get_dataset_size()

25000

In [7]:
import numpy as np

def process_dataset(dataset, tokenizer, max_seq_len=512, batch_size=4, shuffle=False,
                    shuffle_buffer_size=None):
    """Tokenize, cast and batch a text-classification dataset.

    Args:
        dataset: Dataset with a ``text`` and a ``label`` column.
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask``; its ``pad_token_id`` is used for padding.
        max_seq_len: Maximum token length; longer samples are truncated.
        batch_size: Number of samples per batch.
        shuffle: Whether to shuffle samples before tokenization.
        shuffle_buffer_size: Size of the shuffle buffer. ``None`` (default)
            uses the dataset size, so the whole dataset is shuffled rather
            than only a ``batch_size``-wide window. Pass a smaller value to
            bound memory on very large datasets.

    Returns:
        The batched dataset with ``input_ids``, ``attention_mask`` and
        ``labels`` columns.
    """
    is_ascend = mindspore.get_context('device_target') == 'Ascend'

    # On Ascend every sample is padded to max_seq_len at tokenization time
    # (presumably to keep tensor shapes static); on other targets padding is
    # deferred to padded_batch so each batch is only as long as it needs to be.
    tokenizer_kwargs = {'truncation': True, 'max_length': max_seq_len}
    if is_ascend:
        tokenizer_kwargs['padding'] = 'max_length'

    # The argument is deliberately not called `text`: that name is the
    # mindspore.dataset.text module imported at the top of the notebook.
    def tokenize(sample_text):
        tokenized = tokenizer(sample_text, **tokenizer_kwargs)
        return tokenized['input_ids'], tokenized['attention_mask']

    if shuffle:
        if shuffle_buffer_size is None:
            # The shuffle buffer must hold at least 2 rows.
            shuffle_buffer_size = max(2, dataset.get_dataset_size())
        dataset = dataset.shuffle(shuffle_buffer_size)

    # map dataset
    dataset = dataset.map(operations=[tokenize], input_columns="text", output_columns=['input_ids', 'attention_mask'])
    dataset = dataset.map(operations=transforms.TypeCast(mindspore.int32), input_columns="label", output_columns="labels")
    # batch dataset
    if is_ascend:
        dataset = dataset.batch(batch_size)
    else:
        dataset = dataset.padded_batch(batch_size, pad_info={'input_ids': (None, tokenizer.pad_token_id),
                                                             'attention_mask': (None, 0)})

    return dataset

载入tokenizer并添加token，用于标记开始（bos）、结束（eos）与填充（pad）。

In [8]:
from mindnlp.transformers import GPTTokenizer

# Load the pretrained tokenizer for the 'openai-gpt' checkpoint.
gpt_tokenizer = GPTTokenizer.from_pretrained('openai-gpt')

# Register the special tokens (bos / eos / pad) and keep the count that
# add_special_tokens returns.
special_tokens_dict = dict(
    bos_token="<bos>",
    eos_token="<eos>",
    pad_token="<pad>",
)
num_added_toks = gpt_tokenizer.add_special_tokens(special_tokens_dict)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'OpenAIGPTTokenizer'. 
The class this function is called from is 'GPTTokenizer'.
ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


由于原数据集没有验证集，故将原训练集重新划分为训练集与验证集

In [9]:
# Split from the untouched 'train' entry of imdb_ds instead of from imdb_train itself:
# rebinding imdb_train to its own 70% slice would shrink it again every time this cell is re-run.
imdb_train, imdb_val = imdb_ds['train'].split([0.7, 0.3])

In [10]:
# Only the training pipeline is shuffled; validation and test keep their order.
dataset_train = process_dataset(imdb_train, gpt_tokenizer, shuffle=True)
dataset_val, dataset_test = (
    process_dataset(split, gpt_tokenizer) for split in (imdb_val, imdb_test)
)

In [11]:
# Pull the first batch from the training pipeline to inspect the tensors it yields.
next(dataset_train.create_tuple_iterator())

[Tensor(shape=[4, 512], dtype=Int64, value=
 [[  655,   668,  1532 ... 40480, 40480, 40480],
  [  244,  5713,  9173 ...   260,  2903, 26989],
  [  244,   481,  3080 ... 40480, 40480, 40480],
  [  616,   544,   246 ... 40480, 40480, 40480]]),
 Tensor(shape=[4, 512], dtype=Int64, value=
 [[1, 1, 1 ... 0, 0, 0],
  [1, 1, 1 ... 1, 1, 1],
  [1, 1, 1 ... 0, 0, 0],
  [1, 1, 1 ... 0, 0, 0]]),
 Tensor(shape=[4], dtype=Int32, value= [0, 1, 1, 1])]

加载GPT序列分类模型，设置为二分类

In [12]:
from mindnlp.transformers import GPTForSequenceClassification

# Load the pretrained 'openai-gpt' checkpoint with a two-label classification head.
model = GPTForSequenceClassification.from_pretrained('openai-gpt', num_labels=2)
# Tell the model which token id the tokenizer uses for padding.
model.config.pad_token_id = gpt_tokenizer.pad_token_id
# Grow the embedding table by exactly the number of special tokens added to the
# tokenizer, instead of a hard-coded 3 that would drift if the token set changes.
model.resize_token_embeddings(model.config.vocab_size + num_added_toks)



[MS_ALLOC_CONF]Runtime config:  enable_vmm:True  vmm_align_size:2MB


Some weights of GPTForSequenceClassification were not initialized from the model checkpoint at openai-gpt and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<mindnlp.core.nn.modules.sparse.Embedding at 0xfffec0c31340>

使用TrainingArguments定义训练参数

In [13]:
from mindnlp.engine import TrainingArguments

# One-epoch fine-tuning run: logging, evaluation and checkpoint saving are all
# set to the "epoch" strategy, and load_best_model_at_end is enabled.
training_args = TrainingArguments(
    num_train_epochs=1.0,
    output_dir="gpt_imdb_finetune",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)


设置训练与评估指标

In [14]:
from mindnlp import evaluate
import numpy as np

# Accuracy metric object used by compute_metrics below.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions
        compared against ``labels``.
    """
    logits, labels = eval_pred
    return metric.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
    )


In [15]:

# Bundle the model, training arguments, datasets and metric callback into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
)


In [None]:
# Start fine-tuning with the trainer configured above; the returned training
# summary is displayed as the cell output.
trainer.train()

  0%|          | 0/4375 [00:00<?, ?it/s]

{'loss': 0.4192, 'learning_rate': 0.0, 'epoch': 1.0}
|

  0%|          | 0/1875 [00:00<?, ?it/s]

{'eval_loss': 0.31375083327293396, 'eval_accuracy': 0.9249333333333334, 'eval_runtime': 127.0515, 'eval_samples_per_second': 14.758, 'eval_steps_per_second': 1.85, 'epoch': 1.0}
{'train_runtime': 1420.832, 'train_samples_per_second': 12.317, 'train_steps_per_second': 3.079, 'train_loss': 0.41919162946428573, 'epoch': 1.0}


TrainOutput(global_step=4375, training_loss=0.41919162946428573, metrics={'train_runtime': 1420.832, 'train_samples_per_second': 12.317, 'train_steps_per_second': 3.079, 'train_loss': 0.41919162946428573, 'epoch': 1.0})

In [17]:
# Evaluate the trained model on the test pipeline (dataset_test) rather than the validation set.
trainer.evaluate(dataset_test)

  0%|          | 0/6250 [00:00<?, ?it/s]

{'eval_loss': 0.308649480342865,
 'eval_accuracy': 0.925,
 'eval_runtime': 417.9591,
 'eval_samples_per_second': 14.954,
 'eval_steps_per_second': 1.871,
 'epoch': 1.0}