In [1]:
import os

import mindspore
from mindspore.dataset import text, GeneratorDataset, transforms
from mindspore import nn

from mindnlp import load_dataset
from mindnlp.transforms import PadTransform, GPTTokenizer

from mindnlp.engine import Trainer, Evaluator
from mindnlp.engine.callbacks import CheckpointCallback, BestModelCallback
from mindnlp.metrics import Accuracy

[ERROR] ME(19606:140553655732032,MainProcess):2023-05-06-18:07:00.195.685 [mindspore/run_check/_check_version.py:226] Cuda ['10.1', '11.1', '11.6'] version(libcu*.so need by mindspore-gpu) is not found. Please confirm that the path of cuda is set to the env LD_LIBRARY_PATH, or check whether the CUDA version in wheel package and the CUDA runtime in current device matches. Please refer to the installation guidelines: https://www.mindspore.cn/install
[ERROR] ME(19606:140553655732032,MainProcess):2023-05-06-18:07:00.218.403 [mindspore/run_check/_check_version.py:226] Cuda ['10.1', '11.1', '11.6'] version(libcudnn*.so need by mindspore-gpu) is not found. Please confirm that the path of cuda is set to the env LD_LIBRARY_PATH, or check whether the CUDA version in wheel package and the CUDA runtime in current device matches. Please refer to the installation guidelines: https://www.mindspore.cn/install
  from tqdm.autonotebook import tqdm


In [2]:
# Load the IMDB dataset; the result is unpacked into a training split and a test split.
# shuffle=False is passed to the loader here; the training pipeline requests
# shuffling later, when the splits are preprocessed.
imdb_train, imdb_test = load_dataset('imdb', shuffle=False)

In [3]:
import numpy as np

def process_dataset(dataset, tokenizer, max_seq_len=256, batch_size=32, shuffle=False,
                    shuffle_buffer_size=None):
    """Tokenize, frame, pad/truncate, rename and batch a text-classification dataset.

    Every sample's "text" column is passed through ``tokenizer`` and then framed
    as ``[bos] + tokens + [eos]``. Token sequences that do not fit are truncated
    so that the framed result is exactly ``max_seq_len`` long; shorter ones are
    right-padded with ``tokenizer.pad_token_id`` up to ``max_seq_len``.

    Args:
        dataset: Object exposing ``shuffle``, ``map``, ``rename`` and ``batch``
            (presumably a MindSpore dataset with "text" and "label" columns).
        tokenizer: Callable mapping a text sample to a 1-D sequence of token ids
            and exposing ``bos_token_id``, ``eos_token_id`` and ``pad_token_id``.
        max_seq_len: Length of every produced id sequence, including the BOS and
            EOS markers. Must be at least 2.
        batch_size: Number of samples per batch.
        shuffle: Whether to shuffle the dataset before mapping.
        shuffle_buffer_size: Buffer size handed to ``dataset.shuffle``. ``None``
            (the default) keeps the historical behaviour of using ``batch_size``;
            pass a larger value for a more thorough shuffle.

    Returns:
        The dataset after shuffle (optional), map, rename and batch, with the
        "text" column renamed to "input_ids".

    Raises:
        ValueError: If ``max_seq_len`` is smaller than 2, i.e. there is no room
            for the BOS and EOS markers.
    """
    # With max_seq_len < 2 the slice bound below would go negative and silently
    # yield variable-length sequences, so reject it up front.
    if max_seq_len < 2:
        raise ValueError(
            f"max_seq_len must be >= 2 to hold the BOS and EOS tokens, got {max_seq_len}"
        )

    # Number of real tokens that fit between the BOS and EOS markers.
    token_budget = max_seq_len - 2

    def pad_sample(token_ids):
        """Frame ``token_ids`` with BOS/EOS and fit the result to ``max_seq_len``."""
        kept = token_ids[:token_budget]
        pieces = [
            np.array([tokenizer.bos_token_id]),
            kept,
            np.array([tokenizer.eos_token_id]),
        ]
        pad_len = token_budget - len(kept)
        if pad_len > 0:
            pieces.append(np.array([tokenizer.pad_token_id] * pad_len))
        return np.concatenate(pieces)

    column_names = ["text", "label"]
    rename_columns = ["input_ids", "label"]

    if shuffle:
        buffer_size = batch_size if shuffle_buffer_size is None else shuffle_buffer_size
        dataset = dataset.shuffle(buffer_size)

    # map dataset: tokenize first, then frame/pad each sample
    dataset = dataset.map(operations=[tokenizer, pad_sample], input_columns="text")
    # rename dataset
    dataset = dataset.rename(input_columns=column_names, output_columns=rename_columns)
    # batch dataset
    dataset = dataset.batch(batch_size)

    return dataset

In [4]:
# Load the tokenizer associated with the pretrained 'openai-gpt' checkpoint.
gpt_tokenizer = GPTTokenizer.from_pretrained('openai-gpt')

# Register the special tokens used when framing and padding samples:
# <bos>, <eos> and <pad>.
special_tokens_dict = {
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "pad_token": "<pad>",
}
# NOTE(review): the return value is presumably the number of tokens actually
# added to the vocabulary - confirm against the tokenizer's documentation.
num_added_toks = gpt_tokenizer.add_special_tokens(special_tokens_dict)

In [5]:
# Split the training set into train (70%) and validation (30%) subsets.
# NOTE(review): `imdb_train` is rebound to the smaller subset here, so re-running
# this cell on its own would split the already-reduced data again; re-run the
# dataset-loading cell first.
imdb_train, imdb_val = imdb_train.split([0.7, 0.3])

In [6]:
# Build the three input pipelines with the shared GPT tokenizer.
# Only the training split is shuffled; validation and test keep their order.
dataset_train = process_dataset(dataset=imdb_train, tokenizer=gpt_tokenizer, shuffle=True)
dataset_val = process_dataset(dataset=imdb_val, tokenizer=gpt_tokenizer)
dataset_test = process_dataset(dataset=imdb_test, tokenizer=gpt_tokenizer)

In [7]:
from mindnlp.models import GPTForSequenceClassification
from mindnlp._legacy.amp import auto_mixed_precision

# Load the pretrained GPT model with a 2-class classification head and
# configure it for fine-tuning.
model = GPTForSequenceClassification.from_pretrained('openai-gpt', num_labels=2)
model.pad_token_id = gpt_tokenizer.pad_token_id
# Grow the embedding table by one row per special token registered on the
# tokenizer. The count is taken from `special_tokens_dict` so it stays in sync
# with the tokens defined there instead of being a hard-coded constant.
model.resize_token_embeddings(model.config.vocab_size + len(special_tokens_dict))
# Apply mixed precision at level 'O1'.
model = auto_mixed_precision(model, 'O1')

loss = nn.CrossEntropyLoss()
optimizer = nn.Adam(model.trainable_params(), learning_rate=2e-5)

metric = Accuracy()

# define callbacks to save checkpoints:
# - a per-epoch checkpoint (at most 2 kept)
# - the best model so far, loaded back automatically (auto_load=True)
ckpoint_cb = CheckpointCallback(save_path='checkpoint', ckpt_name='sentiment_model', epochs=1, keep_checkpoint_max=2)
best_model_cb = BestModelCallback(save_path='checkpoint', auto_load=True)

trainer = Trainer(network=model, train_dataset=dataset_train,
                  eval_dataset=dataset_val, metrics=metric,
                  epochs=3, loss_fn=loss, optimizer=optimizer, callbacks=[ckpoint_cb, best_model_cb],
                  jit=True)



In [8]:
# Start training; "label" is passed as the target column of each batch.
trainer.run(tgt_columns="label")

The train will start from the checkpoint saved in 'checkpoint'.


  0%|          | 0/547 [00:00<?, ?it/s]

Checkpoint: 'sentiment_model_epoch_0.ckpt' has been saved in epoch: 0.


  0%|          | 0/235 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.9098666666666667}
---------------Best Model: 'best_so_far.ckpt' has been saved in epoch: 0.---------------


  0%|          | 0/547 [00:00<?, ?it/s]

Checkpoint: 'sentiment_model_epoch_1.ckpt' has been saved in epoch: 1.


  0%|          | 0/235 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.9397333333333333}
---------------Best Model: 'best_so_far.ckpt' has been saved in epoch: 1.---------------


  0%|          | 0/547 [00:00<?, ?it/s]

The maximum number of stored checkpoints has been reached.
Checkpoint: 'sentiment_model_epoch_2.ckpt' has been saved in epoch: 2.


  0%|          | 0/235 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.9598666666666666}
---------------Best Model: 'best_so_far.ckpt' has been saved in epoch: 2.---------------
Loading best model from 'checkpoint' with '['Accuracy']': [0.9598666666666666]...
---------------The model is already load the best model from 'best_so_far.ckpt'.---------------


In [9]:
# Evaluate the trained model on the held-out test pipeline, using "label" as the
# target column.
# NOTE(review): this reuses the same `metric` instance that was passed to the
# Trainer - presumably it is reset before evaluation; confirm.
evaluator = Evaluator(network=model, eval_dataset=dataset_test, metrics=metric)
evaluator.run(tgt_columns="label")

  0%|          | 0/782 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.91828}
