In [1]:
import os

import mindspore
from mindspore.dataset import text, GeneratorDataset, transforms
from mindspore import nn

from mindnlp.transforms import PadTransform
from mindnlp.transforms.tokenizers import GPTTokenizer

from mindnlp.engine import Trainer, Evaluator
from mindnlp.engine.callbacks import CheckpointCallback, BestModelCallback
from mindnlp.metrics import Accuracy

[ERROR] ME(45742:139774497265472,MainProcess):2023-05-06-03:06:22.469.759 [mindspore/run_check/_check_version.py:226] Cuda ['10.1', '11.1', '11.6'] version(libcu*.so need by mindspore-gpu) is not found. Please confirm that the path of cuda is set to the env LD_LIBRARY_PATH, or check whether the CUDA version in wheel package and the CUDA runtime in current device matches. Please refer to the installation guidelines: https://www.mindspore.cn/install
[ERROR] ME(45742:139774497265472,MainProcess):2023-05-06-03:06:22.494.321 [mindspore/run_check/_check_version.py:226] Cuda ['10.1', '11.1', '11.6'] version(libcudnn*.so need by mindspore-gpu) is not found. Please confirm that the path of cuda is set to the env LD_LIBRARY_PATH, or check whether the CUDA version in wheel package and the CUDA runtime in current device matches. Please refer to the installation guidelines: https://www.mindspore.cn/install
  from tqdm.autonotebook import tqdm


In [2]:
import re
import six
import string
import tarfile

class SentimentDataset():
    """IMDB dataset loader.

    Reads every review of one split (``mode``) from the aclImdb tar archive
    into memory and exposes the result as an indexable Python object that
    yields ``(label, tokens)`` pairs.

    Args:
        path: Path to the aclImdb tar archive (opened with ``tarfile.open``).
        mode: Sub-directory of ``aclImdb/`` to read, e.g. ``"train"`` or ``"test"``.

    """
    label_map = {
        "pos": 1,
        "neg": 0
    }
    # Translation table that deletes every ASCII punctuation character.
    _strip_punctuation = str.maketrans("", "", string.punctuation)

    def __init__(self, path, mode="train"):
        self.mode = mode
        self.path = path
        self.docs, self.labels = [], []

        # Positive reviews are always stored before negative ones.
        self._load("pos")
        self._load("neg")

    def _load(self, label):
        """Append all reviews stored under ``aclImdb/<mode>/<label>/`` to the dataset."""
        pattern = re.compile(r"aclImdb/{}/{}/.*\.txt$".format(self.mode, label))
        # Load the data into memory.
        with tarfile.open(self.path) as tarf:
            for member in tarf:
                # extractfile() returns None for non-regular members, so skip them.
                if not (member.isfile() and pattern.match(member.name)):
                    continue
                with tarf.extractfile(member) as review_file:
                    raw = review_file.read()
                # Decode the bytes instead of calling str() on them: str(b"...")
                # yields the "b'...'" repr, which glues "b'" onto the first token,
                # "'" onto the last one and turns non-ASCII bytes into "\\xNN" text.
                text = raw.decode("utf-8", errors="replace")
                # Tokenize on whitespace after removing punctuation and lowercasing.
                tokens = text.rstrip("\n\r").translate(self._strip_punctuation).lower().split()
                self.docs.append(tokens)
                self.labels.append([self.label_map[label]])

    def __getitem__(self, idx):
        """Return ``(label, tokens)`` for the review at position ``idx``."""
        return self.labels[idx][0], self.docs[idx]

    def __len__(self):
        """Return the number of loaded reviews."""
        return len(self.docs)


In [3]:
import os
import shutil
import requests
import tempfile
from tqdm import tqdm
from typing import IO
from pathlib import Path

# Directory where downloaded files are saved: `./mindspore_examples`
cache_dir = './mindspore_examples'

def http_get(url: str, temp_file: IO):
    """Stream ``url`` into ``temp_file`` with requests, showing a tqdm progress bar.

    Args:
        url: Address of the resource to download.
        temp_file: Writable binary file object that receives the payload.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never written out as if it were the requested file.
    """
    # The context manager releases the connection even if writing fails midway.
    with requests.get(url, stream=True) as req:
        req.raise_for_status()
        content_length = req.headers.get('Content-Length')
        # Without a Content-Length header tqdm shows a counter without a total.
        total = int(content_length) if content_length is not None else None
        with tqdm(unit='B', total=total) as progress:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:
                    progress.update(len(chunk))
                    temp_file.write(chunk)

def download(file_name: str, url: str):
    """Download ``url`` into ``cache_dir`` as ``file_name`` unless it is already cached.

    Args:
        file_name: Name of the file inside ``cache_dir``.
        url: Address the file is fetched from when it is not cached yet.

    Returns:
        The path of the cached file (``cache_dir`` joined with ``file_name``).
    """
    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, file_name)
    if os.path.exists(cache_path):
        return cache_path

    # Download next to the final location and rename only once the transfer has
    # finished, so an interrupted run never leaves a truncated file that later
    # calls would mistake for a complete cache entry.
    partial_path = cache_path + '.part'
    try:
        with open(partial_path, 'wb') as partial_file:
            http_get(url, partial_file)
        os.replace(partial_path, cache_path)
    except BaseException:
        # Remove the incomplete download before propagating the error.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    return cache_path


In [4]:
# Fetch the IMDB archive via download() and display the local path it was stored at.
imdb_path = download('aclImdb_v1.tar.gz', 'https://mindspore-website.obs.myhuaweicloud.com/notebook/datasets/aclImdb_v1.tar.gz')
imdb_path

'./mindspore_examples/aclImdb_v1.tar.gz'

In [5]:
def process_dataset(dataset, tokenizer, pad_value, max_seq_len=64, batch_size=32, shuffle=False):
    """Tokenize, pad, cast, rename and batch a dataset with "label" and "text" columns.

    Args:
        dataset: Dataset exposing the columns "label" and "text".
        tokenizer: Operation applied to the "text" column before padding.
        pad_value: Value handed to ``PadTransform`` as ``pad_value``.
        max_seq_len: Length handed to ``PadTransform``.
        batch_size: Batch size passed to ``dataset.batch``.
        shuffle: When true, the dataset is shuffled with a buffer size of 32 first.

    Returns:
        The batched dataset whose "text" column has been renamed to "input_ids".
    """
    if shuffle:
        dataset = dataset.shuffle(32)

    # Per-column operation pipelines.
    text_ops = [tokenizer, PadTransform(max_seq_len, pad_value=pad_value)]
    label_ops = [transforms.TypeCast(mindspore.int32)]

    dataset = dataset.map(operations=text_ops, input_columns="text")
    dataset = dataset.map(operations=label_ops, input_columns="label")

    # "text" is exposed as "input_ids" after processing; "label" keeps its name.
    dataset = dataset.rename(input_columns=["label", "text"],
                             output_columns=["label", "input_ids"])

    return dataset.batch(batch_size)

In [6]:
# tokenizer: load the pretrained 'openai-gpt' tokenizer
gpt_tokenizer = GPTTokenizer.from_pretrained('openai-gpt')

# add special token: <pad>, then look up its id so it can be used as the padding value
special_tokens_dict = {"pad_token": "<pad>"}
num_added_toks = gpt_tokenizer.add_special_tokens(special_tokens_dict)
pad_value = gpt_tokenizer.token_to_id("<pad>")

In [7]:
# build the training dataset from the "train" part of the archive
dataset_train = GeneratorDataset(SentimentDataset(imdb_path, "train"), column_names=["label", "text"], shuffle=False)
# split train dataset into train and valid datasets (70% / 30%)
dataset_train, dataset_val = dataset_train.split([0.7, 0.3])
# build the test dataset from the "test" part of the archive
dataset_test = GeneratorDataset(SentimentDataset(imdb_path, "test"), column_names=["label", "text"], shuffle=False)

In [8]:
# Run every split through the same preprocessing; only the training split is shuffled.
# NOTE(review): the names are rebound in place, so re-running this cell would feed
# already-processed datasets back into process_dataset — re-run the previous cell first.
dataset_train = process_dataset(dataset_train, gpt_tokenizer, pad_value, shuffle=True)
dataset_val = process_dataset(dataset_val, gpt_tokenizer, pad_value)
dataset_test = process_dataset(dataset_test, gpt_tokenizer, pad_value)

In [9]:
from mindnlp.models import GPTForSequenceClassification
from mindnlp._legacy.amp import auto_mixed_precision

# load the pretrained GPT sequence classifier (2 labels) and define parameters for training
model = GPTForSequenceClassification.from_pretrained('openai-gpt', num_labels=2)
# tell the model which token id is used for padding
model.pad_token_id = pad_value
# wrap the model with auto_mixed_precision at level 'O1'
model = auto_mixed_precision(model, 'O1')

loss = nn.CrossEntropyLoss()
optimizer = nn.Adam(model.trainable_params(), learning_rate=2e-5)

metric = Accuracy()

# define callbacks to save checkpoints
ckpoint_cb = CheckpointCallback(save_path='checkpoint', ckpt_name='sentiment_model', epochs=1, keep_checkpoint_max=2)
best_model_cb = BestModelCallback(save_path='checkpoint', auto_load=True)

# train for 10 epochs, evaluating on the validation split with the Accuracy metric
trainer = Trainer(network=model, train_dataset=dataset_train,
                  eval_dataset=dataset_val, metrics=metric,
                  epochs=10, loss_fn=loss, optimizer=optimizer, callbacks=[ckpoint_cb, best_model_cb],
                  jit=True)



In [10]:
# start training; "label" is the target column passed to the loss
trainer.run(tgt_columns="label")

The train will start from the checkpoint saved in 'checkpoint'.


  0%|          | 0/547 [00:00<?, ?it/s]

Checkpoint: 'sentiment_model_epoch_0.ckpt' has been saved in epoch: 0.


  0%|          | 0/235 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.7513333333333333}
---------------Best Model: 'best_so_far.ckpt' has been saved in epoch: 0.---------------


  0%|          | 0/547 [00:00<?, ?it/s]

Checkpoint: 'sentiment_model_epoch_1.ckpt' has been saved in epoch: 1.


  0%|          | 0/235 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.7962666666666667}
---------------Best Model: 'best_so_far.ckpt' has been saved in epoch: 1.---------------


  0%|          | 0/547 [00:00<?, ?it/s]

The maximum number of stored checkpoints has been reached.
Checkpoint: 'sentiment_model_epoch_2.ckpt' has been saved in epoch: 2.


  0%|          | 0/235 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.8473333333333334}
---------------Best Model: 'best_so_far.ckpt' has been saved in epoch: 2.---------------


  0%|          | 0/547 [00:00<?, ?it/s]

The maximum number of stored checkpoints has been reached.
Checkpoint: 'sentiment_model_epoch_3.ckpt' has been saved in epoch: 3.


  0%|          | 0/235 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.87}
---------------Best Model: 'best_so_far.ckpt' has been saved in epoch: 3.---------------


  0%|          | 0/547 [00:00<?, ?it/s]

The maximum number of stored checkpoints has been reached.
Checkpoint: 'sentiment_model_epoch_4.ckpt' has been saved in epoch: 4.


  0%|          | 0/235 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.9198666666666667}
---------------Best Model: 'best_so_far.ckpt' has been saved in epoch: 4.---------------


  0%|          | 0/547 [00:00<?, ?it/s]

The maximum number of stored checkpoints has been reached.
Checkpoint: 'sentiment_model_epoch_5.ckpt' has been saved in epoch: 5.


  0%|          | 0/235 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.9454666666666667}
---------------Best Model: 'best_so_far.ckpt' has been saved in epoch: 5.---------------


  0%|          | 0/547 [00:00<?, ?it/s]

The maximum number of stored checkpoints has been reached.
Checkpoint: 'sentiment_model_epoch_6.ckpt' has been saved in epoch: 6.


  0%|          | 0/235 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.9653333333333334}
---------------Best Model: 'best_so_far.ckpt' has been saved in epoch: 6.---------------


  0%|          | 0/547 [00:00<?, ?it/s]

The maximum number of stored checkpoints has been reached.
Checkpoint: 'sentiment_model_epoch_7.ckpt' has been saved in epoch: 7.


  0%|          | 0/235 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.9757333333333333}
---------------Best Model: 'best_so_far.ckpt' has been saved in epoch: 7.---------------


  0%|          | 0/547 [00:00<?, ?it/s]

The maximum number of stored checkpoints has been reached.
Checkpoint: 'sentiment_model_epoch_8.ckpt' has been saved in epoch: 8.


  0%|          | 0/235 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.9892}
---------------Best Model: 'best_so_far.ckpt' has been saved in epoch: 8.---------------


  0%|          | 0/547 [00:00<?, ?it/s]

The maximum number of stored checkpoints has been reached.
Checkpoint: 'sentiment_model_epoch_9.ckpt' has been saved in epoch: 9.


  0%|          | 0/235 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.9937333333333334}
---------------Best Model: 'best_so_far.ckpt' has been saved in epoch: 9.---------------
Loading best model from 'checkpoint' with '['Accuracy']': [0.9937333333333334]...
---------------The model is already load the best model from 'best_so_far.ckpt'.---------------


In [11]:
# Evaluate the trained model on the held-out test dataset with the same Accuracy metric.
# NOTE(review): the recorded run reports 0.748 accuracy here versus 0.994 on the
# validation split in the last training epoch — the gap deserves investigation.
evaluator = Evaluator(network=model, eval_dataset=dataset_test, metrics=metric)
evaluator.run(tgt_columns="label")

  0%|          | 0/782 [00:00<?, ?it/s]

Evaluate Score: {'Accuracy': 0.74812}
