In [None]:
# install mindnlp to finetune bert.
!pip install git+https://github.com/mindspore-lab/mindnlp/

In [1]:
import os

import mindspore
from mindspore.dataset import text, GeneratorDataset, transforms
from mindspore import nn

from mindnlp.transforms import PadTransform
from mindnlp.models import BertModel, BertConfig
from mindnlp.transforms.tokenizers import GPTTokenizer, BertTokenizer

from mindnlp.engine import Trainer, Evaluator
from mindnlp.engine.callbacks import CheckpointCallback, BestModelCallback
from mindnlp.metrics import Accuracy

ModuleNotFoundError: No module named 'mindspore'

In [2]:
import re
import six
import string
import tarfile

class SentimentDataset():
    """IMDB数据集加载器

    加载IMDB数据集并处理为一个Python迭代对象。

    """
    label_map = {
        "pos": 1,
        "neg": 0
    }
    def __init__(self, path, mode="train"):
        self.mode = mode
        self.path = path
        self.docs, self.labels = [], []

        self._load("pos")
        self._load("neg")

    def _load(self, label):
        pattern = re.compile(r"aclImdb/{}/{}/.*\.txt$".format(self.mode, label))
        # 将数据加载至内存
        with tarfile.open(self.path) as tarf:
            tf = tarf.next()
            while tf is not None:
                if bool(pattern.match(tf.name)):
                    # 对文本进行分词、去除标点和特殊字符、小写处理
                    self.docs.append(str(tarf.extractfile(tf).read().rstrip(six.b("\n\r"))
                                         .translate(None, six.b(string.punctuation)).lower()).split())
                    self.labels.append([self.label_map[label]])
                tf = tarf.next()

    def __getitem__(self, idx):
        return self.labels[idx][0], self.docs[idx]

    def __len__(self):
        return len(self.docs)


In [3]:
import os
import shutil
import requests
import tempfile
from tqdm import tqdm
from typing import IO
from pathlib import Path

# 指定保存路径为 `./mindspore_examples`
cache_dir = './mindspore_examples'

def http_get(url: str, temp_file: IO):
    """使用requests库下载数据，并使用tqdm库进行流程可视化"""
    req = requests.get(url, stream=True)
    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None
    progress = tqdm(unit='B', total=total)
    for chunk in req.iter_content(chunk_size=1024):
        if chunk:
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()

def download(file_name: str, url: str):
    """下载数据并存为指定名称"""
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    cache_path = os.path.join(cache_dir, file_name)
    cache_exist = os.path.exists(cache_path)
    if not cache_exist:
        with tempfile.NamedTemporaryFile() as temp_file:
            http_get(url, temp_file)
            temp_file.flush()
            temp_file.seek(0)
            with open(cache_path, 'wb') as cache_file:
                shutil.copyfileobj(temp_file, cache_file)
    return cache_path


In [4]:
imdb_path = download('aclImdb_v1.tar.gz', 'https://mindspore-website.obs.myhuaweicloud.com/notebook/datasets/aclImdb_v1.tar.gz')
imdb_path

'./mindspore_examples/aclImdb_v1.tar.gz'

In [76]:
test_data = SentimentDataset(imdb_path, "test")
for label, text in test_data:
    print(f"label = {label}")
    print(f"text = {text}")
    break

label = 1
text = ["b'previous", 'reviewer', 'claudio', 'carvalho', 'gave', 'a', 'much', 'better', 'recap', 'of', 'the', 'films', 'plot', 'details', 'than', 'i', 'could', 'what', 'i', 'recall', 'mostly', 'is', 'that', 'it', 'was', 'just', 'so', 'beautiful', 'in', 'every', 'sense', 'emotionally', 'visually', 'editorially', 'just', 'gorgeousbr', 'br', 'if', 'you', 'like', 'movies', 'that', 'are', 'wonderful', 'to', 'look', 'at', 'and', 'also', 'have', 'emotional', 'content', 'to', 'which', 'that', 'beauty', 'is', 'relevant', 'i', 'think', 'you', 'will', 'be', 'glad', 'to', 'have', 'seen', 'this', 'extraordinary', 'and', 'unusual', 'work', 'of', 'artbr', 'br', 'on', 'a', 'scale', 'of', '1', 'to', '10', 'id', 'give', 'it', 'about', 'an', '875', 'the', 'only', 'reason', 'i', 'shy', 'away', 'from', '9', 'is', 'that', 'it', 'is', 'a', 'mood', 'piece', 'if', 'you', 'are', 'in', 'the', 'mood', 'for', 'a', 'really', 'artistic', 'very', 'romantic', 'film', 'then', 'its', 'a', '10', 'i', 'definitel

In [None]:
column_names = ["label", "text_a"]
dataset_train = GeneratorDataset(source = SentimentDataset(imdb_path, "train"), column_names=column_names, shuffle=False)
dataset_train, dataset_valid = dataset_train.split([0.7, 0.3])
dataset_test = GeneratorDataset(source = SentimentDataset(imdb_path, "test"), column_names=column_names, shuffle=False)

In [5]:
def process_dataset(dataset, tokenizer, pad_value, max_seq_len=500, shuffle=True):
    rename_columns = ["label", "input_ids"]
    
    # shuffle
    # transforms
    pad_op = PadTransform(max_seq_len, pad_value=pad_value)
    type_cast_op = transforms.TypeCast(mindspore.int32)
    
    # map dataset
    dataset = dataset.map(operations=[tokenizer, pad_op], input_columns="text_a")
    dataset = dataset.map(operations=[type_cast_op], input_columns="label")
    # rename dataset
    dataset = dataset.rename(input_columns=column_names, output_columns=rename_columns)

    return dataset

In [26]:
def process_dataset(source, tokenizer, pad_value, max_seq_len=64, batch_size=32, shuffle=True):
    column_names = ["label", "text_a"]
    rename_columns = ["label", "input_ids"]
    
    dataset = GeneratorDataset(source, column_names=column_names, shuffle=shuffle)
    # transforms
    pad_op = PadTransform(max_seq_len, pad_value=pad_value)
    type_cast_op = transforms.TypeCast(mindspore.int32)
    
    # map dataset
    dataset = dataset.map(operations=[tokenizer, pad_op], input_columns="text_a")
    dataset = dataset.map(operations=[type_cast_op], input_columns="label")
    # rename dataset
    dataset = dataset.rename(input_columns=column_names, output_columns=rename_columns)
    # batch dataset
    dataset = dataset.batch(batch_size)

    return dataset

In [6]:
# tokenizer
gpt_tokenizer = GPTTokenizer.from_pretrained('openai-gpt')

# add sepcial token: <PAD>
special_tokens_dict = {"pad_token": "<pad>"}
num_added_toks = gpt_tokenizer.add_special_tokens(special_tokens_dict)
pad_value = gpt_tokenizer.token_to_id("<pad>")

In [7]:
dataset_train = process_dataset(SentimentDataset(imdb_path, "train"), gpt_tokenizer, pad_value)
# split train dataset into train and valid datasets
dataset_train, dataset_valid = dataset_train.split([0.7, 0.3])
dataset_test = process_dataset(SentimentDataset(imdb_path, "test"), gpt_tokenizer, pad_value, shuffle=False)



In [24]:
# batch dataset
batch_size = 32
dataset_train = dataset_train.batch(batch_size)
dataset_valid = dataset_valid.batch(batch_size)
dataset_test = dataset_test.batch(batch_size)

In [29]:
# show dataset
for label, input_ids in dataset_test.create_tuple_iterator():
    print(f"label = {label}")
    print(f"input ids = {input_ids}")
    break

label = [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
input ids = [[  293   244   259 ...   256   249   256]
 [  293   244   259 ...   256  1532  3414]
 [  293   244   259 ...   256   987   256]
 ...
 [  293   244   259 ...   256 23880   256]
 [  293   244   259 ...   256 16574   256]
 [  293   244   259 ...   256  9469   256]]


In [33]:
from mindnlp.models import GPTForSequenceClassification
from mindnlp._legacy.amp import auto_mixed_precision

# set bert config and define parameters for training
model = GPTForSequenceClassification.from_pretrained('openai-gpt', num_labels=2)
model.pad_token_id = pad_value
model = auto_mixed_precision(model, 'O1')

loss = nn.CrossEntropyLoss()
optimizer = nn.Adam(model.trainable_params(), learning_rate=2e-5)

metric = Accuracy()

# define callbacks to save checkpoints
ckpoint_cb = CheckpointCallback(save_path='checkpoint', ckpt_name='sentiment_model', epochs=1, keep_checkpoint_max=2)
best_model_cb = BestModelCallback(save_path='checkpoint', auto_load=True)

trainer = Trainer(network=model, train_dataset=dataset_train,
                  eval_dataset=dataset_valid, metrics=metric,
                  epochs=10, loss_fn=loss, optimizer=optimizer, callbacks=[ckpoint_cb, best_model_cb],
                  jit=True)



In [34]:
# start training
trainer.run(tgt_columns="label")

The train will start from the checkpoint saved in 'checkpoint'.


Epoch 0:   0%|          | 0/547 [01:48<?, ?it/s]


TypeError: Multiply values for specific argument: axis

----------------------------------------------------
- The Traceback of Net Construct Code:
----------------------------------------------------
The function call stack (See file '/home/ma-user/work/rank_0/om/analyze_fail.dat' for more details. Get instructions about `analyze_fail.dat` at https://www.mindspore.cn/search?inputValue=analyze_fail.dat):
# 0 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindnlp/engine/trainer.py(143)
            if check_gradients:
# 1 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindnlp/engine/trainer.py(141)
            (loss, _), grads = grad_fn(inputs, labels)
                               ^
# 2 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindspore/ops/functional.py(454)
        if grad_position is None:
# 3 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindspore/ops/functional.py(453)
        res = aux_fn(*args)
              ^
# 4 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindspore/ops/functional.py(435)
        outputs = fn(*args)
                  ^
# 5 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindnlp/engine/trainer.py(112)
            logits = network(*inputs)
                     ^
# 6 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindnlp/models/gpt/gpt.py(551)
        if self.pad_token_id is None and batch_size != 1:
# 7 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindnlp/engine/trainer.py(112)
            logits = network(*inputs)
                     ^
# 8 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindnlp/models/gpt/gpt.py(576)
        if loss is not None:
# 9 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindnlp/models/gpt/gpt.py(566)
        output = (pooled_logits,) + transformer_outputs[1:]
                                    ^
# 10 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindnlp/models/gpt/gpt.py(302)
        if input_ids is not None and inputs_embeds is not None:
# 11 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindnlp/models/gpt/gpt.py(304)
        if input_ids is not None:
# 12 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindnlp/models/gpt/gpt.py(533)
        transformer_outputs = self.transformer(
                              ^
# 13 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindnlp/models/gpt/gpt.py(340)
        for i, block in enumerate(self.h):
# 14 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindnlp/models/gpt/gpt.py(533)
        transformer_outputs = self.transformer(
                              ^
# 15 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindnlp/models/gpt/gpt.py(344)
            outputs = block(hidden_states, attention_mask, head_mask[i])
                      ^
# 16 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindnlp/models/gpt/gpt.py(214)
        output_attn = self.attn(
# 17 In file /home/ma-user/anaconda3/envs/MindSpore/lib/python3.7/site-packages/mindnlp/models/gpt/gpt.py(185)
        query, key, value = ops.split(x, self.split_size, axis=2)
                            ^

----------------------------------------------------
- C++ Call Stack: (For framework developers)
----------------------------------------------------
mindspore/core/ir/func_graph_extends.cc:153 GenerateKwParams


In [None]:
evaluator = Evaluator(network=model, eval_dataset=dataset_test, metrics=metric)
evaluator.run(tgt_columns="label")