In [1]:
import mindspore
from mindspore import nn
from mindspore.common.initializer import initializer

import mindnlp
from mindnlp import Vocab
from mindspore.dataset.transforms import PadEnd
from mindnlp.transforms import BasicTokenizer, PadTransform, Lookup
from mindnlp.modules import Glove, StaticLSTM
from mindnlp.metrics import accuracy_fn

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the 'imdb' dataset through mindnlp, requesting the train and test
# splits with shuffling enabled.
imdb_ds = mindnlp.load_dataset('imdb', split=['train', 'test'], shuffle=True)
# The returned object is indexed by split name.
imdb_train, imdb_test = imdb_ds['train'], imdb_ds['test']

Using the latest cached version of the module from /home/daiyuxin/.cache/huggingface/modules/datasets_modules/datasets/imdb/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0 (last modified on Sun Oct  8 16:06:39 2023) since it couldn't be found locally at imdb., or remotely on the Hugging Face Hub.
Found cached dataset imdb (/data1/lvyufeng/mindnlp/examples/classification/.mindnlp/datasets/imdb/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 419.56it/s]


In [3]:
# Tokenizer applied to the raw review text, configured with lower_case=True.
# NOTE(review): presumably lower-casing is needed to match the GloVe 6B
# vocabulary -- confirm.
tokenizer = BasicTokenizer(lower_case=True)

In [4]:
# Vocabulary that ships with the pretrained "glove.6B.100d" vectors.
vocab = Vocab.from_pretrained(name="glove.6B.100d")
# Dataset transform built from that vocabulary; '<unk>' is passed as the
# unknown-token name (presumably used for out-of-vocabulary words -- confirm
# against the Lookup documentation).
lookup_op = Lookup(vocab, unk_token='<unk>')

In [5]:
# Target sequence length handed to PadTransform.
max_length = 256
# Padding transform; the fill value is the vocabulary id of '<pad>'.
# NOTE(review): presumably sequences longer than max_length are truncated as
# well -- confirm against the PadTransform documentation.
pad_op = PadTransform(max_length, pad_value=vocab('<pad>'))

In [6]:
# Run the same preprocessing chain (tokenize -> vocabulary lookup -> pad) over
# the 'text' column of both datasets.
# NOTE: the names are rebound, so re-running this cell on an already-mapped
# dataset would apply the transforms a second time.
imdb_train = imdb_train.map([tokenizer, lookup_op, pad_op], 'text')
imdb_test = imdb_test.map([tokenizer, lookup_op, pad_op], 'text')

In [7]:
# Number of samples per batch, shared by the train and test datasets.
batch_size = 64

# Group samples into batches; the names are rebound to the batched datasets.
imdb_train = imdb_train.batch(batch_size)
imdb_test = imdb_test.batch(batch_size)

In [8]:
# Carve a validation set out of the training data with a 70/30 split.
# This runs after batching, so the fractions apply to batches rather than to
# individual samples (the progress bars below report 274 and 117 batches).
imdb_train, imdb_valid = imdb_train.split([0.7, 0.3])



In [9]:
import math
import mindspore as ms
import mindspore.nn as nn
import mindspore.ops as ops
from mindspore.common.initializer import Uniform, HeUniform

class RNN(nn.Module):
    """LSTM sentence classifier: embedding -> (stacked) LSTM -> dense head.

    Args:
        embedding: Embedding layer applied to the input ids. Its
            ``_embed_dim`` attribute is read to size the LSTM input.
        hidden_dim (int): Hidden size of the LSTM (per direction).
        output_dim (int): Number of output units of the dense head.
        n_layers (int): Number of stacked LSTM layers.
        bidirectional (bool): Whether the LSTM runs in both directions. The
            dense head and the hidden-state selection follow this flag.
    """

    def __init__(self, embedding, hidden_dim, output_dim, n_layers,
                 bidirectional):
        super().__init__()
        embedding_dim = embedding._embed_dim
        self.embedding = embedding
        # Remembered so construct() can pick the matching final hidden state(s).
        self.bidirectional = bidirectional
        self.rnn = StaticLSTM(embedding_dim,
                              hidden_dim,
                              num_layers=n_layers,
                              bidirectional=bidirectional,
                              batch_first=True,
                              dropout=0.5)
        # The head sees one hidden vector per direction, concatenated.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Dense(hidden_dim * num_directions, output_dim)

    def construct(self, inputs):
        """Return the dense-head output for a batch of input ids.

        Args:
            inputs: Batch of token ids fed to the embedding layer.

        Returns:
            Output of the dense head applied to the final LSTM hidden state.
        """
        embedded = self.embedding(inputs)
        _, (hidden, _) = self.rnn(embedded)
        # NOTE(review): assumes the usual LSTM layout where the first axis of
        # `hidden` is (num_layers * num_directions), so the last two entries
        # are the top layer's forward and backward states -- confirm for
        # StaticLSTM.
        if self.bidirectional:
            hidden = ops.concat((hidden[-2, :, :], hidden[-1, :, :]), axis=1)
        else:
            # Unidirectional: only the top layer's final state feeds the head.
            hidden = hidden[-1, :, :]
        output = self.fc(hidden)
        return output

In [10]:
# Load the pretrained GloVe ('6B', 100-dimensional) embedding layer, requesting
# "<unk>" and "<pad>" as special tokens. Only the embedding is assigned here;
# the matching vocabulary is loaded separately in an earlier cell.
embedding = Glove.from_pretrained('6B', 100, special_tokens=["<unk>", "<pad>"])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 822M/822M [02:38<00:00, 5.43MB/s]


In [11]:
# Model / optimisation hyper-parameters.
hidden_size = 256      # LSTM hidden size
output_size = 2        # number of output units of the classifier head
num_layers = 2         # stacked LSTM layers
bidirectional = True   # forwarded to the LSTM
lr = 5e-4              # Adam learning rate

# Build the classifier, its loss and its optimiser. The optimiser is created
# over model.trainable_params() as they exist at this point.
model = RNN(embedding, hidden_size, output_size, num_layers, bidirectional)
loss_fn = nn.CrossEntropyLoss()
optimizer = nn.Adam(model.trainable_params(), learning_rate=lr)

In [12]:
def initialize_weights(m):
    """Re-initialise the parameters of a single layer in place.

    * ``nn.Dense``: weight <- 'xavier_normal', bias <- 'zeros'.
    * ``StaticLSTM``: every parameter whose name contains 'bias' <- 'zeros';
      otherwise, if the name contains 'weight' <- 'orthogonal'.

    Any other layer type is left untouched.

    Args:
        m: The layer to (possibly) re-initialise.
    """
    def _reset(param, scheme):
        # Overwrite the parameter's data, keeping its shape and dtype.
        param.set_data(initializer(scheme, param.shape, param.dtype))

    if isinstance(m, nn.Dense):
        _reset(m.weight, 'xavier_normal')
        _reset(m.bias, 'zeros')
        return

    if not isinstance(m, StaticLSTM):
        return

    for name, param in m.parameters_and_names():
        # 'bias' takes precedence over 'weight' when a name contains both.
        if 'bias' in name:
            _reset(param, 'zeros')
        elif 'weight' in name:
            _reset(param, 'orthogonal')

In [13]:
# Run initialize_weights over the model (presumably visiting every sub-layer --
# confirm against the `apply` documentation). The call's return value is what
# this cell displays.
model.apply(initialize_weights)

RNN<
  (embedding): Glove<
    (dropout_layer): Dropout<>
    >
  (rnn): StaticLSTM<
    (rnn): MultiLayerRNN<
      (cell_list): CellList<
        (0): SingleLSTMLayer_GPU<>
        (1): SingleLSTMLayer_GPU<>
        >
      (dropout): Dropout<p=0.5>
      >
    >
  (fc): Dense<input_channels=512, output_channels=2, has_bias=True>
  >

In [14]:
def forward_fn(data, label):
    """Return the loss of the module-level ``model`` on one batch.

    Args:
        data: Batch of model inputs.
        label: Matching batch of targets, passed to ``loss_fn``.
    """
    return loss_fn(model(data), label)

# Build a function that returns forward_fn's loss together with its gradients.
# The second argument (None) is passed in the position of the input-gradient
# selector, and the gradients are taken with respect to optimizer.parameters.
grad_fn = ms.value_and_grad(forward_fn, None, optimizer.parameters)

def train_step(data, label):
    """Run one optimisation step on a single batch and return its loss.

    Args:
        data: Batch of model inputs.
        label: Matching batch of targets.
    """
    step_loss, gradients = grad_fn(data, label)
    # Hand the gradients to the module-level optimiser to update the weights.
    optimizer(gradients)
    return step_loss

def train_one_epoch(model, train_dataset, epoch=0):
    """Train for one pass over ``train_dataset`` and report the mean loss.

    Note that ``model`` is only used to switch the network into training
    mode; the actual update goes through the module-level ``train_step``,
    which works on the module-level model and optimizer.

    Args:
        model: Network to put into training mode.
        train_dataset: Dataset yielding ``(data, label)`` batches.
        epoch (int): Epoch number shown in the progress-bar description.

    Returns:
        The mean per-batch training loss of this epoch, or ``nan`` when the
        dataset produced no batches.
    """
    model.set_train()
    total = train_dataset.get_dataset_size()
    loss_total = 0
    step_total = 0
    with tqdm(total=total) as t:
        t.set_description('Epoch %i' % epoch)
        for data, label in train_dataset.create_tuple_iterator():
            # Labels are cast to int32 before being passed to the loss.
            loss = train_step(data, label.astype(mindspore.int32))
            loss_total += loss.asnumpy()
            step_total += 1
            # Show the running mean loss next to the progress bar.
            t.set_postfix(loss=loss_total/step_total)
            t.update(1)

    # Hand the epoch's mean loss back to the caller (mirrors evaluate());
    # guard against an empty dataset instead of dividing by zero.
    return loss_total / step_total if step_total else float('nan')


In [15]:
def evaluate(model, test_dataset, criterion, epoch=0):
    """Evaluate ``model`` on ``test_dataset`` and return the mean batch loss.

    Running loss and accuracy are displayed on the progress bar; only the
    loss is returned.

    Args:
        model: Network to evaluate; it is switched out of training mode.
        test_dataset: Dataset whose batches hold inputs at index 0 and labels
            at index 1.
        criterion: Loss function called as ``criterion(predictions, labels)``.
        epoch (int): Epoch number shown in the progress-bar description.

    Returns:
        Accumulated loss divided by the dataset size (number of batches
        reported by ``get_dataset_size``).
    """
    num_batches = test_dataset.get_dataset_size()
    running_loss = 0
    running_acc = 0
    batches_seen = 0
    # Switch the network out of training mode for evaluation.
    model.set_train(False)

    with tqdm(total=num_batches) as progress:
        progress.set_description('Epoch %i' % epoch)
        for batch in test_dataset.create_tuple_iterator():
            data, label = batch[0], batch[1]
            logits = model(data)

            # The loss takes int32 labels; accuracy_fn gets them unchanged.
            batch_loss = criterion(logits, label.astype(mindspore.int32))
            running_loss += batch_loss.asnumpy()
            running_acc += accuracy_fn(logits, label)

            batches_seen += 1
            progress.set_postfix(loss=running_loss/batches_seen, acc=running_acc/batches_seen)
            progress.update(1)

    return running_loss / num_batches

In [16]:
# Number of passes over the training data, and the best validation loss seen
# so far (starts at +inf so the first epoch always counts as an improvement).
num_epochs = 5
best_valid_loss = float('inf')

for epoch in range(num_epochs):
    # One training pass followed by an evaluation on the validation split.
    train_one_epoch(model, imdb_train, epoch)
    valid_loss = evaluate(model, imdb_valid, loss_fn, epoch)

    # Checkpoint only when the validation loss improves; the same file is
    # overwritten each time.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        ms.save_checkpoint(model, './sentiment_analysis.ckpt')

Epoch 0: 100%|████████████████████████████████████████████████████████████████████████████████████| 274/274 [00:18<00:00, 14.58it/s, loss=0.654]
Epoch 0: 100%|█████████████████████████████████████████████████████████████████████████| 117/117 [00:06<00:00, 18.92it/s, acc=0.692, loss=0.586]
Epoch 1: 100%|████████████████████████████████████████████████████████████████████████████████████| 274/274 [00:14<00:00, 18.60it/s, loss=0.641]
Epoch 1: 100%|█████████████████████████████████████████████████████████████████████████| 117/117 [00:07<00:00, 15.82it/s, acc=0.696, loss=0.607]
Epoch 2: 100%|█████████████████████████████████████████████████████████████████████████████████████| 274/274 [00:25<00:00, 10.96it/s, loss=0.58]
Epoch 2: 100%|█████████████████████████████████████████████████████████████████████████| 117/117 [00:06<00:00, 19.08it/s, acc=0.628, loss=0.641]
Epoch 3: 100%|████████████████████████████████████████████████████████████████████████████████████| 274/274 [00:14<00:00, 18.76it/