# 数据准备

## 基础下载模块

In [1]:
import os
import logging
import shutil
import requests
import tempfile
from tqdm import tqdm
from typing import IO
from pathlib import Path

# Directory under the user's home that holds downloaded files and checkpoints.
cache_dir = Path.home().joinpath('.mindspore_examples')

def http_get(url: str, temp_file: IO):
    """Stream the content at ``url`` into ``temp_file`` with a progress bar.

    Args:
        url: Address of the resource to download.
        temp_file: Writable binary file object that receives the bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # The context manager closes the response even if writing fails midway.
    with requests.get(url, stream=True) as req:
        # Fail fast instead of streaming an error page into the target file.
        req.raise_for_status()
        content_length = req.headers.get('Content-Length')
        # Without a Content-Length header the bar runs without a known total.
        total = int(content_length) if content_length is not None else None
        with tqdm(unit='B', total=total) as progress:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:  # ignore empty chunks
                    progress.update(len(chunk))
                    temp_file.write(chunk)

def download(file_name: str, url: str):
    """Download ``url`` into the cache directory unless it is already cached.

    The file is first written next to its final location under a ``.part``
    name and only renamed once complete, so an interrupted download never
    leaves a truncated file that later calls would mistake for a valid cache.

    Args:
        file_name: Name of the file inside ``cache_dir``.
        url: Address to fetch when the file is not cached yet.

    Returns:
        Path of the cached file as a string.
    """
    # exist_ok avoids a race between an existence check and the creation.
    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, file_name)
    if not os.path.exists(cache_path):
        partial_path = cache_path + '.part'
        try:
            with tempfile.NamedTemporaryFile() as temp_file:
                http_get(url, temp_file)
                temp_file.flush()
                temp_file.seek(0)
                logging.info(f"copying {temp_file.name} to cache at {cache_path}")
                with open(partial_path, 'wb') as cache_file:
                    shutil.copyfileobj(temp_file, cache_file)
            # Publish the finished file in a single rename step.
            os.replace(partial_path, cache_path)
        except BaseException:
            # Also covers KeyboardInterrupt: drop the incomplete copy, then re-raise.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise
    return cache_path

## 加载IMDB数据集

In [2]:
import re
import six
import string
import tarfile

class IMDBData():
    """In-memory IMDB sentiment split read directly from the ``aclImdb`` tarball.

    Each item is a ``(tokens, label)`` pair: ``tokens`` is the lower-cased,
    punctuation-free, whitespace-split review text and ``label`` is a
    one-element list holding 1 for positive and 0 for negative reviews.
    """
    # Maps the label directory name inside the archive to its class id.
    label_map = {
        "pos": 1,
        "neg": 0
    }

    def __init__(self, path, mode="train"):
        """Load all positive, then all negative reviews of one split.

        Args:
            path: Location of the IMDB tar archive.
            mode: Split directory inside the archive, e.g. "train" or "test".
        """
        self.mode = mode
        self.path = path
        self.docs, self.labels = [], []

        self._load("pos")
        self._load("neg")

    def _load(self, label):
        """Append every review stored under ``aclImdb/<mode>/<label>/``.

        Args:
            label: Key of ``label_map`` ("pos" or "neg") naming the directory.
        """
        pattern = re.compile(r"aclImdb/{}/{}/.*\.txt$".format(self.mode, label))
        strip_punctuation = str.maketrans("", "", string.punctuation)
        with tarfile.open(self.path) as tarf:
            for member in tarf:
                if not pattern.match(member.name):
                    continue
                raw = tarf.extractfile(member).read()
                # Decode instead of calling str() on the bytes: str(bytes)
                # yields the "b'...'" repr, which glues b' and ' onto the
                # first and last token and turns non-ASCII bytes into escapes.
                text = raw.decode("utf-8", errors="ignore")
                tokens = text.rstrip("\n\r").translate(strip_punctuation).lower().split()
                self.docs.append(tokens)
                self.labels.append([self.label_map[label]])

    def __getitem__(self, idx):
        """Return the ``(tokens, label)`` pair stored at position ``idx``."""
        return self.docs[idx], self.labels[idx]

    def __len__(self):
        """Return the number of loaded reviews."""
        return len(self.docs)

In [3]:
# Fetch the IMDB review archive; download() returns the cached copy's path.
imdb_path = download(
    file_name='aclImdb_v1.tar.gz',
    url='https://mindspore-website.obs.myhuaweicloud.com/notebook/datasets/aclImdb_v1.tar.gz',
)

In [4]:
# Read the training split of the archive into memory.
imdb_train = IMDBData(path=imdb_path, mode='train')

In [5]:
# Number of reviews loaded for the training split (positive plus negative).
len(imdb_train)

25000

In [6]:
import mindspore.dataset as dataset

def load_imdb(imdb_path):
    """Build MindSpore generator datasets for the IMDB train and test splits.

    Args:
        imdb_path: Location of the downloaded IMDB tar archive.

    Returns:
        Tuple ``(imdb_train, imdb_test)`` of ``GeneratorDataset`` objects with
        the columns "text" and "label".
    """
    imdb_extract_path = os.path.join(cache_dir, 'aclImdb')
    if not os.path.exists(imdb_extract_path):
        # The context manager closes the archive handle once unpacking is done.
        with tarfile.open(imdb_path) as tar:
            tar.extractall(path=cache_dir)
    # NOTE(review): IMDBData opens the archive at imdb_path itself, so the
    # extracted tree above is not read by the datasets built here.
    imdb_train = dataset.GeneratorDataset(IMDBData(imdb_path, "train"), column_names=["text", "label"])
    imdb_test = dataset.GeneratorDataset(IMDBData(imdb_path, "test"), column_names=["text", "label"])
    return imdb_train, imdb_test

In [7]:
# Wrap both splits as MindSpore datasets (this rebinds imdb_train).
imdb_train, imdb_test = load_imdb(imdb_path=imdb_path)

## 下载预训练词向量

In [8]:
# Fetch the GloVe 6B archive; download() returns the cached copy's path.
glove_path = download(
    file_name='glove.6B.zip',
    url='https://mindspore-website.obs.myhuaweicloud.com/notebook/datasets/glove.6B.zip',
)

In [9]:
import zipfile
import numpy as np

def load_glove(glove_path):
    """Load the 100-dimensional GloVe vectors and build a matching vocabulary.

    Args:
        glove_path: Location of the downloaded ``glove.6B.zip`` archive.

    Returns:
        Tuple ``(vocab, embeddings)``: ``vocab`` is a ``dataset.text.Vocab``
        built from the GloVe tokens plus the special tokens "<unk>" and
        "<pad>"; ``embeddings`` is a float32 array holding one row per GloVe
        token followed by one row for "<unk>" and one for "<pad>".
    """
    glove_100d_path = os.path.join(cache_dir, 'glove.6B.100d.txt')
    if not os.path.exists(glove_100d_path):
        # The context manager closes the zip handle once extraction is done.
        with zipfile.ZipFile(glove_path) as glove_zip:
            glove_zip.extractall(cache_dir)

    embeddings = []
    tokens = []
    with open(glove_100d_path, encoding='utf-8') as gf:
        for glove in gf:
            # Each line is "<token> <v1> <v2> ..."; split off the token only.
            word, embedding = glove.split(maxsplit=1)
            tokens.append(word)
            embeddings.append(np.fromstring(embedding, dtype=np.float32, sep=' '))
    # add embeddings for <unk> (random) and <pad> (zeros), in that order
    embeddings.append(np.random.rand(100))
    embeddings.append(np.zeros((100,), np.float32))

    # special_first=False keeps the GloVe tokens ahead of the special tokens,
    # matching the order in which the two extra rows were appended above.
    vocab = dataset.text.Vocab.from_list(tokens, special_tokens=["<unk>", "<pad>"], special_first=False)
    embeddings = np.array(embeddings).astype(np.float32)
    return vocab, embeddings

In [10]:
# Build the vocabulary and the embedding matrix from the GloVe archive.
vocab, embeddings = load_glove(glove_path=glove_path)

In [11]:
# Sanity check: look up the id of one token and fetch its embedding row.
idx = vocab.tokens_to_ids('the')
embedding = embeddings[idx]
idx, embedding

(0,
 array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
        -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
         0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
        -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
         0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
        -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
         0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
         0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
        -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
        -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
        -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
        -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
        -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
        -1.2526  ,  0.071624,  0.7

## 数据集预处理

In [12]:
import mindspore
# Token -> id lookup, with '<unk>' configured as the unknown token.
lookup_op = dataset.text.Lookup(vocab, unknown_token='<unk>')
# PadEnd with a target shape of 500, filled with the '<pad>' id.
pad_token_id = vocab.tokens_to_ids('<pad>')
pad_op = dataset.transforms.c_transforms.PadEnd([500], pad_value=pad_token_id)
# Cast to float32; applied to the label column later in the notebook.
type_cast_op = dataset.transforms.c_transforms.TypeCast(mindspore.float32)

In [13]:
# Training data: run lookup and padding on the text column, then the
# float32 cast on the label column.
imdb_train = imdb_train.map(operations=[lookup_op, pad_op],  input_columns=['text'])
imdb_train = imdb_train.map(operations=[type_cast_op],  input_columns=['label'])

# Split the mapped training data 0.7 / 0.3 into train and validation parts.
imdb_train, imdb_valid = imdb_train.split([0.7, 0.3])

# The test split goes through the same text and label operations.
imdb_test = imdb_test.map(operations=[lookup_op, pad_op],  input_columns=['text'])
imdb_test = imdb_test.map(operations=[type_cast_op],  input_columns=['label'])



In [14]:
# Batch both parts with a batch size of 64 and drop_remainder enabled.
# NOTE: this rebinds the names, so re-running the cell batches them again.
imdb_train = imdb_train.batch(64, drop_remainder=True)
imdb_valid = imdb_valid.batch(64, drop_remainder=True)

In [15]:
import mindspore
import mindspore.nn as nn
import mindspore.numpy as mnp
import mindspore.ops as ops
from mindspore import Tensor

In [16]:
class RNN(nn.Cell):
    """LSTM sentiment classifier on top of a pre-trained embedding table.

    The final hidden state of the last LSTM layer (both directions when
    ``bidirectional`` is true, otherwise the single direction) is passed
    through dropout, a dense layer and a sigmoid.
    """
    def __init__(self, embeddings, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        """Build the layers.

        Args:
            embeddings: 2-D array (vocab_size, embedding_dim) used to
                initialise the embedding table.
            hidden_dim: Hidden size of the LSTM.
            output_dim: Size of the dense output.
            n_layers: Number of stacked LSTM layers.
            bidirectional: Whether the LSTM runs in both directions.
            dropout: Value passed as ``dropout`` to the LSTM; the standalone
                dropout layer receives ``1 - dropout``.
            pad_idx: Vocabulary id of the padding token.
        """
        super().__init__()
        vocab_size, embedding_dim = embeddings.shape
        self.embedding = nn.Embedding(vocab_size, embedding_dim, embedding_table=Tensor(embeddings), padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout,
                           batch_first=True)
        # Plain Python flag that selects the final-state layout in construct().
        self.bidirectional = bidirectional
        # Size the head from the actual number of directions instead of
        # always assuming two.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Dense(hidden_dim * num_directions, output_dim)
        # NOTE(review): the argument is 1 - dropout, which reads as a keep
        # probability; confirm against the installed MindSpore version.
        self.dropout = nn.Dropout(1 - dropout)
        self.sigmoid = ops.Sigmoid()

    def construct(self, inputs):
        """Return the sigmoid output for a batch of token-id sequences."""
        # inputs: (batch, seq_length)
        embedded = self.dropout(self.embedding(inputs))
        # embedded: (batch, seq_length, embedding_dim)
        outputs, (hidden, cell) = self.rnn(embedded)
        #outputs: (batch, seq_length, hidden_dim * num_directions)
        #hidden: (num_layers * num_directions, batch_size, hidden_dim)
        #cell: (num_layers * num_directions, batch_size, hidden_dim)
        if self.bidirectional:
            # Last layer: join the final states of both directions.
            last_hidden = mnp.concatenate((hidden[-2,:,:], hidden[-1,:,:]), axis = 1)
        else:
            # Single direction: the last entry is the last layer's final state.
            last_hidden = hidden[-1,:,:]
        hidden = self.dropout(last_hidden)
        output = self.fc(hidden)
        return self.sigmoid(output)

In [17]:
# Model hyper-parameters.
hidden_size = 256
output_size = 1
num_layers = 2
bidirectional = True
dropout = 0.5
pad_idx = vocab.tokens_to_ids('<pad>')

# Network, loss and optimizer wired into a single training-step cell.
net = RNN(
    embeddings=embeddings,
    hidden_dim=hidden_size,
    output_dim=output_size,
    n_layers=num_layers,
    bidirectional=bidirectional,
    dropout=dropout,
    pad_idx=pad_idx,
)
loss = nn.BCELoss(reduction='mean')
net_with_loss = nn.WithLossCell(net, loss)
optimizer = nn.Adam(net.trainable_params())
train_one_step = nn.TrainOneStepCell(net_with_loss, optimizer)

In [18]:
def train_one_epoch(model, train_dataset, epoch=0):
    """Run one pass over ``train_dataset`` and show the running mean loss.

    Args:
        model: Callable training step; called with the unpacked columns of
            each batch and expected to return the loss.
        train_dataset: Dataset providing ``get_dataset_size`` and
            ``create_tuple_iterator``.
        epoch: Epoch number shown in the progress-bar description.
    """
    model.set_train()
    num_batches = train_dataset.get_dataset_size()
    running_loss = 0
    with tqdm(total=num_batches) as bar:
        bar.set_description('Epoch %i' % epoch)
        batches = train_dataset.create_tuple_iterator()
        for step, batch in enumerate(batches, start=1):
            running_loss += model(*batch).asnumpy()
            # Mean loss over the batches processed so far.
            bar.set_postfix(loss=running_loss / step)
            bar.update(1)

In [19]:
def binary_accuracy(preds, y):
    """Fraction of predictions that match the labels after rounding.

    Args:
        preds: NumPy array of predicted probabilities.
        y: NumPy array of labels, compared element-wise with the rounded
            predictions.

    Returns:
        Number of matches divided by the length of the first axis, i.e. a
        ratio such as 0.8 rather than a count such as 8.
    """
    # Round each probability to the nearest integer class.
    hits = np.equal(np.round(preds), y).astype(np.float32)
    return hits.sum() / len(hits)

In [20]:
def evaluate(model, test_dataset, criterion, epoch=0):
    """Evaluate ``model`` on ``test_dataset`` and return the mean batch loss.

    Running loss and accuracy are shown in the progress bar; only the loss
    is returned.

    Args:
        model: Network called with the first column of each batch.
        test_dataset: Dataset providing ``get_dataset_size`` and
            ``create_tuple_iterator``; its second column holds the labels.
        criterion: Loss callable taking ``(predictions, labels)``.
        epoch: Epoch number shown in the progress-bar description.

    Returns:
        Sum of the batch losses divided by the dataset size in batches.
    """
    total = test_dataset.get_dataset_size()
    epoch_loss = 0
    epoch_acc = 0
    step_total = 0
    model.set_train(False)

    # A single iterator is created, directly in the loop below.
    with tqdm(total=total) as t:
        t.set_description('Epoch %i' % epoch)
        for i in test_dataset.create_tuple_iterator():
            predictions = model(i[0])
            loss = criterion(predictions, i[1])
            epoch_loss += loss.asnumpy()

            acc = binary_accuracy(predictions.asnumpy(), i[1].asnumpy())
            epoch_acc += acc

            step_total += 1
            t.set_postfix(loss=epoch_loss/step_total, acc=epoch_acc/step_total)
            t.update(1)

    return epoch_loss / total

In [21]:
from mindspore import save_checkpoint

num_epochs = 5
ckpt_file_name = os.path.join(cache_dir, 'sentiment-analysis.ckpt')
best_valid_loss = float('inf')

for epoch in range(num_epochs):
    train_one_epoch(train_one_step, imdb_train, epoch=epoch)
    valid_loss = evaluate(net, imdb_valid, criterion=loss, epoch=epoch)

    # Only a strictly lower validation loss triggers a new checkpoint.
    if not valid_loss < best_valid_loss:
        continue
    best_valid_loss = valid_loss
    save_checkpoint(net, ckpt_file_name)

Epoch 0: 100%|██████████████████████████████████████████| 273/273 [00:51<00:00,  5.28it/s, loss=0.675]
Epoch 0: 100%|███████████████████████████████| 117/117 [00:44<00:00,  2.65it/s, acc=0.572, loss=0.674]
Epoch 1: 100%|██████████████████████████████████████████| 273/273 [00:44<00:00,  6.14it/s, loss=0.642]
Epoch 1: 100%|███████████████████████████████| 117/117 [00:43<00:00,  2.71it/s, acc=0.703, loss=0.574]
Epoch 2: 100%|██████████████████████████████████████████| 273/273 [00:44<00:00,  6.13it/s, loss=0.473]
Epoch 2: 100%|███████████████████████████████| 117/117 [00:43<00:00,  2.72it/s, acc=0.844, loss=0.363]
Epoch 3: 100%|██████████████████████████████████████████| 273/273 [00:44<00:00,  6.14it/s, loss=0.332]
Epoch 3: 100%|███████████████████████████████| 117/117 [00:42<00:00,  2.73it/s, acc=0.901, loss=0.248]
Epoch 4: 100%|███████████████████████████████████████████| 273/273 [00:44<00:00,  6.10it/s, loss=0.28]
Epoch 4: 100%|███████████████████████████████| 117/117 [00:43<00:00,  2.7

## 加载Checkpoint

In [22]:
from mindspore import load_checkpoint, load_param_into_net

# Restore the checkpoint written during training (the epoch with the
# lowest validation loss) into the network.
param_dict = load_checkpoint(ckpt_file_name)
load_param_into_net(net, param_dict)

Please set a unique name for the parameter 'Parameter (name=Parameter, shape=(400002, 100), dtype=Float32, requires_grad=True)'.
Please set a unique name for the parameter 'Parameter (name=Parameter, shape=(1024, 100), dtype=Float32, requires_grad=True)'.
Please set a unique name for the parameter 'Parameter (name=Parameter, shape=(1024, 100), dtype=Float32, requires_grad=True)'.
Please set a unique name for the parameter 'Parameter (name=Parameter, shape=(1024, 512), dtype=Float32, requires_grad=True)'.
Please set a unique name for the parameter 'Parameter (name=Parameter, shape=(1024, 512), dtype=Float32, requires_grad=True)'.
Please set a unique name for the parameter 'Parameter (name=Parameter, shape=(1024, 256), dtype=Float32, requires_grad=True)'.
Please set a unique name for the parameter 'Parameter (name=Parameter, shape=(1024, 256), dtype=Float32, requires_grad=True)'.
Please set a unique name for the parameter 'Parameter (name=Parameter, shape=(1024, 256), dtype=Float32, requ

[]

In [23]:
# Batch the test split and report its loss with the restored network.
imdb_test = imdb_test.batch(64)
evaluate(net, imdb_test, criterion=loss)

Epoch 0: 100%|███████████████████████████████| 391/391 [00:35<00:00, 11.11it/s, acc=0.868, loss=0.337]


0.33658065644981305

## 自定义输入

In [24]:
def predict_sentiment(model, vocab, sentence):
    """Score the sentiment of a raw sentence with a trained model.

    The sentence is lower-cased, stripped of punctuation and split on
    whitespace, mirroring the preprocessing in ``IMDBData``; tokens missing
    from the vocabulary are mapped to "<unk>".

    Args:
        model: Trained network, called with an int32 id tensor that has a
            leading batch axis of size 1.
        vocab: Vocabulary providing ``tokens_to_ids`` and containing "<unk>".
        sentence: Text to classify.

    Returns:
        The model output converted to a NumPy array.

    Raises:
        ValueError: If the sentence contains no tokens.
    """
    model.set_train(False)
    strip_punctuation = str.maketrans('', '', string.punctuation)
    tokenized = sentence.lower().translate(strip_punctuation).split()
    if not tokenized:
        raise ValueError("sentence must contain at least one token")

    unk_idx = vocab.tokens_to_ids('<unk>')
    indexed = []
    for token in tokenized:
        # A single-token lookup returns a single id.
        token_idx = vocab.tokens_to_ids(token)
        # NOTE(review): MindSpore documents -1 for tokens missing from the
        # vocabulary; confirm for the installed version.
        if token_idx is None or token_idx < 0:
            token_idx = unk_idx
        indexed.append(token_idx)

    tensor = mindspore.Tensor(indexed, mindspore.int32)
    tensor = tensor.expand_dims(0)  # add the batch axis
    prediction = model(tensor)
    return prediction.asnumpy()

In [25]:
# Example with a negative wording.
predict_sentiment(net, vocab, sentence="This film is terrible")

array([[0.02964271]], dtype=float32)

In [26]:
# Example with a positive wording.
predict_sentiment(net, vocab, sentence="This film is great")

array([[0.9793879]], dtype=float32)