# 数据准备

## 基础下载模块

In [1]:
import os
import logging
import shutil
import requests
import tempfile
from tqdm import tqdm
from typing import IO
from pathlib import Path

# Directory under the user's home folder where downloaded archives and the
# data extracted from them are cached between notebook runs.
cache_dir = Path.home() / '.mindspore_examples'

def http_get(url: str, temp_file: IO):
    """Stream the body found at ``url`` into ``temp_file`` with a byte progress bar.

    Args:
        url: Address fetched with an HTTP GET request.
        temp_file: Writable binary file object that receives the response body.

    Raises:
        requests.HTTPError: If the server answers with an error status (4xx/5xx).
    """
    # The timeout keeps a stalled connection from hanging the notebook forever;
    # the context manager releases the connection even when writing fails midway.
    with requests.get(url, stream=True, timeout=60) as req:
        # Fail loudly on HTTP errors instead of storing an error page as the payload.
        req.raise_for_status()
        content_length = req.headers.get('Content-Length')
        # Without a Content-Length header the bar runs without a known total.
        total = int(content_length) if content_length is not None else None
        with tqdm(unit='B', total=total) as progress:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:  # ignore empty chunks
                    progress.update(len(chunk))
                    temp_file.write(chunk)

def download(file_name:str, url: str):
    """Download ``url`` into the cache directory unless it is already cached.

    The payload is first written to ``<cache_path>.part`` and only renamed to its
    final name once the transfer has finished, so an interrupted download never
    leaves a truncated file that a later call would mistake for a valid cache.

    Args:
        file_name: Name of the file inside ``cache_dir``.
        url: Address the file is fetched from when it is not cached yet.

    Returns:
        Path of the cached file as a string.
    """
    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, file_name)
    if not os.path.exists(cache_path):
        partial_path = cache_path + '.part'
        try:
            with open(partial_path, 'wb') as partial_file:
                http_get(url, partial_file)
            logging.info(f"moving {partial_path} to cache at {cache_path}")
            # The rename happens within one directory, so the cache entry
            # appears all at once or not at all.
            os.replace(partial_path, cache_path)
        except BaseException:
            # Also covers KeyboardInterrupt from an interrupted notebook cell.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise
    return cache_path

In [2]:
# Fetch the IMDB archive into the local cache; download() skips the request
# when the file is already present.
imdb_path = download('aclImdb_v1.tar.gz', 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz')

## 加载IMDB数据集

In [3]:
import tarfile
import mindspore.dataset as dataset

def load_imdb(imdb_path):
    """Extract the IMDB archive (once) and build the train and test datasets.

    Args:
        imdb_path: Path of the downloaded ``aclImdb`` tar archive.

    Returns:
        Tuple ``(imdb_train, imdb_test)`` of ``IMDBDataset`` objects; the train
        split is created with ``shuffle=True``, the test split with ``shuffle=False``.
    """
    imdb_extract_path = os.path.join(cache_dir, 'aclImdb')
    if not os.path.exists(imdb_extract_path):
        # The context manager closes the archive handle even if extraction fails.
        # NOTE(review): extractall trusts the member paths stored in the archive;
        # only use it on archives from a trusted source.
        with tarfile.open(imdb_path) as tar:
            tar.extractall(path=cache_dir)
    imdb_train = dataset.IMDBDataset(imdb_extract_path, usage="train", shuffle=True)
    imdb_test = dataset.IMDBDataset(imdb_extract_path, usage="test", shuffle=False)
    return imdb_train, imdb_test

In [4]:
# Extract the archive if necessary and create the train/test datasets.
imdb_train, imdb_test = load_imdb(imdb_path)

## 下载预训练词向量

In [5]:
# Fetch the GloVe 6B archive into the local cache (skipped when already cached).
glove_path = download('glove.6B.zip', 'https://nlp.stanford.edu/data/glove.6B.zip')

In [6]:
import zipfile
import numpy as np

def load_glove(glove_path):
    """Load the 100-dimensional GloVe vectors and build the matching vocabulary.

    The zip archive is extracted into ``cache_dir`` when ``glove.6B.100d.txt`` is
    not there yet. Two extra rows are appended after the GloVe rows: a random
    vector for ``<unk>`` and an all-zero vector for ``<pad>``.

    Args:
        glove_path: Path of the downloaded GloVe zip archive.

    Returns:
        Tuple ``(vocab, embeddings)``: the vocabulary built from the GloVe tokens
        plus the two special tokens (appended last), and a float32 array with
        one embedding row per vocabulary entry.
    """
    glove_100d_path = os.path.join(cache_dir, 'glove.6B.100d.txt')
    if not os.path.exists(glove_100d_path):
        # The context manager closes the archive handle even if extraction fails.
        with zipfile.ZipFile(glove_path) as glove_zip:
            glove_zip.extractall(cache_dir)

    embeddings = []
    tokens = []
    with open(glove_100d_path, encoding='utf-8') as gf:
        for glove in gf:
            # Each line is "<token> <v1> <v2> ...": split off the token only.
            word, embedding = glove.split(maxsplit=1)
            tokens.append(word)
            embeddings.append(np.fromstring(embedding, dtype=np.float32, sep=' '))
    # add embeddings for <unk> and <pad>
    embeddings.append(np.random.randn(100))
    embeddings.append(np.zeros((100,), np.float32))

    # special_first=False keeps the special tokens after the GloVe tokens, so
    # their ids line up with the two rows appended above.
    vocab = dataset.text.Vocab.from_list(tokens, special_tokens=["<unk>", "<pad>"], special_first=False)
    embeddings = np.array(embeddings).astype(np.float32)
    return vocab, embeddings

In [7]:
# Build the vocabulary and the embedding matrix (GloVe rows followed by the
# <unk> and <pad> rows).
vocab, embeddings = load_glove(glove_path)

In [8]:
# Sanity check: look up the id of a token and display its embedding row.
idx = vocab.tokens_to_ids('the')
embedding = embeddings[idx]
idx, embedding

(0,
 array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
        -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
         0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
        -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
         0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
        -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
         0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
         0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
        -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
        -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
        -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
        -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
        -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
        -1.2526  ,  0.071624,  0.7

## 数据集预处理

In [9]:
# !pip install -U spacy
# !python -m spacy download en_core_web_sm

In [10]:
import string
import six
import spacy
import mindspore

# Load the spaCy English pipeline used by tokenizer() below; the
# 'en_core_web_sm' model must be installed (see the commented commands above).
spacy_tokenizer = spacy.load('en_core_web_sm')

def tokenizer(line):
    """Split a raw review into lower-cased token strings.

    HTML line breaks (``<br />``) are replaced by a space before the text is
    lower-cased and handed to the spaCy pipeline.

    Args:
        line: Review text as a string.

    Returns:
        List with the text of every token produced by ``spacy_tokenizer``.
    """
    normalized = line.replace("<br />", ' ').lower()
    return [token.text for token in spacy_tokenizer(normalized)]

def expand_dim(label):
    """Return ``label`` as a NumPy array with one extra trailing axis of size 1."""
    as_array = np.array(label)
    return as_array[..., np.newaxis]

# Text ops: tokenize with tokenizer(), map tokens to vocabulary ids (unknown
# tokens fall back to '<unk>'), then PadEnd to shape [500] using the id of
# '<pad>' as the fill value.
tokenizer_op = dataset.text.PythonTokenizer(tokenizer)
lookup_op = dataset.text.Lookup(vocab, unknown_token='<unk>')
pad_op = dataset.transforms.c_transforms.PadEnd([500], pad_value=vocab.tokens_to_ids('<pad>'))
# Label op: cast to float32.
type_cast_op = dataset.transforms.c_transforms.TypeCast(mindspore.float32)

In [11]:
# Text column: raw review -> tokens -> vocabulary ids -> padded id sequence.
# Label column: cast to float32 and add a trailing axis of size 1.
imdb_train = imdb_train.map(operations=[tokenizer_op, lookup_op, pad_op],  input_columns=['text'])
imdb_train = imdb_train.map(operations=[type_cast_op, expand_dim],  input_columns=['label'])

# The test split gets the same text AND label transforms so that both splits
# yield identically typed and shaped columns.
imdb_test = imdb_test.map(operations=[tokenizer_op, lookup_op, pad_op],  input_columns=['text'])
imdb_test = imdb_test.map(operations=[type_cast_op, expand_dim],  input_columns=['label'])

In [12]:
import mindspore
import mindspore.nn as nn
import mindspore.numpy as mnp
import mindspore.ops as ops
from mindspore import Tensor

In [13]:
class RNN(nn.Cell):
    """LSTM sentiment classifier on top of a pretrained embedding table.

    Args:
        embeddings: 2-D array of shape (vocab_size, embedding_dim) used to
            initialise the embedding table.
        hidden_dim: Hidden size of the LSTM.
        output_dim: Number of output units of the final dense layer.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        dropout: Dropout value passed to the LSTM; ``1 - dropout`` is passed to
            ``nn.Dropout``.
        pad_idx: Index of the padding token, forwarded as ``padding_idx`` to the
            embedding layer.
    """

    def __init__(self, embeddings, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):
        super().__init__()
        vocab_size, embedding_dim = embeddings.shape
        self.embedding = nn.Embedding(vocab_size, embedding_dim, embedding_table=Tensor(embeddings), padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout,
                           batch_first=True)
        # The direction count decides both the classifier input width and which
        # final hidden states construct() feeds into it.
        self.bidirectional = bidirectional
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Dense(hidden_dim * num_directions, output_dim)
        # NOTE(review): nn.Dropout is given 1 - dropout, i.e. the value is treated
        # as a keep probability; confirm this matches the Dropout signature of
        # the installed MindSpore version.
        self.dropout = nn.Dropout(1 - dropout)
        self.sigmoid = ops.Sigmoid()

    def construct(self, inputs):
        """Map a batch of token ids to sigmoid outputs of the dense layer."""
        # inputs: (batch, seq_length)
        embedded = self.dropout(self.embedding(inputs))
        # embedded: (batch, seq_length, embedding_dim)
        outputs, (hidden, cell) = self.rnn(embedded)
        #outputs: (batch, seq_length, hidden_dim * num_directions)
        #hidden: (num_layers * num_directions, batch_size, hidden_dim)
        #cell: (num_layers * num_directions, batch_size, hidden_dim)
        if self.bidirectional:
            # Last layer: join the final forward and backward hidden states.
            last_hidden = mnp.concatenate((hidden[-2,:,:], hidden[-1,:,:]), axis = 1)
        else:
            # Single direction: only the last layer's final hidden state applies.
            last_hidden = hidden[-1,:,:]
        last_hidden = self.dropout(last_hidden)
        output = self.fc(last_hidden)
        return self.sigmoid(output)

In [14]:
# Model hyperparameters.
hidden_size = 256
output_size = 1
num_layers = 2
bidirectional = True
dropout = 0.5
# Id of the '<pad>' token, passed to the embedding layer as padding_idx.
pad_idx = vocab.tokens_to_ids('<pad>')

model = RNN(embeddings, hidden_size, output_size, num_layers, bidirectional, dropout, pad_idx)
# Binary cross-entropy on the sigmoid output returned by RNN.construct.
loss = nn.BCELoss(reduction='mean')
model_with_loss = nn.WithLossCell(model, loss)
optimizer = nn.Adam(model.trainable_params())
# Wraps the network-with-loss and the optimizer into a single training-step cell.
train_one_step = nn.TrainOneStepCell(model_with_loss, optimizer)

In [15]:
# Group the training samples into batches of 64.
imdb_train = imdb_train.batch(64)

In [18]:
# Compile the training step ahead of time, using the first batch as sample inputs.
train_one_step.compile(*next(imdb_train.create_tuple_iterator()))

In [19]:
def train_one_epoch(model, train_dataset, epoch=0):
    """Run one pass over ``train_dataset``, calling ``model`` on every batch.

    The running mean of the per-step loss is shown in the progress bar.

    Args:
        model: Callable invoked as ``model(inputs, labels)`` for each batch; its
            result must provide ``asnumpy()`` (the step loss).
        train_dataset: Dataset offering ``get_dataset_size()`` and
            ``create_tuple_iterator()``.
        epoch: Epoch number shown in the progress-bar description.
    """
    total = train_dataset.get_dataset_size()
    loss_total = 0
    step_total = 0
    with tqdm(total=total) as t:
        t.set_description('Epoch %i' % epoch)
        for batch in train_dataset.create_tuple_iterator():
            # Use the cell handed in by the caller instead of silently relying
            # on the notebook-level `train_one_step` global.
            loss = model(batch[0], batch[1])
            loss_total += loss.asnumpy()
            step_total += 1
            t.set_postfix(loss=loss_total/step_total)
            t.update(1)

In [None]:
# Train for one epoch on the batched training set.
train_one_epoch(train_one_step, imdb_train)

Epoch 0:  33%|██████████████████████████████████████████████████████▉                                                                                                                 | 128/391 [12:17<23:09,  5.28s/it, loss=0.664]