In [11]:
import collections
import os
import random
import tarfile
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data

import Functions as d2l

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda')

DATA_ROOT = "./Datasets/"


In [12]:
fname = os.path.join(DATA_ROOT, 'aclImdb_v1.tar.gz')
if not os.path.exists(os.path.join(DATA_ROOT, 'aclImdb')):
    with tarfile.open(fname, 'r') as f:
        f.extractall(DATA_ROOT)

In [18]:
###读取数据
from tqdm import tqdm

def read_imdb(folder='train', data_root = './Datasets/aclImdb'):
    data = []
    for label in ['pos','neg']:
        folder_name = os.path.join(data_root, folder, label)
        for file in tqdm(os.listdir(folder_name)):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n','').lower()
                data.append([review, 1 if label == 'pos' else 0])
    random.shuffle(data)
    return data

train_data = read_imdb('train')
test_data = read_imdb('test')

100%|██████████| 12500/12500 [00:34<00:00, 361.36it/s]
100%|██████████| 12500/12500 [00:34<00:00, 365.69it/s]
100%|██████████| 12500/12500 [00:34<00:00, 363.62it/s]
100%|██████████| 12500/12500 [00:34<00:00, 363.84it/s]


## 预处理数据

In [24]:
## 基于空格进行分词
def get_tokenized_imdb(data):
    # data: list of [string, label]
    def tokenizer(text):
        return [token.lower() for token in text.split(' ')]
    return [tokenizer(review) for review,_ in data]


### 基于分词结果创造词典
def get_vocab_imdb(data):
    tokenized_data = get_tokenized_imdb(data)
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter, min_freq=5)

vocab = get_vocab_imdb(train_data)
'# words in vocab:',len(vocab)

('# words in vocab:', 46152)

In [33]:
vocab.stoi['pad']

8366

In [34]:
def preprocess_imdb(data, vocab):
    max_len = 500  # 将每条评论通过截断或者补0，使得长度变成500
    
    def pad(x):
        return x[:max_len] if len(x) > max_len else x + [0] * (max_len - len(x)) 
    
    tokenized_data = get_tokenized_imdb(data)  ## 基于空格进行分词
    features = torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels = torch.tensor([score for _, score in data])
    return features, labels

In [66]:
batch_size = 128
train_set = Data.TensorDataset(*preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*preprocess_imdb(test_data, vocab))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

for x,y in train_iter:
    print('x',x.shape, 'y',y.shape)
    break
'#batches:',len(train_iter)

x torch.Size([128, 500]) y torch.Size([128])


('#batches:', 196)

### 使用RNN

In [67]:
class BiRNN(nn.Module):
    def __init__(self, vocab, embed_size, num_hiddens, num_layers):
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        self.encoder = nn.LSTM(input_size = embed_size,
                               hidden_size = num_hiddens,
                               num_layers = num_layers,
                               bidirectional=True)
        self.decoder = nn.Linear(4*num_hiddens, 2)
        
    def forward(self, inputs):
        # inputs的形状是(batch, 500词),LSTM需要将序列长度作为第一维，所以要进行转置。
        embeddings = self.embedding(inputs.permute(1, 0)) # 输出形状为(500词, 批量大小, 词向量维度)
        # rnn.LSTM只传入输入embeddings，因此只返回最后一层的隐藏层在各时间步的隐藏状态。
        # outputs形状是(词数, 批量大小, 2 * 隐藏单元个数)
        outputs, _ = self.encoder(embeddings) # output, (h, c)
        # 连结初始时间步和最终时间步的隐藏状态作为全连接层输入。它的形状为
        # (批量大小, 4 * 隐藏单元个数)。
        encoding = torch.cat((outputs[0], outputs[-1]), -1)
        outs = self.decoder(encoding)
        return outs
    
    
embed_size = 100
num_hiddens = 100
num_layers = 2


net = BiRNN(vocab, embed_size, num_hiddens, num_layers)

### 加载词向量

In [68]:
glove_vocab = Vocab.GloVe(name='6B', dim =100, cache=os.path.join(DATA_ROOT, 'glove'))

def load_pretrained_embedding(words, pretrained_vocab):
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])
    oov_count = 0 # out of vocabulary
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print("有{}个超出词表的词".format(oov_count))
    return embed

net.embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos, glove_vocab))
net.embedding.weight.requires_grad = False 


有21202个超出词表的词


In [69]:
def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y) 
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = d2l.evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))


In [70]:
lr, num_epochs = 0.01, 50
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.6365, train acc 0.616, test acc 0.801, time 22.6 sec
epoch 2, loss 0.1963, train acc 0.824, test acc 0.820, time 22.6 sec
epoch 3, loss 0.1077, train acc 0.861, test acc 0.860, time 22.6 sec
epoch 4, loss 0.0715, train acc 0.881, test acc 0.868, time 22.5 sec
epoch 5, loss 0.0482, train acc 0.901, test acc 0.858, time 22.7 sec
epoch 6, loss 0.0360, train acc 0.914, test acc 0.862, time 22.7 sec
epoch 7, loss 0.0256, train acc 0.930, test acc 0.863, time 22.7 sec
epoch 8, loss 0.0189, train acc 0.942, test acc 0.869, time 22.5 sec
epoch 9, loss 0.0146, train acc 0.950, test acc 0.863, time 22.6 sec
epoch 10, loss 0.0162, train acc 0.935, test acc 0.855, time 22.8 sec
epoch 11, loss 0.0133, train acc 0.943, test acc 0.856, time 22.6 sec
epoch 12, loss 0.0097, train acc 0.956, test acc 0.855, time 22.6 sec
epoch 13, loss 0.0073, train acc 0.966, test acc 0.853, time 22.6 sec
epoch 14, loss 0.0062, train acc 0.969, test acc 0.852, time 22.5 sec
epoch 15, l

In [72]:
def predict_sentiment(net, vocab, sentence):
    """sentence是词语的列表"""
    device = list(net.parameters())[0].device
    sentence = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
    label = torch.argmax(net(sentence.view((1, -1))), dim=1)
    return 'positive' if label.item() == 1 else 'negative'


In [77]:
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'not', 'good']) ## the result is wrong.

'positive'