# 导入库

In [29]:
import time
import random
from collections import Counter
import torch
from torch import nn, optim
import torch.utils.data as Data
from torch.nn.utils.rnn import pad_sequence
import torchtext
from torchtext.datasets import IMDB
import torchtext.vocab as Vocab
from torchtext.data.utils import get_tokenizer

In [10]:
# Fixed seed for PyTorch's random number generator.
SEED = 42
torch.manual_seed(SEED)
# Keep the cuDNN algorithm selection and the seed fixed on every run so that
# results are easier to reproduce.
torch.backends.cudnn.deterministic = True

#  数据预处理

In [69]:
# Load the IMDB train and test splits, using ./datasets/ as the dataset root.
# NOTE(review): the name `train` is rebound later in this file by
# `def train(...)`; every use of the dataset must happen before that cell runs.
train, test = IMDB(root='./datasets/', split=('train', 'test'))

In [12]:
# spaCy-backed tokenizer using the `en_core_web_sm` language model; it is called
# on raw review strings by get_vocab_imdb and get_tokenized_imdb.
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [50]:
# Inspect the description attribute of the training split (notebook cell output).
train.description

'IMDB'

In [67]:
# Number of samples in the train and test splits (notebook cell output).
len(train), len(test)

(25000, 25000)

In [37]:
def get_vocab_imdb(data, min_freq=5):
    """Build a vocabulary from the review texts in ``data``.

    Args:
        data: Iterable of ``(label, text)`` pairs; only the text is used.
        min_freq: Minimum number of occurrences a token needs in order to be
            kept in the vocabulary.

    Returns:
        The vocabulary object, with ``'<unk>'`` inserted at index 0 and set as
        the default index for tokens that are not in the vocabulary.
    """
    counter = Counter()
    for _, line in data:
        counter.update(tokenizer(line))
    # Honour the caller's threshold (it used to be hard-coded to 5 here, which
    # silently ignored the ``min_freq`` argument).
    vocab = Vocab.vocab(counter, min_freq=min_freq)
    unk_token = '<unk>'
    vocab.insert_token(unk_token, 0)
    vocab.set_default_index(vocab[unk_token])
    return vocab

In [76]:
def get_tokenized_imdb(data):
    """Tokenize the review text of every ``(label, review)`` pair in ``data``.

    Returns a list with one token sequence per sample, in input order; the
    labels are ignored.
    """
    tokenized_reviews = []
    for _label, text in data:
        tokenized_reviews.append(tokenizer(text))
    return tokenized_reviews

In [78]:
def preprocess_imdb(data, vocab, max_l=500):
    """Convert raw IMDB samples into fixed-length index tensors and labels.

    Args:
        data: Iterable of ``(label, review)`` pairs. A label equal to ``'pos'``
            becomes 1; any other label becomes 0.
        vocab: Object whose ``lookup_indices(tokens)`` returns a list of
            integer indices for a list of tokens.
        max_l: Length every review is forced to. Longer reviews are truncated;
            shorter ones are right-padded with index 0.

    Returns:
        A ``(features, labels)`` tuple of tensors: one row of ``max_l`` token
        indices per review, and one 0/1 label per review.
    """
    def pad(x):
        # Index 0 is also where get_vocab_imdb places '<unk>', so padding and
        # unknown tokens share the same index.
        return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x))

    # Materialise the samples: they are traversed twice below (once for the
    # text, once for the labels).
    data = list(data)
    tokenized_data = get_tokenized_imdb(data)
    features = torch.tensor(
        [pad(vocab.lookup_indices(words)) for words in tokenized_data]
    )
    labels = torch.tensor([1 if score == 'pos' else 0 for (score, _) in data])
    return features, labels

In [79]:
# Mini-batch size used by both data loaders.
batch_size = 64
# Build the vocabulary from the training split only.
# NOTE(review): this iterates `train` once; if `train` is a one-shot iterator it
# may need to be re-created before it is used again — confirm for the installed
# torchtext version.
vocab = get_vocab_imdb(train)

In [81]:
# Materialise both splits as lists, convert them to (features, labels) tensors
# and wrap them in shuffled mini-batch loaders.
train, test = list(train), list(test)
train_set = Data.TensorDataset(*preprocess_imdb(train, vocab))
test_set = Data.TensorDataset(*preprocess_imdb(test, vocab))
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
# NOTE(review): the test loader is shuffled too; evaluation only averages over
# batches, so the order is presumably irrelevant — confirm this is intended.
test_iter = Data.DataLoader(test_set, batch_size, shuffle=True)

# 创建模型

In [201]:
class RNN(nn.Module):
    """Embedding layer, a default-configured ``nn.RNN`` and a linear classifier.

    The classifier is applied to the RNN's final hidden state, so the result
    keeps that state's leading dimension; the callers in this file remove it
    with ``squeeze(0)``.
    """

    def __init__(self, V, E, H, O):
        """Create the layers.

        Args:
            V: Number of rows in the embedding table (vocabulary size).
            E: Embedding dimension, also the RNN input size.
            H: RNN hidden size, also the classifier input size.
            O: Number of output features of the classifier.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=V, embedding_dim=E)
        self.rnn = nn.RNN(input_size=E, hidden_size=H, batch_first=True)
        self.fc = nn.Linear(in_features=H, out_features=O)

    def forward(self, X):
        """Map a batch of token indices to classifier outputs.

        ``X`` is presumably an integer tensor of shape (batch, sequence).
        """
        embedded = self.embedding(X)
        # Only the final hidden state is used; the per-step outputs are dropped.
        _, final_hidden = self.rnn(embedded)
        return self.fc(final_hidden)

# 训练模型

In [213]:
# Hyperparameters for the RNN classifier and its training run.
Vocab_length = len(vocab)  # one embedding row per vocabulary entry
Embedding_dim = 100  # size of each token embedding vector
Hidden_dim = 256  # size of the RNN hidden state
Output_dim = 2  # two outputs, matching the 0/1 labels from preprocess_imdb
Learning_rate = 1e-3
Epochs = 1

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [203]:
# Model, plain SGD optimizer and cross-entropy criterion used for training.
model = RNN(Vocab_length, Embedding_dim, Hidden_dim, Output_dim)
optimizer = optim.SGD(model.parameters(), lr=Learning_rate)
# NOTE: `loss` is the criterion object (a callable), not a computed loss value.
loss = nn.CrossEntropyLoss()

In [204]:
# Move the model's parameters to the selected device.
model = model.to(device)

In [210]:
def train(model, iterator, optimizer, criterion):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: Module whose output, after ``squeeze(0)``, has one row of class
            scores per sample.
        iterator: Sized iterable of ``(X, y)`` batches.
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Callable ``criterion(predictions, y)`` returning the batch
            loss tensor.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the batches.
    """
    epoch_loss = 0.0
    epoch_acc = 0.0
    # Send each batch to wherever the model's parameters live, so the function
    # does not depend on a module-level device variable.
    device = next(model.parameters()).device
    model.train()
    for X, y in iterator:
        X = X.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        predictions = model(X).squeeze(0)
        # Use the criterion passed by the caller (previously the global `loss`
        # was called and this argument was ignored).
        batch_loss = criterion(predictions, y)
        acc = (predictions.argmax(dim=1) == y).sum().item() / X.shape[0]
        batch_loss.backward()
        optimizer.step()
        epoch_loss += batch_loss.item()
        epoch_acc += acc
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [211]:
def evaluate(model, iterator, loss):
    """Compute the mean loss and accuracy of ``model`` over ``iterator``.

    Args:
        model: Module whose output, after ``squeeze(0)``, has one row of class
            scores per sample.
        iterator: Sized iterable of ``(X, y)`` batches.
        loss: Callable ``loss(predictions, y)`` returning the batch loss tensor.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the batches.
    """
    total_loss, total_acc = 0.0, 0.0
    # Evaluation mode: dropout is disabled and batch normalization statistics
    # are not recomputed.
    model.eval()
    # No gradients are tracked, which saves memory and time.
    with torch.no_grad():
        for inputs, targets in iterator:
            inputs, targets = inputs.to(device), targets.to(device)
            logits = model(inputs).squeeze(0)
            batch_loss = loss(logits, targets)
            hits = (logits.argmax(dim=1) == targets).sum().cpu().item()
            total_loss += batch_loss.cpu().item()
            total_acc += hits / inputs.shape[0]
    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [207]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds.

    Both timestamps are in seconds. Returns ``(minutes, seconds)`` as ints,
    each truncated toward zero.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - 60 * minutes)
    return minutes, seconds

In [214]:
# Lowest evaluation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float('inf')

# Train for `Epochs` epochs, evaluating after each one and reporting timing.
for epoch in range(Epochs):
    start_time = time.time()
    train_loss, train_acc = train(
        model, 
        train_iter, 
        optimizer, 
        loss)
    # NOTE(review): the "validation" metrics are computed on test_iter, so the
    # checkpoint below is selected using the test split.
    valid_loss, valid_acc = evaluate(model, test_iter, loss)
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep only the weights of the best-scoring epoch.
    # NOTE(review): presumably the ./models directory must already exist — confirm.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './models/rnn-best-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 14s
	Train Loss: 0.694 | Train Acc: 50.28%
	 Val. Loss: 0.694 |  Val. Acc: 50.38%


In [182]:
# Sanity check: report whether the model's first parameter is on a CUDA device.
print(next(model.parameters()).is_cuda)

True
