In [50]:
import time
from collections import Counter

import torchtext.vocab as Vocab
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torch import nn, optim
import torch
import torch.utils.data as Data
import torch.nn.functional as F

from utils import get_vocab_imdb, get_tokenized_imdb, get_tokenizer
from utils import evaluate, epoch_time
from utils import train as trainer
from utils import preprocess_imdb

# 数据预处理

In [4]:
train, test = IMDB(split=('train', 'test'))

In [6]:
train, test = list(train), list(test)

In [9]:
batch_size = 64
vocab = get_vocab_imdb(train)

In [10]:
train_set = Data.TensorDataset(*preprocess_imdb(train, vocab))
test_set = Data.TensorDataset(*preprocess_imdb(test, vocab))

In [11]:
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size, shuffle=True)

# 加载词向量

In [12]:
cache_dir = "./datasets/glove"
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=cache_dir)

In [13]:
def load_pretrained_embedding(words, pretrained_vocab):
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])
    oov_count = 0
    for i, word in  enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print('There are %d oov words.' % oov_count)
    return embed

In [14]:
glove_100 = load_pretrained_embedding(vocab.get_itos(), glove_vocab)

There are 14719 oov words.


# FastText model

In [43]:
class FastText(nn.Module):
    def __init__(self, V, E, O):
        super().__init__()
        self.embedding = nn.Embedding(V, E)
        self.fc = nn.Linear(E, O)
        
    def forward(self, X):
        X = self.embedding(X)
        # 在特征维度进行全局平均池化，(B, L, E) ---> (B, E)
        pooled = F.avg_pool2d(X, (X.shape[1], 1)).squeeze(1)
        output = self.fc(pooled)
        
        return output

# 初始化模型和参数

In [44]:
Vocab_length = len(vocab)
Embedding_dim = 100
Output_dim = 2
Learning_rate = 1e-3
Epochs = 5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [45]:
model = FastText(Vocab_length, Embedding_dim, Output_dim)
model.embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.get_itos(), glove_vocab)
)
model.embedding.weight.requires_grad = False  # 该嵌入层不需要训练
model = model.to(device)

There are 14719 oov words.


In [46]:
optimizer = optim.SGD(model.parameters(), lr=Learning_rate)
loss = nn.CrossEntropyLoss()

# 训练模型

In [51]:
best_valid_loss = float('inf')

for epoch in range(Epochs):
    start_time = time.time()
    train_loss, train_acc = trainer(
        model, 
        train_iter, 
        optimizer, 
        loss,
        device
    )
    valid_loss, valid_acc = evaluate(model, test_iter, loss,device)
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './models/rnn-best-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 0s
	Train Loss: 0.692 | Train Acc: 53.29%
	 Val. Loss: 0.691 |  Val. Acc: 54.02%
Epoch: 02 | Epoch Time: 0m 0s
	Train Loss: 0.691 | Train Acc: 53.67%
	 Val. Loss: 0.691 |  Val. Acc: 54.46%
Epoch: 03 | Epoch Time: 0m 0s
	Train Loss: 0.691 | Train Acc: 54.14%
	 Val. Loss: 0.691 |  Val. Acc: 54.95%
Epoch: 04 | Epoch Time: 0m 0s
	Train Loss: 0.691 | Train Acc: 54.64%
	 Val. Loss: 0.691 |  Val. Acc: 55.51%
Epoch: 05 | Epoch Time: 0m 0s
	Train Loss: 0.691 | Train Acc: 55.26%
	 Val. Loss: 0.691 |  Val. Acc: 55.88%
