In [2]:
import time
from collections import Counter

import torchtext.vocab as Vocab
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torch import nn, optim
import torch
import torch.utils.data as Data
import torch.nn.functional as F

from utils import get_vocab_imdb, get_tokenized_imdb, get_tokenizer
from utils import evaluate, epoch_time
from utils import train as trainer
from utils import preprocess_imdb

# 数据预处理

In [3]:
train, test = IMDB(split=('train', 'test'))

In [4]:
train, test = list(train), list(test)

In [5]:
batch_size = 64
vocab = get_vocab_imdb(train)

In [6]:
train_set = Data.TensorDataset(*preprocess_imdb(train, vocab))
test_set = Data.TensorDataset(*preprocess_imdb(test, vocab))

In [7]:
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size, shuffle=True)

# 加载词向量

In [8]:
cache_dir = "./datasets/glove"
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=cache_dir)

In [9]:
def load_pretrained_embedding(words, pretrained_vocab):
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])
    oov_count = 0
    for i, word in  enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 1
    if oov_count > 0:
        print('There are %d oov words.' % oov_count)
    return embed

In [10]:
glove_100 = load_pretrained_embedding(vocab.get_itos(), glove_vocab)

There are 14719 oov words.


# TextCNN model

In [11]:
class TextCNN(nn.Module):
    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        # 不参与训练的嵌入层
        self.constant_embedding = nn.Embedding(len(vocab), embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)
        # 时序最大池化层没有权重，所以可以共用一个实例
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.convs = nn.ModuleList()
        # 创建多个一维卷积层
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(
                nn.Conv1d(in_channels=2*embed_size, out_channels=c,kernel_size=k)
            )
    
    def forward(self, inputs):
        # 将两个嵌入层的输出按词向量连接（N, L, C）
        embeddings = torch.cat((self.embedding(inputs), self.constant_embedding(inputs)), dim=2)
        # con1d的输入格式：(N, C_in, L)
        embeddings = embeddings.permute(0, 2, 1)
        # 全局池化，展开，然后连接
#         encoding = torch.cat(
#             [self.pool(conv(embeddings)).squeeze(-1) for conv in self.convs],  # 未加 F.relu
#             dim=1
#         )
        encoding = torch.cat(
            [self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs],
            dim=1
        )

        # 应用丢弃法后，使用全连接层得到输出
        outputs = self.decoder(self.dropout(encoding))
        return outputs

# 初始化模型和参数

In [15]:
Vocab_length = len(vocab)
Embedding_dim = 100
Output_dim = 2
lr = 1e-3
Epochs = 5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
embed_size, kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]
model = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)
model.embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.get_itos(), glove_vocab)
)
model.embedding.weight.requires_grad = False  # 该嵌入层不需要训练
model = model.to(device)

There are 14719 oov words.


In [16]:
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=lr)
loss = nn.CrossEntropyLoss()

# 训练模型

In [17]:
best_valid_loss = float('inf')

for epoch in range(Epochs):
    start_time = time.time()
    train_loss, train_acc = trainer(
        model, 
        train_iter, 
        optimizer, 
        loss,
        device
    )
    valid_loss, valid_acc = evaluate(model, test_iter, loss,device)
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './models/rnn-best-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 13s
	Train Loss: 0.556 | Train Acc: 71.25%
	 Val. Loss: 0.418 |  Val. Acc: 82.32%
Epoch: 02 | Epoch Time: 0m 10s
	Train Loss: 0.346 | Train Acc: 86.07%
	 Val. Loss: 0.338 |  Val. Acc: 86.28%
Epoch: 03 | Epoch Time: 0m 9s
	Train Loss: 0.256 | Train Acc: 90.15%
	 Val. Loss: 0.297 |  Val. Acc: 88.03%
Epoch: 04 | Epoch Time: 0m 9s
	Train Loss: 0.193 | Train Acc: 92.95%
	 Val. Loss: 0.297 |  Val. Acc: 88.35%
Epoch: 05 | Epoch Time: 0m 9s
	Train Loss: 0.150 | Train Acc: 94.64%
	 Val. Loss: 0.308 |  Val. Acc: 88.35%
