# 项目描述

利用IMDB电影评论数据做文本分类项目，分别利用FNN、TextCNN、TextRNN、TextRCNN、GRU+Attention。

# 数据预处理

In [0]:
import os
from google.colab import drive
drive.mount("/content/drive/")
os.chdir("/content/drive/My Drive")
!ls

In [0]:
import torch
from torchtext import data
from torchtext import vocab

text_field = data.Field(tokenize='spacy', lower=True, include_lengths=True,
                        fix_length=100)
label_field = data.Field(sequential=False, use_vocab=False, dtype=torch.long)
train, valid, test = data.TabularDataset.splits(path='',
                                                train='imdb-train.csv',
                                                validation='imdb-valid.csv',
                                                test='imdb-test.csv',
                                                format='csv', skip_header=True,
                                                fields=[('sentence', text_field), ('label', label_field)])

In [0]:
import torch.nn as nn

In [0]:
vec = vocab.Vectors(name='glove.6B.300d.txt')
text_field.build_vocab(train, valid, test, max_size=250000, vectors=vec,
                       unk_init=torch.Tensor.normal_)
label_field.build_vocab(train, valid, test)

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iter, valid_iter, test_iter = data.BucketIterator.splits((train, valid, test), batch_sizes=(64, 64, 64),
                                                               sort_key=lambda x: len(x.sentence),
                                                               sort_within_batch=True,
                                                               repeat=False, shuffle=True,
                                                               device=device)

# 模型训练函数

In [0]:
def train_fun(model, train_iter, dev_iter, num_epoch, opt, criterion, eva,
              out_model_file):
    model.train()
    loss_list = []
    best_dev_acc = 0.
    for epoch in range(num_epoch):
        total_loss = 0.
        for batch in train_iter:
            output = model(batch.sentence)
            loss = criterion(output, batch.label)
            opt.zero_grad()
            loss.backward()
            opt.step()
            total_loss += loss.item()
        loss_list.append(total_loss)
        if (epoch + 1) % 10 == 0:
            dev_acc = eva(model, dev_iter)
            print(f"Epoch: {epoch+1}/{num_epoch}. Total loss: {total_loss:.3f}. Validation Set Acc: {dev_acc:.3%}.")
            if dev_acc > best_dev_acc:
                best_dev_acc = dev_acc
                torch.save(model.state_dict(), out_model_file)
            else:
                if epoch + 1 > 60:
                    print("Early Stop!")
                    break
    return loss_list

# 模型评价函数

In [0]:
def eva(model, data_iter):
    correct, count = 0, 0
    with torch.no_grad():
        for batch in data_iter:
            pred = model(batch.sentence)
            pred = torch.argmax(pred, dim=-1)
            correct += (pred == batch.label).sum().item()
            count += len(pred)
    return correct / count

# 分类器1: TextFNN

In [0]:
import torch.nn as nn

class FNN(nn.Module):
    def __init__(self, embedding_dim, seq_len, hidden_size1, hidden_size2,
                 out_dim, pretrained_embed):
        super(FNN, self).__init__()
        self.embed = nn.Embedding.from_pretrained(pretrained_embed, freeze=True)
        self.proj = nn.Sequential(
            nn.Linear(embedding_dim * seq_len, hidden_size1),
            nn.ReLU(),
            nn.Linear(hidden_size1, hidden_size2),
            nn.ReLU(),
            nn.Linear(hidden_size2, out_dim)
        )
    
    def forward(self, x):
        x, _ = x # [seq, bs]
        x = self.embed(x) # [seq, bs, embed]
        x = x.permute(1, 0, 2) # [bs, seq, embed]
        x = x.reshape(x.shape[0], -1) # [bs, seq * embed]
        return self.proj(x)

In [0]:
# Hyperparameters
embedding_dim = 300
out_dim = 2
pretrained_embed = text_field.vocab.vectors
num_epoch = 10
lr = 0.01
seq_len = 100
hidden_size1 = 250
hidden_size2 = 200
out_model_file = 'textfnn.pt'

textfnn = FNN(embedding_dim, seq_len, hidden_size1, hidden_size2,
              out_dim, pretrained_embed).to(device)
opt = torch.optim.Adam(textfnn.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

print(f"Training begin!")
loss_list = train_fun(textfnn, train_iter, valid_iter, num_epoch, opt, criterion, eva,
                      out_model_file)

Training begin!
Epoch: 1/10. Total loss: 318.329. Validation Set Acc: 48.720%.
Epoch: 2/10. Total loss: 285.645. Validation Set Acc: 48.920%.
Epoch: 3/10. Total loss: 283.161. Validation Set Acc: 62.320%.
Epoch: 4/10. Total loss: 278.264. Validation Set Acc: 66.700%.
Epoch: 5/10. Total loss: 217.278. Validation Set Acc: 69.860%.
Epoch: 6/10. Total loss: 187.332. Validation Set Acc: 70.340%.
Epoch: 7/10. Total loss: 163.138. Validation Set Acc: 70.180%.
Early Stop!


In [0]:
eva(textfnn, test_iter)

0.70395

# 分类器2:TextCNN

In [0]:
class CNN(nn.Module):
    def __init__(self, in_channels, out_channels, filter_list):
        super(CNN, self).__init__()
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels, out_channels, kernel_size=fs)
            for fs in filter_list
        ])
        self.relu = nn.ReLU()
        self.init_params()
    
    def forward(self, x):
        return [self.relu(conv(x)) for conv in self.convs]
    
    def init_params(self):
        for m in self.convs:
            nn.init.kaiming_normal_(m.weight.data)
            nn.init.constant_(m.bias.data, 0)

In [0]:
class TextCNN(nn.Module):
    def __init__(self, embed_dim, pretrained_embed, filter_size, filter_channels,
                 out_dim, dropout_rate):
        super(TextCNN, self).__init__()
        self.embed = nn.Embedding.from_pretrained(pretrained_embed, freeze=True)
        self.cnn = CNN(embed_dim, filter_channels, filter_size)
        self.proj = nn.Linear(len(filter_size) * filter_channels, out_dim)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.init_params()

    def init_params(self):
        nn.init.xavier_normal_(self.proj.weight.data)
        nn.init.constant_(self.proj.bias.data, 0)
    
    def forward(self, x):
        text, _ = x
        text = text.permute(1, 0) # [bs, seq_len]
        out = self.embed(text) # [bs, seq, emb]
        out = out.permute(0, 2, 1) # [bs, emb, seq]
        out = self.cnn(out) # [nf, bs, fc, -1]
        out = [torch.max_pool1d(x, x.shape[-1]).squeeze(2) for x in out] # [nf, bs, fc]
        out = self.dropout(torch.cat(out, dim=1)) 
        return self.proj(out)

In [0]:
embed_dim = 300
pretrained_embed = text_field.vocab.vectors
filter_size = [1, 2, 3, 4, 5]
filter_channels = 200
out_dim = 2
dropout_rate = 0.2
num_epoch = 100
lr = 0.01
out_model_file = 'textcnn.pt'

textcnn = TextCNN(embed_dim, pretrained_embed, filter_size, filter_channels,
                  out_dim, dropout_rate).to(device)
opt = torch.optim.Adam(textcnn.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

loss_list = train_fun(textcnn, train_iter, valid_iter, num_epoch, opt, criterion, eva,
                      out_model_file)

Epoch: 20/100. Total loss: 204.973. Validation Set Acc: 75.440%.
Epoch: 40/100. Total loss: 217.076. Validation Set Acc: 75.240%.
Epoch: 60/100. Total loss: 194.838. Validation Set Acc: 75.420%.
Epoch: 80/100. Total loss: 165.143. Validation Set Acc: 76.000%.
Epoch: 100/100. Total loss: 170.690. Validation Set Acc: 76.220%.


In [0]:
textcnn = TextCNN(embed_dim, pretrained_embed, filter_size, filter_channels,
                  out_dim, dropout_rate).to(device)
textcnn.load_state_dict(torch.load('textcnn.pt'))
eva(textcnn, test_iter)

0.7583

# 分类器3:TextRNN

In [0]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, bidirectional,
                 dropout_rate):
        super(LSTM, self).__init__()
        self.rnn = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                           num_layers=num_layers, bidirectional=bidirectional,
                           dropout=dropout_rate)
            
    def forward(self, x, length):
        packed_x = nn.utils.rnn.pack_padded_sequence(x, length)
        packed_output, (hidden, cell) = self.rnn(packed_x)
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        return hidden, output

In [0]:
class TextRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, num_layers, bidirectional, out_dim,
                 dropout_rate, pretrained_embed):
        super(TextRNN, self).__init__()
        self.embed = nn.Embedding.from_pretrained(pretrained_embed, freeze=True)
        self.rnn = LSTM(embed_size, hidden_size, num_layers, bidirectional,
                        dropout_rate)
        self.proj = nn.Linear(2*hidden_size, out_dim)
        self.dropout = nn.Dropout(p=dropout_rate)
    
    def forward(self, x):
        text, text_length = x # text: [seq_len, bs]
        text = text.permute(1, 0) # text: [bs, seq_len]
        embed_x = self.embed(text) # embed_x: [bs, seq_len, embed_dim]
        embed_x = embed_x.permute(1, 0, 2) # embed_x: [seq_len, bs, embed_dim]
        hidden, _ = self.rnn(embed_x, text_length) # hidden: [2*num_layers, bs, hidden_size]
        hidden = torch.cat((hidden[-1,:,:], hidden[-2,:,:]), dim=1)
        return self.proj(self.dropout(hidden))

In [0]:
embed_size = 300
hidden_size = 100
num_layers = 2
bidirectional = True
out_dim = 2
dropout_rate = 0.2
pretrained_embed = text_field.vocab.vectors
lr = 0.01
num_epoch = 5
out_model_file = 'textrnn.pt'

textrnn = TextRNN(embed_size, hidden_size, num_layers, bidirectional, out_dim,
                  dropout_rate, pretrained_embed).to(device)
opt = torch.optim.Adam(textrnn.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
print("Training begin!")
loss_list = train_fun(textrnn, train_iter, valid_iter, num_epoch, opt, criterion,
                      eva, out_model_file)

Training begin!
Epoch: 1/5. Total loss: 204.548. Validation Set Acc: 79.900%.
Epoch: 2/5. Total loss: 162.675. Validation Set Acc: 80.560%.
Epoch: 3/5. Total loss: 138.629. Validation Set Acc: 80.780%.
Epoch: 4/5. Total loss: 122.616. Validation Set Acc: 80.940%.
Epoch: 5/5. Total loss: 106.104. Validation Set Acc: 80.820%.
Early Stop!


In [0]:
textrnn.load_state_dict(torch.load('textrnn.pt'))
eva(textrnn, test_iter)

0.81315

# 分类器4:TextRCNN

In [0]:
class TextRCNN(nn.Module):
    def __init__(self, embed_size, hidden_size, num_layers, bidirectional,
                 out_dim, dropout_rate, pretrained_embed):
        super(TextRCNN, self).__init__()
        self.embed = nn.Embedding.from_pretrained(pretrained_embed, freeze=True)
        self.rnn = nn.GRU(input_size=embed_size, hidden_size=hidden_size,
                          num_layers=num_layers, bidirectional=bidirectional,
                          dropout=dropout_rate)
        self.proj1 = nn.Linear(2 * hidden_size + embed_size, 2 * hidden_size)
        self.tanh = nn.Tanh()
        self.proj2 = nn.Linear(2 * hidden_size, out_dim)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.init_params()
    
    def init_params(self):
        nn.init.xavier_uniform_(self.proj1.weight.data)
        nn.init.xavier_uniform_(self.proj2.weight.data)
        nn.init.constant_(self.proj1.bias.data, 0)
        nn.init.constant_(self.proj2.bias.data, 0)
    
    def forward(self, x):
        text, _ = x # text: [seq_len, bs]
        embed_x = self.embed(text) # emebd_x: [seq_len, bs, embed_dim]
        output, _ = self.rnn(embed_x) # output: [seq_len, bs, 2*hidden_size]
        embed_x, output = embed_x.permute(1, 0, 2), output.permute(1, 0, 2) # [bs, seq_len, 2*hidden_size]
        y = torch.cat((output, embed_x), 2) # y: [bs, seq_len, 2*hidden_size+embed_dim]
        z = self.tanh(self.proj1(y)) # z: [bs, seq_len, 2*hidden_size]
        z = z.permute(0, 2, 1) # z: [bs, 2*hidden_size, seq_len]
        z = torch.max_pool1d(z, z.shape[-1]).squeeze(-1) # z: [bs, 2*hidden_size]
        return self.proj2(self.dropout(z))

In [0]:
embed_size = 300
hidden_size = 100
num_layers = 2
bidirectional = True
out_dim = 2
dropout_rate = 0.2
pretrained_embed = text_field.vocab.vectors
lr = 0.01
num_epoch = 10
out_model_file = 'textrcnn.pt'

textrcnn = TextRCNN(embed_size, hidden_size, num_layers, bidirectional, out_dim,
                   dropout_rate, pretrained_embed).to(device)
opt = torch.optim.Adam(textrcnn.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
print("Training begin!")
loss_list = train_fun(textrcnn, train_iter, valid_iter, num_epoch, opt, criterion,
                      eva, out_model_file)

Training begin!
Epoch: 1/10. Total loss: 210.249. Validation Set Acc: 80.920%.
Epoch: 2/10. Total loss: 163.259. Validation Set Acc: 80.240%.
Early Stop!


In [0]:
textrcnn.load_state_dict(torch.load('textrcnn.pt'))
eva(textrcnn, test_iter)

0.80425

# 分类器5:TextHAN (GRU+Attention)

In [0]:
import torch.nn as nn

class TextHAN(nn.Module):
    def __init__(self, embed_size, hidden_size, num_layers, bidirectional, out_dim,
                 dropout_rate, pretrained_embed):
        super(TextHAN, self).__init__()
        self.embed = nn.Embedding.from_pretrained(pretrained_embed, freeze=True)
        self.rnn = nn.GRU(input_size=embed_size, hidden_size=hidden_size,
                          num_layers=num_layers, bidirectional=bidirectional,
                          dropout=dropout_rate)
        self.proj1 = nn.Linear(2 * hidden_size, 2 * hidden_size)
        self.tanh = nn.Tanh()
        self.u = nn.Parameter(torch.Tensor(2 * hidden_size, 1))
        self.proj2 = nn.Linear(2 * hidden_size, out_dim)
        self.init_params()

    def init_params(self):
        nn.init.xavier_uniform_(self.proj1.weight.data)
        nn.init.xavier_uniform_(self.proj2.weight.data)
        nn.init.constant_(self.proj1.bias.data, 0.1)
        nn.init.constant_(self.proj2.bias.data, 0.1)
        nn.init.uniform_(self.u, -0.1, 0.1)
    
    def forward(self, x):
        text, _ = x # text: [seq_len, bs]
        embed_x = self.embed(text) # embed_x: [seq_len, bs, embed_dim]
        rnn_x, _ = self.rnn(embed_x) # rnn_x: [seq_len, bs, hidden_size * 2]
        rnn_x = rnn_x.permute(1, 0, 2) # rnn_x: [bs, seq_len, hidden_size * 2]
        ut = self.tanh(self.proj1(rnn_x)) # ut: [bs, seq_len, hidden_size * 2]
        alpha = torch.softmax(torch.matmul(ut, self.u), dim=1) # alpha: [bs, seq_len, 1]
        s = torch.sum(alpha * rnn_x, dim=1) # s: [bs, hidden*2]
        return self.proj2(s)

In [0]:
embed_size = 300
hidden_size = 100
num_layers = 2
bidirectional = True
out_dim = 2
dropout_rate = 0.2
pretrained_embed = text_field.vocab.vectors
lr = 0.01
num_epoch = 10
out_model_file = 'texthan.pt'

texthan = TextHAN(embed_size, hidden_size, num_layers, bidirectional, out_dim,
                  dropout_rate, pretrained_embed).to(device)
opt = torch.optim.Adam(texthan.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
print("Training begin!")
loss_list = train_fun(texthan, train_iter, valid_iter, num_epoch, opt, criterion,
                      eva, out_model_file)

Training begin!
Epoch: 1/10. Total loss: 192.921. Validation Set Acc: 81.140%.
Epoch: 2/10. Total loss: 145.733. Validation Set Acc: 80.260%.
Early Stop!


In [0]:
texthan.load_state_dict(torch.load('texthan.pt'))
eva(texthan, test_iter)

0.8174

# 总结

在该项目中实现了5个文本分类模型，测试集上分类的准确率分别为：

- TextFNN：70.395%
- TextCNN: 75.830%
- TextRNN: 81.315%
- TextRCNN: 80.425%
- TextHAN: 81.740%