In [1]:
import logging

logging.captureWarnings(True)

import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataloader import default_collate
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from paddlenlp.datasets import load_dataset

from zh_nlp_demo.utils.tokenization import FullTokenizer


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# https://bravey.github.io/2020-05-12-使用LSTM+Pytorch对电影评论进行情感分类.html

In [None]:
sequence_length = 200
input_size = 128
hidden_size = 128
num_layers = 2
num_classes = 2
batch_size = 16
num_epochs = 2
learning_rate = 0.003

In [2]:
[train_examples, dev_examples, test_examples] = load_dataset('chnsenticorp', splits=('train', 'dev', 'test'))

print('训练集样本数量： ', len(train_examples))
print('验证集样本数量： ', len(dev_examples))
print('测试集样本数量： ', len(test_examples))

print('训练集样本示例：')
print(train_examples[0])

训练集样本数量：  9600
验证集样本数量：  1200
测试集样本数量：  1200
训练集样本示例：
{'text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般', 'label': 1, 'qid': ''}


In [3]:
tokenizer = FullTokenizer('../../data/dict/vocab.txt')


class MyDataset(Dataset):
    def __init__(self, tokenizer, examples):
        self.data, self.label = self.examples_to_ids(tokenizer, examples)

    def examples_to_ids(self, tokenizer, examples, maxlen=200, num_classes=2):
        tokens = [tokenizer.tokenize(example['text']) for example in examples]
        input_ids = [torch.tensor(tokenizer.convert_tokens_to_ids(tokens), dtype=torch.int64) for tokens in tokens]
        input_y = [example['label'] for example in examples]
        return input_ids, input_y

    def __getitem__(self, index):
        return self.data[index], self.label[index]

    def __len__(self):
        return len(self.data)


def custom_collate(batch):
    batch.sort(key=lambda x: len(x[0]), reverse=True)
    data_length = [len(sq[0]) for sq in batch]
    input_data, input_label = [], []
    for text, label in batch:
        input_data.append(text)
        input_label.append(label)
    input_data = pad_sequence(input_data, batch_first=True, padding_value=0)
    input_label = torch.tensor(input_label)
    return input_data, input_label, data_length


train_dataset = MyDataset(tokenizer, train_examples)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)

In [4]:

class BiRNN(nn.Module):
    def __init__(self, vocab_size, input_size, hidden_size, num_classes):
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=0)
        self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # 隐层包含向前层和向后层两层，所以隐层共有两倍的Hidden_size
    
    def forward(self, x, text_len):

        text_emb = self.embedding(x)
        packed_input = pack_padded_sequence(text_emb, text_len, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        out_forward = output[:, -1, :]
        # out_reverse = output[:, 0, :]
        # out_reduced = torch.cat((out_forward, out_reverse), 1)

        text_fea = self.fc(out_forward)
        text_fea = torch.squeeze(text_fea, 1)
        text_out = torch.sigmoid(text_fea)
        return text_out


vocab_size = len(tokenizer.vocab)

# 实例化一个Birectional RNN模型
model = BiRNN(vocab_size, input_size, hidden_size, num_classes).to(device)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


In [5]:
total_step = len(train_data_loader)

for epoch in range(num_epochs):
    for i, data in enumerate(train_data_loader):
        inputs, labels, batch_seq_len = data[0].to(device), data[1].to(device), data[2]

        # 前向传播
        outputs = model(inputs, batch_seq_len)
        loss = criterion(outputs, labels)
        
        # 反向传播和优化，注意梯度每次清零
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))



Epoch [1/2], Step [100/2400], Loss: 0.6917
Epoch [1/2], Step [200/2400], Loss: 0.6801
Epoch [1/2], Step [300/2400], Loss: 0.6923
Epoch [1/2], Step [400/2400], Loss: 0.7058
Epoch [1/2], Step [500/2400], Loss: 0.6589
Epoch [1/2], Step [600/2400], Loss: 0.8052
Epoch [1/2], Step [700/2400], Loss: 0.8347
Epoch [1/2], Step [800/2400], Loss: 0.8141
Epoch [1/2], Step [900/2400], Loss: 0.6778
Epoch [1/2], Step [1000/2400], Loss: 0.6705
Epoch [1/2], Step [1100/2400], Loss: 0.7621
Epoch [1/2], Step [1200/2400], Loss: 0.8300
Epoch [1/2], Step [1300/2400], Loss: 0.6492
Epoch [1/2], Step [1400/2400], Loss: 0.6272
Epoch [1/2], Step [1500/2400], Loss: 0.6220
Epoch [1/2], Step [1600/2400], Loss: 0.6599
Epoch [1/2], Step [1700/2400], Loss: 0.6199
Epoch [1/2], Step [1800/2400], Loss: 0.6768
Epoch [1/2], Step [1900/2400], Loss: 0.5765
Epoch [1/2], Step [2000/2400], Loss: 0.7814
Epoch [1/2], Step [2100/2400], Loss: 0.7125
Epoch [1/2], Step [2200/2400], Loss: 0.7287
Epoch [1/2], Step [2300/2400], Loss: 0.61

KeyboardInterrupt: 