In [2]:
import os
import sys
import random
import logging

sys.path.append('/home/chaipf/work')
logging.captureWarnings(True)

import torch 
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataloader import default_collate
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from paddlenlp.datasets import load_dataset
from tqdm.notebook import tqdm
from tensorboardX  import SummaryWriter

from zh_nlp_demo.utils.tokenization import FullTokenizer

# https://bravey.github.io/2020-05-12-使用LSTM+Pytorch对电影评论进行情感分类.html

In [3]:
sequence_length = 200
input_size = 128
hidden_size = 128
num_layers = 2
num_classes = 2
batch_size = 16
num_epochs = 2
learning_rate = 0.003
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def set_seed(seed):
    # seed
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # 并行gpu
        torch.cuda.manual_seed_all(seed)
        # cpu/gpu结果一致
        torch.backends.cudnn.deterministic = True
        # 训练集变化不大时使训练加速
        # torch.backends.cudnn.benchmark = True


set_seed(42)

In [4]:
[train_examples, dev_examples, test_examples] = load_dataset('chnsenticorp', splits=('train', 'dev', 'test'))

print('训练集样本数量： ', len(train_examples))
print('验证集样本数量： ', len(dev_examples))
print('测试集样本数量： ', len(test_examples))

print('训练集样本示例：')
print(train_examples[0])

训练集样本数量：  9600
验证集样本数量：  1200
测试集样本数量：  1200
训练集样本示例：
{'text': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般', 'label': 1, 'qid': ''}


In [56]:
tokenizer = FullTokenizer('../../data/dict/vocab.txt')


class MyDataset(Dataset):
    def __init__(self, tokenizer, examples):
        self.data, self.label = self.examples_to_ids(tokenizer, examples)

    def examples_to_ids(self, tokenizer, examples, maxlen=200, num_classes=2):
        input_ids, input_y = [], []
        for example in examples:
            token = tokenizer.tokenize(example['text'])
            if len(token) < maxlen:
                token = ['[PAD]'] * (maxlen - len(token)) + token
            else:
                token = token[:maxlen]
            input_ids.append(tokenizer.convert_tokens_to_ids(token))
            input_y.append(example['label'])

        return input_ids, input_y

    def __getitem__(self, index):
        return self.data[index], self.label[index]

    def __len__(self):
        return len(self.data)


def custom_collate(batch):
    input_data, input_label = [], []
    for text, label in batch:
        input_data.append(text)
        input_label.append(label)
    
    input_label = torch.tensor(input_label)
    input_data = torch.tensor(input_data)
    return input_data, input_label


train_dataset = MyDataset(tokenizer, train_examples)
# iter = DatasetIterater(train_dataset, config.batch_size, config.device)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)

dev_dataset = MyDataset(tokenizer, dev_examples)
dev_data_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)

test_dataset = MyDataset(tokenizer, test_examples)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)

In [63]:

class BiRNN(nn.Module):
    def __init__(self, vocab_size, input_size, hidden_size, num_classes):
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=0)
        self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)
    
    def forward(self, x):

        text_emb = self.embedding(x)
        output, _ = self.lstm(text_emb)

        text_fea = self.fc(output[:, -1, :])
        text_fea = torch.squeeze(text_fea, 1)
        text_out = torch.softmax(text_fea, dim=1)
        return text_out


vocab_size = len(tokenizer.vocab)

# 实例化一个Birectional RNN模型
model = BiRNN(vocab_size, input_size, hidden_size, num_classes).to(device)
model = model.to(device)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# 学习率调整
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 10, gamma=0.1)


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


#一个epoch的训练逻辑
def train(model, epoch):
    model.train()
    for iter_index, data in enumerate(train_data_loader):
        inputs, labels = data[0].to(device), data[1].to(device)
        # 初始为0，清除上个batch的梯度信息
        optimizer.zero_grad()
        outputs = model(inputs)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if iter_index % 100 == 99:
            print('epoch: {}  loss: {}'.format(epoch, loss.item()))
        # scheduler.step()


def validation(model):
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    for data in dev_data_loader:
        with torch.no_grad():
            # 正常传播
            inputs, labels = data[0].to(device), data[1].to(device)
            outputs = model(inputs)

        logits = outputs

        # total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(test_data_loader)
    print('Accuracy: %.4f' % (avg_val_accuracy))
    # print('Average testing loss: %.4f' % (total_eval_loss / len(test_data_loader)))
    print('-------------------------------')


for epoch in range(5):
    train(model, epoch)
    validation(model)

epoch: 0  loss: 0.6084384322166443
epoch: 0  loss: 0.5750779509544373
epoch: 0  loss: 0.5789977312088013
epoch: 0  loss: 0.862703263759613
epoch: 0  loss: 0.4099316895008087
epoch: 0  loss: 0.4728037714958191
Accuracy: 0.7725
-------------------------------
epoch: 1  loss: 0.658536434173584
epoch: 1  loss: 0.4510433077812195
epoch: 1  loss: 0.4529229402542114
epoch: 1  loss: 0.5922995805740356
epoch: 1  loss: 0.503049373626709
epoch: 1  loss: 0.4114025831222534
Accuracy: 0.8317
-------------------------------
epoch: 2  loss: 0.540065348148346
epoch: 2  loss: 0.5054954290390015
epoch: 2  loss: 0.46210166811943054
epoch: 2  loss: 0.4591887295246124
epoch: 2  loss: 0.3746809959411621
epoch: 2  loss: 0.5050589442253113
Accuracy: 0.8583
-------------------------------
epoch: 3  loss: 0.4993102550506592
epoch: 3  loss: 0.5483786463737488
epoch: 3  loss: 0.37605202198028564
epoch: 3  loss: 0.4025033414363861
epoch: 3  loss: 0.378490686416626
epoch: 3  loss: 0.34969961643218994
Accuracy: 0.868

In [64]:
tokenizer = FullTokenizer('../../data/dict/vocab.txt')


class MyDataset(Dataset):
    def __init__(self, tokenizer, examples):
        self.data, self.label = self.examples_to_ids(tokenizer, examples)

    def examples_to_ids(self, tokenizer, examples, maxlen=200, num_classes=2):
        tokens = [tokenizer.tokenize(example['text']) for example in examples]
        input_ids = [torch.tensor(tokenizer.convert_tokens_to_ids(tokens), dtype=torch.int64) for tokens in tokens]
        input_y = [example['label'] for example in examples]
        return input_ids, input_y

    def __getitem__(self, index):
        return self.data[index], self.label[index]

    def __len__(self):
        return len(self.data)


def custom_collate(batch):
    batch.sort(key=lambda x: len(x[0]), reverse=True)
    data_length = [len(sq[0]) for sq in batch]
    input_data, input_label = [], []
    for text, label in batch:
        input_data.append(text)
        input_label.append(label)
    input_data = pad_sequence(input_data, batch_first=True, padding_value=0)
    input_label = torch.tensor(input_label)
    return input_data, input_label, data_length


train_dataset = MyDataset(tokenizer, train_examples)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)

dev_dataset = MyDataset(tokenizer, dev_examples)
dev_data_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)

test_dataset = MyDataset(tokenizer, test_examples)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)

In [66]:

class BiRNN(nn.Module):
    def __init__(self, vocab_size, input_size, hidden_size, num_classes):
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=0)
        self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True, bidirectional=True)
        # 隐层包含向前层和向后层两层，所以隐层共有两倍的Hidden_size
        self.fc = nn.Linear(hidden_size * 2, num_classes)
    
    def forward(self, x, text_len):

        text_emb = self.embedding(x)
        packed_input = pack_padded_sequence(text_emb, text_len, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        out_forward = output[:, 0, :]
        # out_reverse = output[:, 0, :]
        # out_reduced = torch.cat((out_forward, out_reverse), 1)

        text_fea = self.fc(out_forward)
        text_fea = torch.squeeze(text_fea, 1)
        text_out = torch.softmax(text_fea, dim=1)
        return text_out


vocab_size = len(tokenizer.vocab)

# 实例化一个Birectional RNN模型
model = BiRNN(vocab_size, input_size, hidden_size, num_classes).to(device)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
# 学习率调整
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 10,gamma=0.1)


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


#一个epoch的训练逻辑
def train(model, epoch):
    model.train()
    model = model.to(device)
    for iter_index, data in enumerate(train_data_loader):
        inputs, labels, batch_seq_len = data[0].to(device), data[1].to(device), data[2]
        # 初始为0，清除上个batch的梯度信息
        optimizer.zero_grad()
        outputs = model(inputs, batch_seq_len)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if iter_index % 300 == 299:
            print('epoch: {}  loss: {}'.format(epoch, loss.item()))
        # scheduler.step()


def validation(model):
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    for data in dev_data_loader:
        with torch.no_grad():
            # 正常传播
            inputs, labels, batch_seq_len = data[0].to(device), data[1].to(device), data[2]
            outputs = model(inputs, batch_seq_len)

        logits = outputs

        # total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(test_data_loader)
    print('Accuracy: %.4f' % (avg_val_accuracy))
    # print('Average testing loss: %.4f' % (total_eval_loss / len(test_data_loader)))
    print('-------------------------------')


for epoch in range(5):
    train(model, 0)
    validation(model)

epoch: 0  loss: 0.7332198619842529
epoch: 0  loss: 0.6996564269065857
Accuracy: 0.4942
-------------------------------
epoch: 0  loss: 0.6862127184867859
epoch: 0  loss: 0.6927137970924377
Accuracy: 0.4950
-------------------------------
epoch: 0  loss: 0.6892074942588806
epoch: 0  loss: 0.6802089214324951
Accuracy: 0.5033
-------------------------------
epoch: 0  loss: 0.7204221487045288
epoch: 0  loss: 0.6712704300880432
Accuracy: 0.4983
-------------------------------
epoch: 0  loss: 0.6740809679031372
epoch: 0  loss: 0.6814371347427368
Accuracy: 0.4967
-------------------------------


In [6]:
class AvgrageMeter(object):

    def __init__(self):
        self.reset()

    def reset(self):
        self.avg = 0
        self.sum = 0
        self.cnt = 0

    def update(self, val, n=1):
        self.sum += val * n
        self.cnt += n
        self.avg = self.sum / self.cnt


#混淆矩阵指标
class ConfuseMeter(object):

    def __init__(self):
        self.reset()

    def reset(self):
        # 标签的分类：0 pos 1 neg 
        self.confuse_mat = torch.zeros(2,2)
        self.tp = self.confuse_mat[0,0]
        self.fp = self.confuse_mat[0,1]
        self.tn = self.confuse_mat[1,1]
        self.fn = self.confuse_mat[1,0]
        self.acc = 0
        self.pre = 0
        self.rec = 0
        self.F1 = 0
    def update(self, output, label):
        pred = output.argmax(dim = 1)
        for l, p in zip(label.view(-1),pred.view(-1)):
            self.confuse_mat[p.long(), l.long()] += 1 # 对应的格子加1
        self.tp = self.confuse_mat[0,0]
        self.fp = self.confuse_mat[0,1]
        self.tn = self.confuse_mat[1,1]
        self.fn = self.confuse_mat[1,0]
        self.acc = (self.tp+self.tn) / self.confuse_mat.sum()
        self.pre = self.tp / (self.tp + self.fp)
        self.rec = self.tp / (self.tp + self.fn)
        self.F1 = 2 * self.pre*self.rec / (self.pre + self.rec)


## topk的准确率计算
def accuracy(output, label, topk=(1,)):
    maxk = max(topk) 
    batch_size = label.size(0)
    
    # 获取前K的索引
    _, pred = output.topk(maxk, 1, True, True) # 使用topk来获得前k个的索引
    pred = pred.t() # 进行转置
    # eq按照对应元素进行比较 view(1,-1) 自动转换到行为1,的形状， expand_as(pred) 扩展到pred的shape
    # expand_as 执行按行复制来扩展，要保证列相等
    correct = pred.eq(label.view(1, -1).expand_as(pred)) # 与正确标签序列形成的矩阵相比，生成True/False矩阵
    # print(correct)
    
    rtn = []
    for k in topk:
        correct_k = correct[:k].contiguous().view(-1).float().sum(0) # 前k行的数据 然后平整到1维度，来计算true的总个数
        rtn.append(correct_k.mul_(100.0 / batch_size)) # mul_() ternsor 的乘法  正确的数目/总的数目 乘以100 变成百分比
    return rtn

In [12]:

class BiRNN(nn.Module):
    def __init__(self, vocab_size, input_size, hidden_size, num_classes):
        super(BiRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=0)
        self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, num_classes)  # 隐层包含向前层和向后层两层，所以隐层共有两倍的Hidden_size
    
    def forward(self, x, text_len):

        text_emb = self.embedding(x)
        packed_input = pack_padded_sequence(text_emb, text_len, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        out_forward = output[:, -1, :]
        # out_reverse = output[:, 0, :]
        # out_reduced = torch.cat((out_forward, out_reverse), 1)

        text_fea = self.fc(out_forward)
        text_fea = torch.squeeze(text_fea, 1)
        text_out = torch.sigmoid(text_fea)
        return text_out


vocab_size = len(tokenizer.vocab)

# 实例化一个Birectional RNN模型
model = BiRNN(vocab_size, input_size, hidden_size, num_classes).to(device)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# 学习率调整
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 10,gamma=0.1)


In [8]:
#一个epoch的训练逻辑
def train(epoch, train_loader, device, model, criterion, optimizer, scheduler, tensorboard_path):
    model.train()
    top1 = AvgrageMeter()
    model = model.to(device)
    train_loss = 0.0
    for i, data in enumerate(train_loader, 0):  # 0是下标起始位置默认为0
        inputs, labels, batch_seq_len = data[0].to(device), data[1].to(device), data[2]
        # 初始为0，清除上个batch的梯度信息
        optimizer.zero_grad()
        outputs = model(inputs,batch_seq_len)

        loss = criterion(outputs,labels)
        loss.backward()
        optimizer.step()

        _, pred = outputs.topk(1)

        prec1, prec2= accuracy(outputs, labels, topk=(1,2))
        n = inputs.size(0)
        top1.update(prec1.item(), n)
        train_loss += loss.item()
        postfix = {'train_loss': '%.6f' % (train_loss / (i + 1)), 'train_acc': '%.6f' % top1.avg}
        train_loader.set_postfix(log=postfix)

        # ternsorboard 曲线绘制
        if os.path.exists(tensorboard_path) == False: 
            os.mkdir(tensorboard_path)    
        writer = SummaryWriter(tensorboard_path)
        writer.add_scalar('Train/Loss', loss.item(), epoch)
        writer.add_scalar('Train/Accuracy', top1.avg, epoch)
        writer.flush()
    scheduler.step()


def validate(epoch, validate_loader, device, model, criterion, tensorboard_path):
    val_acc = 0.0
    model = model.to(device)
    model.eval()
    with torch.no_grad():  # 进行评测的时候网络不更新梯度
        val_top1 = AvgrageMeter()
        validate_loader = tqdm(validate_loader)
        validate_loss = 0.0
        for i, data in enumerate(validate_loader):
            inputs, labels, batch_seq_len = data[0].to(device), data[1].to(device), data[2]

            outputs = model(inputs, batch_seq_len)
            loss = criterion(outputs, labels)

            prec1, prec2 = accuracy(outputs, labels, topk=(1, 2))
            n = inputs.size(0)
            val_top1.update(prec1.item(), n)
            validate_loss += loss.item()
            postfix = {'validate_loss': '%.6f' % (validate_loss / (i + 1)), 'validate_acc': '%.6f' % val_top1.avg}
            validate_loader.set_postfix(log=postfix)
            
            # ternsorboard 曲线绘制
            if os.path.exists(tensorboard_path) == False: 
                os.mkdir(tensorboard_path)    
            writer = SummaryWriter(tensorboard_path)
            writer.add_scalar('Validate/Loss', loss.item(), epoch)
            writer.add_scalar('Validate/Accuracy', val_top1.avg, epoch)
            writer.flush()
        val_acc = val_top1.avg
    return val_acc


def test(validate_loader, device, model, criterion):
    val_acc = 0.0
    model = model.to(device)
    model.eval()
    confuse_meter = ConfuseMeter()
    with torch.no_grad():
        val_top1 = AvgrageMeter()
        validate_loader = tqdm(validate_loader)
        validate_loss = 0.0
        for i, data in enumerate(validate_loader, 0):  # 0是下标起始位置默认为0
            inputs, labels, batch_seq_len = data[0].to(device), data[1].to(device), data[2]
            outputs, _ = model(inputs, batch_seq_len)

            prec1, prec2 = accuracy(outputs, labels, topk=(1, 2))
            n = inputs.size(0)
            val_top1.update(prec1.item(), n)
            confuse_meter.update(outputs, labels)
            postfix = {
                'test_acc': '%.6f' % val_top1.avg,
                'confuse_acc': '%.6f' % confuse_meter.acc
            }
            validate_loader.set_postfix(log=postfix)
        val_acc = val_top1.avg
    return confuse_meter

In [9]:
tensorboard_path = './log'

for epoch in range(num_epochs):
    train_data_loader = tqdm(train_data_loader)
    train_data_loader.set_description('[%s%04d/%04d %s%f]' % ('Epoch:', epoch + 1, num_epochs, 'lr:', scheduler.get_lr()[0]))
    
    train(epoch, train_data_loader, device, model, criterion, optimizer,scheduler, tensorboard_path)
    validate(epoch, dev_data_loader, device, model, criterion, tensorboard_path)

  0%|          | 0/600 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

  0%|          | 0/600 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

In [10]:
total_step = len(train_data_loader)

for epoch in range(num_epochs):
    for i, data in enumerate(train_data_loader):
        inputs, labels, batch_seq_len = data[0].to(device), data[1].to(device), data[2]

        # 前向传播
        outputs = model(inputs, batch_seq_len)
        loss = criterion(outputs, labels)
        
        # 反向传播和优化，注意梯度每次清零
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))



RuntimeError: cudnn RNN backward can only be called in training mode