In [2]:
import re
from d2l import torch as d2l
import torch
import jieba
from sklearn.model_selection import train_test_split
import torch.nn as nn
import bz2

In [None]:
def read_comments(path):
    """Read the review CSV at ``path`` and return tokenised comments and labels.

    The file is parsed by comma position rather than with a CSV parser: the
    first line is treated as a header and skipped, the rating of a row is the
    text after its last comma, and the comment is the text between the fourth
    comma and the last comma (so commas inside the comment are preserved).
    Double quotes and spaces are removed from the comment before it is
    segmented with ``jieba.cut``.

    Args:
        path: Path to a UTF-8 encoded CSV file.

    Returns:
        A ``(comments, labels)`` tuple where ``comments`` is a list of token
        lists (one per review) and ``labels`` is an int64 tensor holding
        ``rating - 1`` for each review.

    Raises:
        ValueError: If a non-blank data row contains fewer than four commas.
    """
    comments, ratings = [], []
    with open(path, "r", encoding='utf-8') as fp:
        # The first line holds the column names, not data.
        next(fp, None)
        for line_no, line in enumerate(fp, start=2):
            row = line.rstrip('\n')
            # Skip blank lines wherever they occur instead of blindly dropping
            # the last line, which loses a review when the file has no
            # trailing newline.
            if not row.strip():
                continue
            if row.count(',') < 4:
                raise ValueError(
                    f"line {line_no} of {path!r} has fewer than 4 commas: {row!r}")
            # Rating: everything after the last comma.
            head, rating = row.rsplit(',', 1)
            # Comment: everything after the fourth comma of what remains
            # (empty when the row has exactly four commas).
            fields = head.split(',', 4)
            text = fields[4] if len(fields) > 4 else ''
            text = text.replace('"', '').replace(' ', "")
            comments.append(list(jieba.cut(text, cut_all=False)))
            ratings.append(int(float(rating)))
    # Shift the ratings down by one so the smallest rating (1) maps to class 0.
    labels = torch.tensor(ratings, dtype=torch.long) - 1
    return comments, labels

In [None]:
def process_data(root, batch_size, num_steps=60, min_freq=3, test_size=0.2,
                 random_state=None):
    """Build train/test data iterators and the vocabulary for the review set.

    Args:
        root: Path of the review CSV file, forwarded to ``read_comments``.
        batch_size: Mini-batch size of both iterators.
        num_steps: Fixed sequence length; shorter comments are padded with
            ``'<pad>'`` and longer ones are truncated.
        min_freq: Tokens occurring fewer than ``min_freq`` times are left out
            of the vocabulary to keep it small.
        test_size: Fraction of the samples held out for testing, forwarded to
            ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split``; pass an integer
            to make the split reproducible. ``None`` keeps the previous
            (unseeded) behaviour.

    Returns:
        A ``(train_iter, test_iter, vocab)`` tuple.
    """
    # Tokenised comments and their zero-based rating labels.
    comments, labels = read_comments(root)
    # Map every sufficiently frequent token to an index.
    vocab = d2l.Vocab(comments, min_freq=min_freq, reserved_tokens=['<pad>'])
    # Split into training and test portions.
    train_comment, test_comment, train_label, test_label = train_test_split(
        comments, labels, test_size=test_size, random_state=random_state)

    pad_idx = vocab['<pad>']

    def to_features(tokenised):
        # Convert token lists to index lists of exactly ``num_steps`` entries.
        return torch.tensor([
            d2l.truncate_pad(vocab[comment], num_steps, pad_idx)
            for comment in tokenised])

    train_features = to_features(train_comment)
    test_features = to_features(test_comment)

    train_iter = d2l.load_array((train_features, train_label), batch_size)
    test_iter = d2l.load_array((test_features, test_label), batch_size,
                               is_train=False)
    return train_iter, test_iter, vocab

In [None]:
class BiRNN(nn.Module):
    """Bidirectional LSTM classifier over token-index sequences.

    Args:
        vocab_size: Number of rows of the embedding table.
        embed_size: Dimension of each token embedding.
        num_hiddens: Hidden size of the LSTM (per direction).
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output classes. Defaults to 5, the number of
            rating levels used in this notebook.
        **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens,
                 num_layers, num_classes=5, **kwargs):
        super(BiRNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # bidirectional=True makes the encoder a bidirectional recurrent network.
        self.encoder = nn.LSTM(embed_size, num_hiddens, num_layers=num_layers,
                                bidirectional=True)
        # The decoder input concatenates the first and last time steps, each
        # of which holds both directions: 2 * (2 * num_hiddens).
        self.decoder = nn.Linear(4 * num_hiddens, num_classes)

    def forward(self, inputs):
        """Return class logits of shape (batch size, num_classes).

        ``inputs`` is a (batch size, time steps) tensor of token indices.
        """
        # The LSTM is not created with batch_first=True, so time must be the
        # first dimension; transpose before embedding. The result has shape
        # (time steps, batch size, embed_size).
        embeddings = self.embedding(inputs.T)
        self.encoder.flatten_parameters()
        # outputs holds the last layer's hidden states at every time step,
        # shape (time steps, batch size, 2 * num_hiddens).
        outputs, _ = self.encoder(embeddings)
        # Concatenate the hidden states of the first and last time steps:
        # shape (batch size, 4 * num_hiddens).
        encoding = torch.cat((outputs[0], outputs[-1]), dim=1)
        outs = self.decoder(encoding)
        return outs

In [None]:
class TokenEmbedding:
    """Pretrained word vectors loaded from a bz2-compressed text file.

    The file's first line is skipped (treated as a header); every following
    line is ``token v1 v2 ... vn`` separated by single spaces. Index 0 is
    reserved for ``'<unk>'`` and maps to an all-zero vector.

    Attributes:
        idx_to_token: List of tokens, ``idx_to_token[0] == '<unk>'``.
        idx_to_vec: Float tensor with one row per token.
        token_to_idx: Dict mapping a token to its row index.
        unknown_idx: Row index used for tokens not in the file (0).
    """

    def __init__(self, file_path):
        self.idx_to_token, self.idx_to_vec = self._load_embedding(
            file_path)
        self.unknown_idx = 0
        self.token_to_idx = {token: idx for idx, token in
                             enumerate(self.idx_to_token)}

    def _load_embedding(self, file_path):
        """Parse ``file_path`` and return ``(idx_to_token, idx_to_vec)``.

        Raises:
            ValueError: If the file contains no usable vector line.
        """
        idx_to_token, idx_to_vec = ['<unk>'], []

        # The file is bz2-compressed, so it must be opened with bz2.open.
        # Text mode streams and decodes line by line instead of holding the
        # whole decompressed file in memory; newline='\n' keeps line
        # boundaries identical to splitting the raw bytes on b'\n'.
        with bz2.open(file_path, 'rt', encoding='utf-8', newline='\n') as fp:
            # The first line carries no vector and is discarded.
            next(fp, None)
            for line in fp:
                # Drop the trailing newline / spaces, then split on spaces.
                elems = line.rstrip().split(' ')
                # A blank or token-only line has no vector; skipping it avoids
                # a ragged list that cannot be turned into a tensor.
                if len(elems) < 2:
                    continue
                # First element is the token, the rest is its vector.
                idx_to_token.append(elems[0])
                idx_to_vec.append([float(elem) for elem in elems[1:]])

        if not idx_to_vec:
            raise ValueError(f"no word vectors found in {file_path!r}")
        # Prepend the zero vector for '<unk>' at index 0.
        idx_to_vec = [[0.0] * len(idx_to_vec[0])] + idx_to_vec
        return idx_to_token, torch.tensor(idx_to_vec)

    def __getitem__(self, tokens):
        """Return a (len(tokens), dim) tensor of vectors for ``tokens``.

        Tokens that are not in the file get the ``'<unk>'`` (zero) vector.
        """
        indices = [self.token_to_idx.get(token, self.unknown_idx)
                   for token in tokens]
        # An explicit integer dtype keeps indexing valid for an empty list.
        vecs = self.idx_to_vec[torch.tensor(indices, dtype=torch.long)]
        return vecs

    def __len__(self):
        """Number of tokens, including ``'<unk>'``."""
        return len(self.idx_to_token)

In [None]:
def train_one_epoch(net, X, y, loss, trainer, device):
    """Run a single optimisation step on one mini-batch.

    Despite its name, this processes exactly one batch ``(X, y)``: it moves
    the batch to ``device``, does a forward pass, back-propagates the summed
    loss and applies one optimizer step.

    Args:
        net: The model; it is switched to training mode.
        X: Input tensor, or a list of input tensors.
        y: Target tensor.
        loss: Callable ``loss(pred, y)``; its result is summed before
            back-propagation.
        trainer: Optimizer used for ``zero_grad`` / ``step``.
        device: Device the batch is moved to.

    Returns:
        ``(loss_sum, acc_sum)``: the summed batch loss (a tensor) and the
        value of ``d2l.accuracy(pred, y)`` for this batch.
    """
    # Move the batch onto the target device.
    X = [part.to(device) for part in X] if isinstance(X, list) else X.to(device)
    y = y.to(device)

    net.train()
    trainer.zero_grad()
    y_hat = net(X)
    batch_loss = loss(y_hat, y).sum()
    batch_loss.backward()
    trainer.step()

    return batch_loss, d2l.accuracy(y_hat, y)

In [None]:
def train(net, train_iter, test_iter, loss, trainer, num_epochs,
               device=d2l.try_gpu()):
    """Train ``net`` and plot training loss / accuracy and test accuracy.

    The linear and LSTM weights are Xavier-initialised, the embedding layer
    is filled with the pretrained vectors and frozen, and the model is then
    trained for ``num_epochs`` epochs. A summary line is printed at the end.

    NOTE: the pretrained vectors are read from the module-level variable
    ``embeds``, which must be defined before this function is called.

    Args:
        net: Model exposing an ``embedding`` layer.
        train_iter: Iterable of ``(features, labels)`` training batches.
        test_iter: Iterable of test batches, passed to
            ``d2l.evaluate_accuracy_gpu``.
        loss: Loss callable, forwarded to ``train_one_epoch``.
        trainer: Optimizer, forwarded to ``train_one_epoch``.
        num_epochs: Number of passes over ``train_iter``.
        device: Device to train on.
    """

    # Weight initialisation.
    def init_weights(m):
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
        if isinstance(m, nn.LSTM):
            # Only the weight matrices are re-initialised, not the biases.
            for name, param in m.named_parameters():
                if "weight" in name:
                    nn.init.xavier_uniform_(param)

    net.apply(init_weights)

    # Load the pretrained word vectors into the embedding layer and exclude
    # it from gradient updates.
    net.embedding.weight.data.copy_(embeds)
    net.embedding.weight.requires_grad = False

    num_batches = len(train_iter)
    # Plot about five points per epoch. Clamp to 1 so that fewer than five
    # batches does not turn the modulo below into a division by zero.
    plot_every = max(1, num_batches // 5)
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 1],
                            legend=['train loss', 'train acc', 'test acc'])
    net.to(device)
    metric, test_acc = None, None
    for epoch in range(num_epochs):
        # Accumulates: summed training loss, summed training accuracy,
        # number of samples.
        metric = d2l.Accumulator(3)
        for i, (features, labels) in enumerate(train_iter):
            l, acc = train_one_epoch(net, features, labels, loss, trainer, device)
            metric.add(l, acc, labels.shape[0])

            if (i + 1) % plot_every == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (metric[0] / metric[2], metric[1] / metric[2],
                              None))

        # Accuracy on the test set after this epoch.
        test_acc = d2l.evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))

    # Nothing to report if no epoch ran or no sample was seen.
    if metric is None or metric[2] == 0:
        return

    print(f'平均损失: {metric[0] / metric[2]:.3f}, 训练准确度: '
          f'{metric[1] / metric[2]:.3f}, 测试准确度: {test_acc:.3f}')


In [None]:
# Hyper-parameters, data pipeline, pretrained embeddings, model, optimizer and loss.
batch_size = 512                                   # mini-batch size
lr = 0.0003                                        # learning rate
num_epochs = 20                                    # number of training epochs
embed_size = 300                                   # embedding dimension (300-d vectors were chosen)
num_hiddens = 256                                  # hidden units of the recurrent network
num_layers = 2                                     # number of recurrent layers
devices = d2l.try_gpu()                            # device

# NOTE: both paths below are empty placeholders and must be filled in before running.
comment_path = ''     # path of the review dataset file
train_iter, test_iter, vocab = process_data(comment_path,batch_size)

embed_path = ''     # path of the word-vector file
my_embedding = TokenEmbedding(embed_path)
embeds = my_embedding[vocab.idx_to_token]          # look up a pretrained vector for every token of the vocab


net = BiRNN(len(vocab), embed_size, num_hiddens, num_layers)  # build the network
trainer = torch.optim.Adam(net.parameters(), lr=lr)           # optimizer
loss = nn.CrossEntropyLoss(reduction="none")                  # loss function (per-sample, not reduced)

In [None]:
# Run training; the device is obtained from d2l.try_gpu() at call time.
train(net, train_iter, test_iter, loss, trainer,num_epochs,d2l.try_gpu())

In [None]:
def predict(net, vocab, comment):
    """Predict the rating of a single review text.

    Args:
        net: Trained model mapping a (1, time steps) index tensor to logits.
        vocab: Mapping that converts a list of tokens to a list of indices
            via ``vocab[tokens]``.
        comment: The raw review string; it is segmented with ``jieba.cut``.

    Returns:
        A tensor with one element: the index of the largest logit plus one
        (labels are stored zero-based, ratings start at 1).

    Raises:
        ValueError: If ``comment`` yields no tokens.
    """
    net.eval()
    tokens = list(jieba.cut(comment, cut_all=False))
    if not tokens:
        raise ValueError("comment is empty; nothing to predict")
    # Put the input on the device the model actually lives on; picking a GPU
    # unconditionally fails when the model is on the CPU.
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    token_ids = torch.tensor(vocab[tokens], device=device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        label = torch.argmax(net(token_ids.reshape(1, -1)), dim=1)
    return label+1

In [None]:
# Try the trained model on a sample review; as the cell's last expression the result is displayed.
predict(net,vocab,'包装很好,内容也不错')