## 2. 导入依赖库

In [1]:
import math
import numpy as np
import pandas as pd
import os
import math
import random
import codecs
from pathlib import Path

import mindspore
import mindspore.dataset as ds
import mindspore.nn as nn
from mindspore import Tensor
from mindspore import context
from mindspore.train.model import Model
from mindspore.nn.metrics import Accuracy
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.ops import operations as ops

## 3. 超参数设置

In [2]:
from easydict import EasyDict as edict
# Experiment configuration shared by the cells below.
cfg = edict({
    'name': 'movie review',
    'pre_trained': False,  # when True, weights are loaded from 'checkpoint_path' before training
    'num_classes': 2,  # sentiment is either negative or positive, so the classifier has two outputs
    'batch_size': 64,
    'epoch_size': 4,  # author's note: changing this so that a warm-up phase exists did not improve results
    'weight_decay': 3e-5,
    'data_path': './data/',  # directory handed to MovieReview as root_dir
    'device_target': 'Ascend',
    'device_id': 0,
    'keep_checkpoint_max': 1,
    'checkpoint_path': './ckpt/train_textcnn-4_149.ckpt',  # read only when 'pre_trained' is True or no explicit checkpoint_path is set
    'word_len': 51,  # fixed number of token ids per sentence (passed as maxlen / word_len)
    'vec_length': 40  # embedding size passed to TextCNN
})

In [3]:
# Initialise the runtime: static graph mode on the device selected in cfg.
context.set_context(mode=context.GRAPH_MODE, device_target=cfg.device_target, device_id=cfg.device_id)

## 4. 数据预处理

In [4]:
# Data preview: print the first five lines of the negative file, then of the positive file.
for review_path, heading in (("./data/rt-polarity.neg", "Negative reivews:"),
                             ("./data/rt-polarity.pos", "Positive reivews:")):
    with open(review_path, 'r', encoding='utf-8') as f:
        print(heading)
        for i in range(5):
            # readline() keeps the trailing newline, hence the blank line after each entry.
            print("[{0}]:{1}".format(i, f.readline()))

Negative reivews:
[0]:simplistic , silly and tedious . 

[1]:it's so laddish and juvenile , only teenage boys could possibly find it funny . 

[2]:exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . 

[3]:[garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation . 

[4]:a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification . 

Positive reivews:
[0]:the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 

[1]:the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth . 

[2]:effective but too-tepid biopic

In [5]:
class Generator():
    '''
    Random-access view over a list of [token_ids, label] samples.

    Used as the `source` of a GeneratorDataset: indexing returns the sample's
    token ids and its label, each converted to an int32 NumPy array.
    '''
    def __init__(self, input_list):
        # Sequence of samples; element [0] holds the token ids, element [1] the label.
        self.input_list = input_list

    def __len__(self):
        return len(self.input_list)

    def __getitem__(self, item):
        sample = self.input_list[item]
        token_ids = np.array(sample[0], dtype=np.int32)
        label = np.array(sample[1], dtype=np.int32)
        return (token_ids, label)


class MovieReview:
    '''
    Movie-review sentiment dataset built from two text files (one positive,
    one negative, one review per line) found directly inside `root_dir`.

    After construction the instance exposes:
        Pos / Neg    : lists of [token_id_vector, label] for each polarity
        PosNeg       : Pos + Neg (shares the same sample objects)
        Vocab        : dict mapping word -> integer id
        train / test : the split samples (train is shuffled with `random`)
        word_num, minlen, maxlen : corpus statistics gathered while reading
    '''

    # Ordered (old, new) substitutions applied to every raw line. They are
    # applied sequentially because the order matters: e.g. '--' must be
    # removed before the remaining '-' characters are turned into spaces.
    _REPLACEMENTS = (
        ('\n', ''), ('"', ''), ('\'', ''), ('.', ''), (',', ''),
        ('[', ''), (']', ''), ('(', ''), (')', ''), (':', ''),
        ('--', ''), ('-', ' '), ('\\', ''),
        ('0', ''), ('1', ''), ('2', ''), ('3', ''), ('4', ''),
        ('5', ''), ('6', ''), ('7', ''), ('8', ''), ('9', ''),
        ('`', ''), ('=', ''), ('$', ''), ('/', ''), ('*', ''),
        (';', ''), ('<b>', ''), ('%', ''),
    )

    # Index of the chunk that is held out as the test set (the third chunk,
    # as in the original implementation) whenever that many chunks exist.
    _TEST_CHUNK_INDEX = 2

    def __init__(self, root_dir, maxlen, split):
        '''
        input:
            root_dir: directory containing exactly two review files
            maxlen: fixed length of every sentence vector (longer sentences
                    are truncated, shorter ones are padded with 0)
            split: fraction of the data used for training, 0 <= split < 1
        raises:
            ValueError: root_dir is not a directory, does not contain exactly
                        two files, or split is out of range
        '''
        self.path = root_dir
        self.feelMap = {
            'neg':0,
            'pos':1
        }
        self.doConvert = False

        # Validate the data directory.
        mypath = Path(self.path)
        if not mypath.exists() or not mypath.is_dir():
            raise ValueError("please check the root_dir!")

        # Collect the files that sit directly in the data directory
        # (sub-directories are ignored). Sorting makes the order deterministic.
        _, _, names = next(os.walk(self.path), (self.path, [], []))
        self.files = [os.path.join(self.path, name) for name in sorted(names)]

        # Exactly two files are expected: the .neg and the .pos file.
        if len(self.files) != 2:
            raise ValueError("There are {} files in the root_dir".format(len(self.files)))

        # Read the data. The source files are only read, never rewritten.
        self.word_num = 0
        self.minlen = float("inf")
        self.maxlen = float("-inf")  # longest sentence seen, not the `maxlen` argument
        self.Pos = []
        self.Neg = []
        for filename in self.files:
            self.read_data(filename)
        self.PosNeg = self.Pos + self.Neg

        self.text2vec(maxlen=maxlen)     # build the vocabulary and encode sentences as ids
        self.split_dataset(split=split)  # split into training and test sets

    @staticmethod
    def _read_lines(file_path):
        '''Return the lines of file_path, decoded as UTF-8 with a Latin-1 fallback.'''
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.readlines()
        except UnicodeDecodeError:
            # Latin-1 maps every byte to a character, so this cannot fail.
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.readlines()

    def read_data(self, filePath):
        '''
        Tokenise every line of filePath and append [words, label] samples to
        self.Pos or self.Neg, updating word_num / minlen / maxlen.

        The polarity is decided from the file NAME only (a name containing
        'pos' is positive, anything else negative), so directory names
        cannot influence the label.
        '''
        label_key = 'pos' if 'pos' in os.path.basename(filePath) else 'neg'
        bucket = self.Pos if label_key == 'pos' else self.Neg
        label = self.feelMap[label_key]

        for sentence in self._read_lines(filePath):
            for old, new in self._REPLACEMENTS:
                sentence = sentence.replace(old, new)
            # Split on single spaces and drop the empty tokens left by
            # consecutive spaces.
            words = [word for word in sentence.split(' ') if word]
            if not words:
                continue
            self.word_num += len(words)
            self.maxlen = max(self.maxlen, len(words))
            self.minlen = min(self.minlen, len(words))
            bucket.append([words, label])

    def text2vec(self, maxlen):
        '''
        Replace each sample's word list by a vector of `maxlen` vocabulary ids.

        Words receive ids in order of first appearance (starting at 0);
        sentences longer than maxlen are truncated and words beyond the cut
        are not added to the vocabulary; shorter sentences are padded with 0.

        NOTE(review): id 0 is both the padding value and the id of the first
        word encountered. Reserving 0 for padding would change the vocabulary
        size and invalidate existing checkpoints, so it is left unchanged.
        '''
        # Vocab = {word : index}
        self.Vocab = dict()

        for SentenceLabel in self.Pos+self.Neg:
            vector = [0]*maxlen
            for index, word in enumerate(SentenceLabel[0][:maxlen]):
                # setdefault assigns the next free id to unseen words and
                # returns the existing id otherwise.
                vector[index] = self.Vocab.setdefault(word, len(self.Vocab))
            # The sample list is shared with self.Pos / self.Neg / self.PosNeg,
            # so this updates all of them.
            SentenceLabel[0] = vector
        self.doConvert = True

    @classmethod
    def _hold_out(cls, samples, split):
        '''
        Cut `samples` into chunks of ceil((1-split)*len(samples)) items and
        return (train_part, test_part), where test_part is one chunk and
        train_part is everything else in the original order.
        '''
        chunk_size = math.ceil((1-split)*len(samples))
        if chunk_size == 0:  # only possible when there are no samples
            return list(samples), []
        chunk_count = math.ceil(len(samples)/chunk_size)
        # Fall back to the last chunk when fewer than three chunks exist.
        index = min(cls._TEST_CHUNK_INDEX, chunk_count-1)
        start, stop = index*chunk_size, (index+1)*chunk_size
        return samples[:start]+samples[stop:], samples[start:stop]

    def split_dataset(self, split):
        '''
        Split into self.train and self.test.

        Positive and negative samples are split separately so both sets keep
        both classes. self.train is shuffled in place with the global
        `random` generator; self.test keeps file order (positives first).
        '''
        if not 0 <= split < 1:
            raise ValueError("split must be in [0, 1), got {}".format(split))

        pos_train, pos_test = self._hold_out(self.Pos, split)
        neg_train, neg_test = self._hold_out(self.Neg, split)
        self.test = pos_test+neg_test
        self.train = pos_train+neg_train

        random.shuffle(self.train)

    def get_dict_len(self):
        '''
        Return the vocabulary size, or -1 (after printing a message) when
        text2vec has not run yet.
        '''
        if self.doConvert:
            return len(self.Vocab)
        else:
            print("Haven't finished Text2Vec")
            return -1

    @staticmethod
    def _build_dataset(samples, batch_size):
        '''Wrap samples in an unshuffled GeneratorDataset batched to batch_size.'''
        dataset = ds.GeneratorDataset(
                                        source=Generator(input_list=samples),
                                        column_names=["data","label"],
                                        shuffle=False
                                        )
        # drop_remainder=True discards a final incomplete batch.
        return dataset.batch(batch_size=batch_size,drop_remainder=True)

    def create_train_dataset(self, epoch_size, batch_size):
        '''
        Return the batched training pipeline, repeated `epoch_size` times.
        '''
        dataset = self._build_dataset(self.train, batch_size)
        dataset=dataset.repeat(epoch_size)
        return dataset

    def create_test_dataset(self, batch_size):
        '''
        Return the batched test pipeline (not repeated).
        '''
        return self._build_dataset(self.test, batch_size)

In [6]:
# Read the reviews, encode them as id vectors and split them into train/test sets.
instance = MovieReview(root_dir=cfg.data_path, maxlen=cfg.word_len, split=0.9)
# Build the batched training pipeline.
dataset = instance.create_train_dataset(batch_size=cfg.batch_size,epoch_size=cfg.epoch_size)
# Size of the training pipeline as reported by the dataset (it has already been
# batched and repeated inside create_train_dataset).
batch_num = dataset.get_dataset_size() 

In [7]:
# Show the vocabulary size, then the first training batch and its second sample.
vocab_size = instance.get_dict_len()
print("vocab_size:{0}".format(vocab_size))
item = dataset.create_dict_iterator()
for i, data in enumerate(item):
    if i >= 1:
        break
    print(data)
    print(data['data'][1])

vocab_size:18848
{'data': Tensor(shape=[64, 51], dtype=Int32, value=
[[ 128,   15,   89 ...    0,    0,    0],
 [ 118,   15,  218 ...    0,    0,    0],
 [  15, 6705,   10 ...    0,    0,    0],
 ...
 [ 145,    2,   75 ...    0,    0,    0],
 [ 128, 2412,  128 ...    0,    0,    0],
 [   0,  747,  111 ...    0,    0,    0]]), 'label': Tensor(shape=[64], dtype=Int32, value= [1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 
 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 
 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0])}
[118  15 218  11 219  88  32   0 100 220 221 122 222  86 223 224 225 226
 227 228  82   2  36 229  36 230 231 155 232 164  63 155 233 234   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]


## 5. 模型训练

### 5.1 训练参数设置

In [8]:
# Per-step learning-rate schedule: an optional warm-up phase that ramps up to
# 1e-3, a constant phase at 1e-3, then a shrink phase with progressively
# smaller rates. Each phase holds its rate for batch_num consecutive steps
# before moving to the next value.
#
# Fix: the epoch index is now the OUTER loop. Previously it was the inner loop,
# so the rate cycled through all phase values on consecutive steps instead of
# changing once every batch_num steps.
#
# NOTE(review): the shrink phase uses floor(epoch_size * 3 / 5) epochs while the
# constant phase subtracts floor(epoch_size * 2 / 5); with epoch_size = 4 the
# list therefore covers 5 * batch_num steps. Kept as in the original; confirm
# whether 3/5 is intended.
warm_up_epochs = math.floor(cfg.epoch_size / 5)
shrink_epochs = math.floor(cfg.epoch_size * 3 / 5)
normal_epochs = cfg.epoch_size - warm_up_epochs - math.floor(cfg.epoch_size * 2 / 5)

# With warm_up_epochs == 0 the comprehension body never runs, so there is no
# division by zero.
warm_up = [1e-3 / warm_up_epochs * (i + 1)
           for i in range(warm_up_epochs) for _ in range(batch_num)]
normal_run = [1e-3 for _ in range(normal_epochs * batch_num)]
shrink = [1e-3 / (16 * (i + 1))
          for i in range(shrink_epochs) for _ in range(batch_num)]
learning_rate = warm_up + normal_run + shrink

In [9]:
def _weight_variable(shape, factor=0.01):
    """Return a float32 Tensor of the given shape filled with standard-normal noise scaled by `factor`.

    Used to initialise the convolution kernels.
    """
    gaussian = np.random.randn(*shape).astype(np.float32)
    return Tensor(gaussian * factor)


def make_conv_layer(kernel_size):
    """Build a Conv2d layer with 1 input channel and 96 output channels.

    kernel_size is a (height, width) pair; the kernel weights come from
    _weight_variable, and the layer uses explicit padding of 1 and a bias.
    """
    out_channels = 96
    initial_weight = _weight_variable((out_channels, 1, *kernel_size))
    return nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=kernel_size,
                     padding=1, pad_mode="pad", weight_init=initial_weight, has_bias=True)


class TextCNN(nn.Cell):
    """Text classifier: embedding, three parallel convolution branches
    (kernel heights 3, 4 and 5), max reduction, dropout and a dense layer."""
    def __init__(self, vocab_len, word_len, num_classes, vec_length):
        """
        vocab_len: number of rows of the embedding table
        word_len: number of token ids per sentence
        num_classes: size of the output layer
        vec_length: embedding size; also the width of every convolution kernel
        """
        super(TextCNN, self).__init__()
        self.vec_length = vec_length
        self.word_len = word_len
        self.num_classes = num_classes

        # Adds an axis to the input so that, after embedding, the tensor is 4-D as Conv2d expects.
        self.unsqueeze = ops.ExpandDims()
        # Learned word embeddings (author's note: the rationale is given in the accompanying report).
        self.embedding = nn.Embedding(vocab_len, self.vec_length, embedding_table='normal')

        self.slice = ops.Slice()  # not used in construct
        # Three convolution branches with different kernel heights.
        self.layer1 = self.make_layer(kernel_height=3)
        self.layer2 = self.make_layer(kernel_height=4)
        self.layer3 = self.make_layer(kernel_height=5)

        # Concatenates the branch outputs along axis 1.
        self.concat = ops.Concat(1)

        # Dense output layer; 96*3 inputs = 96 channels from each of the three branches.
        self.fc = nn.Dense(96*3, self.num_classes)
        self.drop = nn.Dropout(keep_prob=0.5)  # regularisation against overfitting
        self.print = ops.Print()  # not used in construct
        # Despite the attribute name, this op is ReduceMax: it takes a maximum, not a mean.
        self.reducemean = ops.ReduceMax(keep_dims=False)
        
    def make_layer(self, kernel_height):
        """Return one branch: convolution with a (kernel_height, vec_length) kernel, ReLU, then max pooling."""
        return nn.SequentialCell(
            [
                make_conv_layer((kernel_height,self.vec_length)),
                nn.ReLU(),  # activation
                nn.MaxPool2d(kernel_size=(self.word_len-kernel_height+1,1)),  # max pooling along the first spatial axis
            ]
        )

    def construct(self,x):
        """Forward pass. x holds token ids — the batch preview in this notebook
        shows shape (batch, word_len), Int32. Returns the dense layer's output."""
        x = self.unsqueeze(x, 1)
        x = self.embedding(x)
        x1 = self.layer1(x)
        x2 = self.layer2(x)
        x3 = self.layer3(x)

        # Reduce axes 2 and 3 of each branch with a max.
        x1 = self.reducemean(x1, (2, 3))
        x2 = self.reducemean(x2, (2, 3))
        x3 = self.reducemean(x3, (2, 3))

        # Join the three reduced branch outputs into one feature vector per sample.
        x = self.concat((x1, x2, x3))
        x = self.drop(x)
        x = self.fc(x)
        return x

In [11]:
# Instantiate the network defined above, sized from the vocabulary and cfg.
net = TextCNN(vocab_len=instance.get_dict_len(), word_len=cfg.word_len, 
              num_classes=cfg.num_classes, vec_length=cfg.vec_length)

In [12]:
print(net)  # print the network structure

TextCNN<
  (embedding): Embedding<vocab_size=18848, embedding_size=40, use_one_hot=False, embedding_table=Parameter (name=embedding.embedding_table, shape=(18848, 40), dtype=Float32, requires_grad=True), dtype=Float32, padding_idx=None>
  (layer1): SequentialCell<
    (0): Conv2d<input_channels=1, output_channels=96, kernel_size=(3, 40), stride=(1, 1), pad_mode=pad, padding=1, dilation=(1, 1), group=1, has_bias=True, weight_init=[[[[ 1.6112439e-02 -2.0757150e-02 -2.0841148e-02 ... -1.8504603e-02
        -2.8975620e-03 -2.7245784e-03]
       [ 1.9803147e-03 -8.4856097e-03 -9.7931791e-03 ... -1.6013958e-02
        -1.8243326e-03 -7.0100249e-04]
       [ 5.8140219e-03 -1.3977945e-02 -6.3684967e-04 ... -7.4508167e-03
         2.0056810e-02  2.3656346e-02]]]
    
    
     [[[-8.3921067e-03  9.8413625e-04 -9.0313461e-03 ... -1.4636106e-02
        -9.8245370e-04  4.8509962e-03]
       [ 1.4972253e-03  2.8891333e-03 -1.3509471e-02 ... -1.8857339e-03
         2.3570815e-02  1.5150110e-02]
    

In [13]:
# Continue training if set pre_trained to be True
if cfg.pre_trained:  # resume from an existing checkpoint: load its parameters into the network
    param_dict = load_checkpoint(cfg.checkpoint_path)
    load_param_into_net(net, param_dict)

In [14]:
# Optimizer and loss function.
# Adam receives only the parameters with requires_grad set; learning_rate is
# the list built in the learning-rate cell above.
opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), 
              learning_rate=learning_rate, weight_decay=cfg.weight_decay)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)

In [15]:
# Bundle network, loss, optimizer and the accuracy metric for training.
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc': Accuracy()})

In [16]:
# Training callbacks.
# Checkpoint interval is int(cfg.epoch_size * batch_num / 2) steps; at most
# cfg.keep_checkpoint_max checkpoint files are kept.
config_ck = CheckpointConfig(save_checkpoint_steps=int(cfg.epoch_size*batch_num/2), keep_checkpoint_max=cfg.keep_checkpoint_max)
time_cb = TimeMonitor(data_size=batch_num)
ckpt_save_dir = "./ckpt"
# Checkpoints are written under ckpt_save_dir with the prefix "train_textcnn".
ckpoint_cb = ModelCheckpoint(prefix="train_textcnn", directory=ckpt_save_dir, config=config_ck)
loss_cb = LossMonitor()

In [17]:
# Run training with the timing, checkpoint and loss callbacks.
# NOTE(review): `dataset` was already repeated cfg.epoch_size times inside
# create_train_dataset, and cfg.epoch_size is passed again here, so the data
# appears to be traversed epoch_size * epoch_size times in total — confirm
# that this is intended.
model.train(cfg.epoch_size, dataset, callbacks=[time_cb, ckpoint_cb, loss_cb])
print("train success")

epoch: 1 step: 596, loss is 0.09578466415405273
epoch time: 17877.591 ms, per step time: 29.996 ms
epoch: 2 step: 596, loss is 0.01619671657681465
epoch time: 4872.288 ms, per step time: 8.175 ms
epoch: 3 step: 596, loss is 0.0019015888683497906
epoch time: 4792.765 ms, per step time: 8.042 ms
epoch: 4 step: 596, loss is 0.003520732745528221
epoch time: 4859.476 ms, per step time: 8.153 ms
train success


## 6. 测试评估

In [18]:
# Checkpoint to evaluate; the name matches the run logged above (epoch 4, step 596).
checkpoint_path = './ckpt/train_textcnn-4_596.ckpt'

In [None]:
# Alternative checkpoint, from a run made after adjusting the epoch size.
checkpoint_path = './ckpt/train_textcnn_1-5_745.ckpt'

In [19]:
# Checkpoint from a run made after adjusting the shrink-phase learning rate.
# NOTE(review): when the cells run top to bottom this assignment overrides the
# checkpoint_path values set in the cells above; presumably this file exists
# only after several training runs into the same ./ckpt directory — verify.
checkpoint_path = './ckpt/train_textcnn_2-4_596.ckpt'

In [20]:
# Evaluate a saved checkpoint on the held-out test split.
#
# The test pipeline gets its own name so the training pipeline in `dataset`
# is not overwritten and the training cells stay re-runnable.
test_dataset = instance.create_test_dataset(batch_size=cfg.batch_size)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
# A fresh network, so the evaluated weights come only from the checkpoint.
# No optimizer is created: evaluation does not update parameters.
net = TextCNN(vocab_len=instance.get_dict_len(), word_len=cfg.word_len,
              num_classes=cfg.num_classes, vec_length=cfg.vec_length)

# Prefer the explicitly selected checkpoint; fall back to the configured one.
ckpt_file = checkpoint_path if checkpoint_path is not None else cfg.checkpoint_path
param_dict = load_checkpoint(ckpt_file)
print("load checkpoint from [{}].".format(ckpt_file))

load_param_into_net(net, param_dict)  # copy the checkpoint parameters into the network
net.set_train(False)  # switch the network to evaluation mode
model = Model(net, loss_fn=loss, metrics={'acc': Accuracy()})

acc = model.eval(test_dataset)
print("accuracy: ", acc)

load checkpoint from [./ckpt/train_textcnn_2-4_596.ckpt].
accuracy:  {'acc': 0.7666015625}


## 7. 在线测试

In [22]:
def preprocess(sentence, vocab=None, maxlen=None):
    """Turn a raw review string into a fixed-length list of vocabulary ids.

    The text is lower-cased, stripped, cleaned with the same character
    substitutions used when the training data was read, split on spaces, and
    each word is replaced by its vocabulary id.

    Args:
        sentence: the review text.
        vocab: word -> id mapping; defaults to `instance.Vocab`.
        maxlen: length of the returned list; defaults to `cfg.word_len`.

    Returns:
        A list of `maxlen` ints. Words beyond `maxlen` are dropped, missing
        positions are padded with 0, and words absent from `vocab` are
        reported on stdout and encoded as 0.
    """
    if vocab is None:
        vocab = instance.Vocab
    if maxlen is None:
        maxlen = cfg.word_len

    # Applied sequentially; the order matters ('--' is removed before the
    # remaining '-' characters become spaces).
    replacements = (
        ('\n', ''), ('"', ''), ('\'', ''), ('.', ''), (',', ''),
        ('[', ''), (']', ''), ('(', ''), (')', ''), (':', ''),
        ('--', ''), ('-', ' '), ('\\', ''),
        ('0', ''), ('1', ''), ('2', ''), ('3', ''), ('4', ''),
        ('5', ''), ('6', ''), ('7', ''), ('8', ''), ('9', ''),
        ('`', ''), ('=', ''), ('$', ''), ('/', ''), ('*', ''),
        (';', ''), ('<b>', ''), ('%', ''),
    )
    sentence = sentence.lower().strip()
    for old, new in replacements:
        sentence = sentence.replace(old, new)

    # Drop the empty tokens produced by consecutive spaces, exactly as the
    # training-time tokenisation does; otherwise they would occupy positions
    # in the vector and be reported as unknown words.
    words = [word for word in sentence.split(' ') if word]

    vector = [0]*maxlen
    for index, word in enumerate(words[:maxlen]):
        if word in vocab:
            vector[index] = vocab[word]
        else:
            print(word,"单词未出现在字典中")
    return vector

def inference(review_en):
    """Classify one English review with the global `net` and print the verdict.

    The review is encoded with preprocess(), wrapped as a batch of one int32
    sample, and the index of the larger output decides the printed label.
    """
    token_ids = preprocess(review_en)
    batch = Tensor(np.array([token_ids]).astype(np.int32))
    scores = net(batch)
    predicted_class = np.argmax(np.array(scores[0]))
    verdict = "Positive comments" if predicted_class == 1 else "Negative comments"
    print(verdict)

In [23]:
# Try the classifier on a sentence with a negative word.
review_en = "the movie is so boring"
inference(review_en)

Negative comments


In [25]:
# Try the classifier on a sentence with a positive word.
review_en = "the movie is so happy"
inference(review_en)

Positive comments
