# TextCNN
TextCNN利用CNN（卷积神经网络）进行文本特征抽取，不同大小的卷积核分别抽取n-gram特征，卷积计算出的特征图经过MaxPooling保留最大的特征值，然后将这些特征值拼接成一个向量作为文本的表示。

这里我们基于TextCNN原始论文的设定，分别采用了100个大小为2,3,4的卷积核，最后得到的文本向量大小为100*3=300维。

In [1]:
!pip install transformers scikit-learn --user

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
!df -h

Filesystem               Size  Used Avail Use% Mounted on
overlay                  8.0G  254M  7.8G   4% /
tmpfs                     64M     0   64M   0% /dev
tmpfs                     15G     0   15G   0% /sys/fs/cgroup
/dev/mapper/ubuntu-root  150G   31G  120G  21% /dev/init
:/export/smodsoz9        4.9G   20M  4.6G   1% /storage
:/export/datasets        2.0T  673G  1.3T  36% /datasets
shm                       12G     0   12G   0% /dev/shm
tmpfs                     15G   12K   15G   1% /proc/driver/nvidia
tmpfs                    3.0G  298M  2.7G  10% /run/nvidia-persistenced/socket
udev                      15G     0   15G   0% /dev/nvidia0
tmpfs                     15G     0   15G   0% /proc/asound
tmpfs                     15G     0   15G   0% /proc/acpi
tmpfs                     15G     0   15G   0% /proc/scsi
tmpfs                     15G     0   15G   0% /sys/firmware


In [3]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:            29G        1.6G         18G        298M        9.4G         27G
Swap:            0B          0B          0B


In [4]:
import logging
import random

import numpy as np
import torch
# Configure logging: INFO level, timestamped messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)-15s %(levelname)s: %(message)s')

# Seed shared by every random number generator below.
seed = 666

# Seed Python's built-in random module.
random.seed(seed)
# Seed NumPy's global random number generator.
np.random.seed(seed)
# Seed the current GPU's generator (for multi-GPU setups
# torch.cuda.manual_seed_all would be the call to use).
torch.cuda.manual_seed(seed)
# Seed PyTorch's CPU generator.
torch.manual_seed(seed)

# Index of the GPU to use; a negative value disables CUDA.
gpu = 0
# CUDA is used only when a GPU index is given and CUDA is actually available.
use_cuda = gpu >= 0 and torch.cuda.is_available()

# Select the compute device that the model and batches are moved to.
if use_cuda:
    torch.cuda.set_device(gpu)
    device = torch.device("cuda", gpu)
else:
    # Fall back to the CPU when CUDA is not used.
    device = torch.device("cpu")

# Log which compute resource was selected.
logging.info("Use cuda: %s, gpu id: %d.", use_cuda, gpu)

2020-07-31 13:10:57,490 INFO: Use cuda: True, gpu id: 0.


In [5]:
# Number of folds the training data is split into.
fold_num = 10

# Path to the training data file (read by all_data2fold).
data_file = '../input/train_set.csv'

import pandas as pd

In [6]:
# Split the training data into label-stratified folds.
def all_data2fold(fold_num, num=200000, data_path=None):
    """Split the first ``num`` training rows into ``fold_num`` stratified folds.

    The rows are shuffled with ``np.random``, grouped by label and dealt out so
    that the per-label counts of any two folds differ by at most one.  Each fold
    is then trimmed or topped up to exactly ``total // fold_num`` samples, so
    ``total % fold_num`` samples are left out.  No sample is placed in more
    than one fold.

    Args:
        fold_num: Number of folds to create (must be >= 1).
        num: Maximum number of rows taken from the head of the file.
        data_path: Tab-separated file with ``text`` and ``label`` columns.
            Defaults to the module-level ``data_file``.

    Returns:
        A list of ``fold_num`` dicts of the form
        ``{'label': [...], 'text': [...]}``, each shuffled internally.

    Raises:
        ValueError: If ``fold_num`` is smaller than 1.
    """
    if fold_num < 1:
        raise ValueError("fold_num must be a positive integer, got %r" % (fold_num,))
    if data_path is None:
        # Resolved at call time so a later change of the module setting is honoured.
        data_path = data_file

    # Read the tab-separated file and keep the first `num` rows.
    frame = pd.read_csv(data_path, sep='\t', encoding='UTF-8')
    texts = frame['text'].tolist()[:num]
    labels = frame['label'].tolist()[:num]
    total = len(labels)

    # Shuffle texts and labels with one shared permutation.
    order = list(range(total))
    np.random.shuffle(order)
    all_texts = [texts[i] for i in order]
    all_labels = [labels[i] for i in order]

    # Group the shuffled positions by (stringified) label.
    label2id = {}
    for position, label in enumerate(all_labels):
        label2id.setdefault(str(label), []).append(position)

    # Deal every label group out across the folds.  A running offset is used so
    # that the folds receiving one extra sample do not overlap with the next
    # fold and the tail of the group is not skipped.
    all_index = [[] for _ in range(fold_num)]
    for positions in label2id.values():
        base_size, remainder = divmod(len(positions), fold_num)
        offset = 0
        for fold in range(fold_num):
            cur_size = base_size + 1 if fold < remainder else base_size
            all_index[fold].extend(positions[offset:offset + cur_size])
            offset += cur_size

    # Equalise fold sizes: oversized folds donate their surplus to a pool that
    # undersized folds draw from.  Fold sizes are non-increasing, so the pool
    # is always filled before anything is taken out of it.
    batch_size = total // fold_num
    other_texts = []
    other_labels = []
    start = 0
    fold_data = []
    for fold in range(fold_num):
        fold_texts = [all_texts[i] for i in all_index[fold]]
        fold_labels = [all_labels[i] for i in all_index[fold]]
        fold_size = len(fold_labels)

        if fold_size > batch_size:
            other_texts.extend(fold_texts[batch_size:])
            other_labels.extend(fold_labels[batch_size:])
            fold_texts = fold_texts[:batch_size]
            fold_labels = fold_labels[:batch_size]
        elif fold_size < batch_size:
            end = start + batch_size - fold_size
            fold_texts = fold_texts + other_texts[start:end]
            fold_labels = fold_labels + other_labels[start:end]
            start = end

        # Internal invariant: every fold ends up with exactly batch_size samples.
        assert batch_size == len(fold_labels)

        # Shuffle inside the fold so the samples are not ordered by label.
        order = list(range(batch_size))
        np.random.shuffle(order)
        fold_data.append({
            'label': [fold_labels[i] for i in order],
            'text': [fold_texts[i] for i in order],
        })

    logging.info("Fold lens %s", str([len(data['label']) for data in fold_data]))
    return fold_data

# Build 10 stratified folds from the training file.
fold_data = all_data2fold(10)

2020-07-31 13:11:05,746 INFO: Fold lens [20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000, 20000]


In [7]:
# Build the training, development and test data sets.

# Index of the fold held out for development.
fold_id = 9

# Use that (last) fold as the development data.
dev_data = fold_data[fold_id]

# Accumulators for the training data.
train_texts = []
train_labels = []

# Every fold before fold_id goes into the training set.
for i in range(0, fold_id):
    data = fold_data[i]
    train_texts.extend(data['text'])
    train_labels.extend(data['label'])

# Combine labels and texts into the training set.
train_data = {'label': train_labels, 'text': train_texts}

# Path to the test file.
test_data_file = '../input/test_a.csv'

# Read the test file as tab-separated UTF-8.
f = pd.read_csv(test_data_file, sep='\t', encoding='UTF-8')

# All test texts.
texts = f['text'].tolist()

# The test set gets a placeholder label of 0 for every text.
test_data = {'label': [0] * len(texts), 'text': texts}

In [8]:
!pip3 install "transformers==2.3.0"
!pip3 install "urllib3==1.21.1"
!pip3 install "requests==2.21.0"
!pip3 install "chardet==3.0.2"
!pip3 install "certifi==2019.11.28"
!pip3 install "regex==2019.12.20"
!pip3 install "sacremoses==0.0.38"
!pip3 install "sentencepiece==0.1.85"

You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [9]:
# build vocab
import transformers
from collections import Counter
from transformers import BasicTokenizer

# Instantiate the transformers BasicTokenizer.
# NOTE(review): `basic_tokenizer` is not referenced anywhere else in the visible source.
basic_tokenizer = BasicTokenizer()

class Vocab():
    """Vocabulary of trainable words, pretrained ("ext") words and labels.

    Ids 0 and 1 are reserved for ``[PAD]`` and ``[UNK]`` in both word tables.
    Words seen fewer than ``min_count`` times in the training texts are left
    out of the trainable word table and therefore map to ``unk``.
    """

    def __init__(self, train_data):
        """Build the word and label tables from ``train_data``.

        Args:
            train_data: Dict with a ``'text'`` list of whitespace-separated
                token strings and a ``'label'`` list.
        """
        # Minimum frequency a word needs to enter the vocabulary.
        self.min_count = 5
        # Reserved ids for padding and unknown words.
        self.pad = 0
        self.unk = 1
        # id -> word tables for the trainable and the pretrained embeddings.
        self._id2word = ['[PAD]', '[UNK]']
        self._id2extword = ['[PAD]', '[UNK]']

        # id -> label table and the matching human-readable class names.
        self._id2label = []
        self.target_names = []

        self.build_vocab(train_data)

        # Reverse lookups, e.g. {'[PAD]': 0, '[UNK]': 1, 'x': 2}.
        self._word2id = self._reverse(self._id2word)
        self._label2id = self._reverse(self._id2label)
        # Defined up front so extword2id() works (everything maps to PAD/UNK)
        # even before load_pretrained_embs() has been called.
        self._extword2id = self._reverse(self._id2extword)

        logging.info("Build vocab: words %d, labels %d." % (self.word_size, self.label_size))

    @staticmethod
    def _reverse(items):
        """Return a dict mapping every element of ``items`` to its index."""
        return {item: index for index, item in enumerate(items)}

    def _lookup(self, table, xs):
        """Map one key, or a list of keys, through ``table`` (missing -> unk)."""
        if isinstance(xs, list):
            return [table.get(x, self.unk) for x in xs]
        return table.get(xs, self.unk)

    def build_vocab(self, data):
        """Count word frequencies and fill the word and label tables."""
        # Texts are whitespace-separated tokens; count every token.
        self.word_counter = Counter()
        for text in data['text']:
            self.word_counter.update(text.split())

        # Keep words that occur at least min_count times, most frequent first.
        for word, count in self.word_counter.most_common():
            if count >= self.min_count:
                self._id2word.append(word)

        # Human-readable class names keyed by label id.
        label2name = {0: '科技', 1: '股票', 2: '体育', 3: '娱乐', 4: '时政', 5: '社会', 6: '教育', 7: '财经',
                      8: '家居', 9: '游戏', 10: '房产', 11: '时尚', 12: '彩票', 13: '星座'}

        # Frequency of every label.
        self.label_counter = Counter(data['label'])

        # NOTE(review): this enumerates 0..n-1 where n is the number of
        # distinct labels, i.e. it assumes labels are contiguous integers
        # starting at 0 -- confirm against the data.
        for label in range(len(self.label_counter)):
            self._id2label.append(label)
            self.target_names.append(label2name[label])

    def load_pretrained_embs(self, embfile):
        """Load pretrained word vectors and register their words.

        The first line of ``embfile`` holds ``<word_count> <embedding_dim>``;
        every following non-blank line holds a word followed by its vector.

        Args:
            embfile: Path of the UTF-8 embedding text file.

        Returns:
            A ``(word_count + 2, embedding_dim)`` float64 array.  Row ``pad``
            stays zero, row ``unk`` is the mean of all loaded vectors, and the
            whole matrix is divided by its standard deviation.
        """
        with open(embfile, encoding='utf-8') as f:
            header = f.readline().split()
            word_count, embedding_dim = int(header[0]), int(header[1])

            # Pretrained words are appended after the reserved entries.
            index = len(self._id2extword)
            embeddings = np.zeros((word_count + index, embedding_dim))
            for line in f:
                values = line.split()
                if not values:
                    # Tolerate blank lines (e.g. a trailing newline).
                    continue
                self._id2extword.append(values[0])
                vector = np.array(values[1:], dtype='float64')
                # Accumulate the sum; it becomes the UNK vector below.
                embeddings[self.unk] += vector
                embeddings[index] = vector
                index += 1

        embeddings[self.unk] = embeddings[self.unk] / word_count
        embeddings = embeddings / np.std(embeddings)

        self._extword2id = self._reverse(self._id2extword)

        # Internal invariant: the embedding file must not repeat a word.
        assert len(set(self._id2extword)) == len(self._id2extword)

        return embeddings

    def word2id(self, xs):
        """Return the trainable-word id(s) for a word or a list of words."""
        return self._lookup(self._word2id, xs)

    def extword2id(self, xs):
        """Return the pretrained-word id(s) for a word or a list of words."""
        return self._lookup(self._extword2id, xs)

    def label2id(self, xs):
        """Return the label id(s) for a label or a list of labels.

        NOTE(review): unknown labels fall back to ``self.unk`` (1), which is
        also a valid label id -- confirm this is intended.
        """
        return self._lookup(self._label2id, xs)

    @property
    def word_size(self):
        """Number of entries in the trainable word table."""
        return len(self._id2word)

    @property
    def extword_size(self):
        """Number of entries in the pretrained word table."""
        return len(self._id2extword)

    @property
    def label_size(self):
        """Number of labels."""
        return len(self._id2label)


# Build the vocabulary from the training folds.
vocab = Vocab(train_data)

2020-07-31 13:11:18,707 INFO: PyTorch version 1.2.0.dev20190805 available.
2020-07-31 13:12:33,252 INFO: Build vocab: words 5978, labels 14.


In [10]:
# 搭建模型
import torch.nn as nn
import torch.nn.functional as F

# Attention pooling layer
class Attention(nn.Module):
    """Pool a sequence of hidden states into one vector with learned weights.

    Every hidden state is projected linearly, scored against a learned query
    vector, and the softmax of the scores weights the sum of the projected
    states.  Positions whose mask is 0 do not contribute to the sum.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Projection matrix, drawn from N(0, 0.05^2).
        weight = torch.Tensor(hidden_size, hidden_size)
        weight.normal_(mean=0.0, std=0.05)
        self.weight = nn.Parameter(weight)

        # Projection bias, initialised to zero.
        self.bias = nn.Parameter(torch.zeros(hidden_size))

        # Query vector, drawn from N(0, 0.05^2).
        query = torch.Tensor(hidden_size)
        query.normal_(mean=0.0, std=0.05)
        self.query = nn.Parameter(query)

    def forward(self, batch_hidden, batch_masks):
        """Return ``(pooled, attn_scores)``.

        batch_hidden: b x len x hidden_size
        batch_masks:  b x len, 1 for real positions and 0 for padding
        pooled:       b x hidden_size
        attn_scores:  b x len, softmax output before padded entries are zeroed
        """
        # Linear projection of every position: b x len x hidden
        projected = torch.matmul(batch_hidden, self.weight) + self.bias

        # Raw score per position: b x len
        logits = torch.matmul(projected, self.query)

        # Padded positions get a very large negative score before the softmax.
        padding = (1 - batch_masks).bool()
        attn_scores = F.softmax(logits.masked_fill(padding, float(-1e32)), dim=1)

        # A fully padded row softmaxes to the uniform 1/len with -1e32 (and to
        # NaN with -inf), so padded weights are explicitly reset to zero.
        pooling_weights = attn_scores.masked_fill(padding, 0.0)

        # Weighted sum of the projected states: b x hidden
        batch_outputs = torch.bmm(pooling_weights.unsqueeze(1), projected).squeeze(1)

        return batch_outputs, attn_scores

In [11]:
# build word encoder
word2vec_path = '../emb/word2vec.txt'  # pretrained word vectors read by Vocab.load_pretrained_embs
dropout = 0.15  # dropout probability used by the encoders


class WordCNNEncoder(nn.Module):
    """Encode each sentence into a fixed-size vector with a TextCNN.

    The trainable and the frozen pretrained embeddings of every token are
    summed, convolved with filters spanning 2, 3 and 4 tokens, max-pooled over
    the sentence and concatenated into ``3 * out_channel`` features.
    """

    def __init__(self, vocab):
        """Create the embeddings and convolutions.

        Args:
            vocab: Object exposing ``word_size`` and
                ``load_pretrained_embs(path)``; the latter must return a
                2-D array of shape (number of words, embedding dims).
        """
        super(WordCNNEncoder, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.word_dims = 100

        # Trainable embeddings; id 0 is padding.
        self.word_embed = nn.Embedding(vocab.word_size, self.word_dims, padding_idx=0)

        # Pretrained embeddings, copied in and kept frozen.
        extword_embed = vocab.load_pretrained_embs(word2vec_path)
        extword_size, word_dims = extword_embed.shape
        logging.info("Load extword embed: words %d, dims %d." % (extword_size, word_dims))

        self.extword_embed = nn.Embedding(extword_size, word_dims, padding_idx=0)
        self.extword_embed.weight.data.copy_(torch.from_numpy(extword_embed))
        self.extword_embed.weight.requires_grad = False

        input_size = self.word_dims

        self.filter_sizes = [2, 3, 4]  # n-gram window
        self.out_channel = 100
        # Each kernel spans `filter_size` tokens and the full embedding width.
        self.convs = nn.ModuleList([nn.Conv2d(1, self.out_channel, (filter_size, input_size), bias=True)
                                    for filter_size in self.filter_sizes])

    def forward(self, word_ids, extword_ids):
        """Return sentence representations.

        word_ids:    sen_num x sent_len
        extword_ids: sen_num x sent_len
        returns:     sen_num x (len(filter_sizes) * out_channel)
        """
        word_embed = self.word_embed(word_ids)  # sen_num x sent_len x 100
        extword_embed = self.extword_embed(extword_ids)
        batch_embed = word_embed + extword_embed

        # A convolution kernel cannot be taller than its input, so sentences
        # shorter than the widest filter are extended with zero vectors.  This
        # is the same as padding a short sentence inside a longer batch, as
        # long as the padding id embeds to zero in both tables.
        sent_len = batch_embed.shape[1]
        min_len = max(self.filter_sizes)
        if sent_len < min_len:
            batch_embed = F.pad(batch_embed, (0, 0, 0, min_len - sent_len))

        # nn.Dropout is the identity in eval mode, so no training check is needed.
        batch_embed = self.dropout(batch_embed)

        batch_embed = batch_embed.unsqueeze(1)  # sen_num x 1 x sent_len x 100

        pooled_outputs = []
        for conv in self.convs:
            hidden = F.relu(conv(batch_embed))  # sen_num x out_channel x filter_height x 1
            # Max over every window position: sen_num x out_channel
            pooled_outputs.append(hidden.squeeze(3).max(dim=2)[0])

        reps = torch.cat(pooled_outputs, dim=1)  # sen_num x total_out_channel
        reps = self.dropout(reps)

        return reps
    

In [12]:
# build sent encoder
sent_hidden_size = 256  # hidden units per LSTM direction
sent_num_layers = 2  # stacked LSTM layers


class SentEncoder(nn.Module):
    """Run a bidirectional LSTM over the sentence vectors of each document.

    The dropout probability comes from the module-level ``dropout`` setting.
    """

    def __init__(self, sent_rep_size):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        lstm_config = dict(
            input_size=sent_rep_size,
            hidden_size=sent_hidden_size,
            num_layers=sent_num_layers,
            batch_first=True,
            bidirectional=True,
        )
        self.sent_lstm = nn.LSTM(**lstm_config)

    def forward(self, sent_reps, sent_masks):
        """Return the masked LSTM outputs.

        sent_reps:  b x doc_len x sent_rep_size
        sent_masks: b x doc_len
        returns:    b x doc_len x (2 * sent_hidden_size)
        """
        lstm_out = self.sent_lstm(sent_reps)[0]  # b x doc_len x hidden*2
        # Zero the outputs that belong to padded sentences.
        masked = lstm_out * sent_masks.unsqueeze(2)

        # Dropout is applied only while training.
        return self.dropout(masked) if self.training else masked

In [13]:
# build model
class Model(nn.Module):
    """Document classifier: CNN sentence encoder, BiLSTM over sentences,
    attention pooling and a linear output layer.

    ``all_parameters`` maps a group name to the trainable parameters that the
    optimizer wrapper should update.
    """

    def __init__(self, vocab):
        super().__init__()
        self.sent_rep_size = 300
        self.doc_rep_size = sent_hidden_size * 2
        self.all_parameters = {}

        self.word_encoder = WordCNNEncoder(vocab)
        self.sent_encoder = SentEncoder(self.sent_rep_size)
        self.sent_attention = Attention(self.doc_rep_size)
        self.out = nn.Linear(self.doc_rep_size, vocab.label_size, bias=True)

        # Gather everything that requires gradients, sub-module by sub-module.
        submodules = (self.word_encoder, self.sent_encoder, self.sent_attention, self.out)
        trainable = [param
                     for module in submodules
                     for param in module.parameters()
                     if param.requires_grad]

        if use_cuda:
            self.to(device)

        if trainable:
            self.all_parameters["basic_parameters"] = trainable

        logging.info('Build model with cnn word encoder, lstm sent encoder.')

        para_num = sum(param.numel() for param in self.parameters())
        logging.info('Model param num: %.2f M.' % (para_num / 1e6))

    def forward(self, batch_inputs):
        """Return the class logits (b x num_labels).

        batch_inputs is a triple (word ids, extword ids, masks), each of shape
        b x doc_len x sent_len.
        """
        batch_inputs1, batch_inputs2, batch_masks = batch_inputs
        batch_size, max_doc_len, max_sent_len = batch_inputs1.shape
        sen_num = batch_size * max_doc_len

        # Flatten documents so every sentence is encoded independently.
        batch_inputs1 = batch_inputs1.view(sen_num, max_sent_len)  # sen_num x sent_len
        batch_inputs2 = batch_inputs2.view(sen_num, max_sent_len)  # sen_num x sent_len
        batch_masks = batch_masks.view(sen_num, max_sent_len)  # sen_num x sent_len

        sent_reps = self.word_encoder(batch_inputs1, batch_inputs2)  # sen_num x sent_rep_size

        # Restore the document dimension.
        sent_reps = sent_reps.view(batch_size, max_doc_len, self.sent_rep_size)  # b x doc_len x sent_rep_size
        batch_masks = batch_masks.view(batch_size, max_doc_len, max_sent_len)  # b x doc_len x max_sent_len
        # A sentence counts as real when at least one of its tokens is unmasked.
        sent_masks = batch_masks.bool().any(2).float()  # b x doc_len

        sent_hiddens = self.sent_encoder(sent_reps, sent_masks)  # b x doc_len x doc_rep_size
        doc_reps, atten_scores = self.sent_attention(sent_hiddens, sent_masks)  # b x doc_rep_size

        return self.out(doc_reps)  # b x num_labels


# Instantiate the classifier with the vocabulary built above.
model = Model(vocab)

2020-07-31 13:12:33,547 INFO: Load extword embed: words 5978, dims 100.
2020-07-31 13:12:36,667 INFO: Build model with cnn word encoder, lstm sent encoder.
2020-07-31 13:12:36,669 INFO: Model param num: 4.28 M.


In [14]:
# build optimizer
learning_rate = 2e-4  # initial Adam learning rate
decay = .75  # multiplicative learning-rate decay factor
decay_step = 1000  # number of scheduler steps between two decays


class Optimizer:
    """Bundle one Adam optimizer and one step-decay scheduler per parameter group."""

    def __init__(self, model_parameters):
        """Create the optimizers.

        Args:
            model_parameters: Dict mapping a group name to a list of
                parameters.  Only names starting with ``"basic"`` are supported.

        Raises:
            ValueError: If a group name is not supported.
        """
        self.all_params = []
        self.optims = []
        self.schedulers = []

        for name, parameters in model_parameters.items():
            if not name.startswith("basic"):
                # Previously the exception was built but never raised, so an
                # unknown group was silently left without an optimizer.
                raise ValueError("Unsupported parameter group %r: no optimizer is defined for it." % (name,))

            optim = torch.optim.Adam(parameters, lr=learning_rate)
            self.optims.append(optim)

            # lr = learning_rate * decay ** (step // decay_step)
            scheduler = torch.optim.lr_scheduler.LambdaLR(
                optim, lr_lambda=lambda step: decay ** (step // decay_step))
            self.schedulers.append(scheduler)
            self.all_params.extend(parameters)

        self.num = len(self.optims)

    def step(self):
        """Apply one update per optimizer, advance its scheduler and clear its gradients."""
        for optim, scheduler in zip(self.optims, self.schedulers):
            optim.step()
            scheduler.step()
            optim.zero_grad()

    def zero_grad(self):
        """Clear the gradients held by every optimizer."""
        for optim in self.optims:
            optim.zero_grad()

    @staticmethod
    def _current_lr(scheduler):
        """Return the most recent learning rate of the scheduler's last group."""
        # get_last_lr() is the supported accessor where it exists; older
        # PyTorch releases only provide get_lr().
        if hasattr(scheduler, 'get_last_lr'):
            return scheduler.get_last_lr()[-1]
        return scheduler.get_lr()[-1]

    def get_lr(self):
        """Return the current learning rates as a string such as ``' 0.00020'``."""
        lrs = tuple(self._current_lr(scheduler) for scheduler in self.schedulers)
        return (' %.5f' * self.num) % lrs

In [15]:
# build dataset
def sentence_split(text, vocab, max_sent_len=256, max_segment=16):
    """Cut a document into fixed-length segments of tokens.

    Args:
        text: Whitespace-separated tokens of one document.
        vocab: Vocabulary whose ``_word2id`` mapping defines the known words.
        max_sent_len: Maximum number of tokens per segment.
        max_segment: Maximum number of segments to keep.  When the document
            has more, the first ``ceil(max_segment / 2)`` and the last
            ``max_segment // 2`` segments are kept.

    Returns:
        A list of ``[segment_length, tokens]`` pairs in which out-of-vocabulary
        tokens are replaced by ``'<UNK>'``.

    Raises:
        AssertionError: If ``text`` contains no tokens.
    """
    words = text.strip().split()
    # Dict membership is O(1); scanning the id->word list for every token is not.
    known_words = vocab._word2id

    segments = []
    for start in range(0, len(words), max_sent_len):
        segment = [word if word in known_words else '<UNK>'
                   for word in words[start:start + max_sent_len]]
        segments.append([len(segment), segment])

    assert len(segments) > 0
    if len(segments) > max_segment:
        # Keep the beginning and the end of long documents.  The tail slice
        # uses an explicit start index because segments[-0:] would return the
        # whole list when the tail size is 0.
        head = max_segment - max_segment // 2
        tail = max_segment // 2
        return segments[:head] + segments[len(segments) - tail:]
    return segments


def get_examples(data, vocab, max_sent_len=256, max_segment=8):
    """Convert raw texts and labels into id-based training examples.

    Args:
        data: Dict with parallel ``'text'`` and ``'label'`` lists.
        vocab: Vocabulary providing ``label2id``, ``word2id`` and ``extword2id``.
        max_sent_len: Passed on to ``sentence_split``.
        max_segment: Passed on to ``sentence_split``.

    Returns:
        A list of ``[label_id, doc_len, doc]`` entries where ``doc`` is a list
        of ``[sent_len, word_ids, extword_ids]`` triples.
    """
    to_label_id = vocab.label2id
    examples = []

    for text, label in zip(data['text'], data['label']):
        label_id = to_label_id(label)

        # One [length, word ids, extword ids] triple per segment.
        doc = [
            [sent_len, vocab.word2id(sent_words), vocab.extword2id(sent_words)]
            for sent_len, sent_words in sentence_split(text, vocab, max_sent_len, max_segment)
        ]
        examples.append([label_id, len(doc), doc])

    logging.info('Total %d docs.' % len(examples))
    return examples

In [16]:
# build loader

def batch_slice(data, batch_size):
    """Yield consecutive batches of at most ``batch_size`` items from ``data``."""
    batch_num = int(np.ceil(len(data) / float(batch_size)))
    for i in range(batch_num):
        start = i * batch_size
        # The last batch holds whatever is left.
        yield list(data[start:start + batch_size])


def data_iter(data, batch_size, shuffle=True, noise=1.0):
    """Yield batches of examples.

    With ``shuffle=True`` the data is shuffled in place, sorted by noisy
    document length so every batch holds documents of similar length, and the
    batches are yielded in random order.

    With ``shuffle=False`` the examples are yielded strictly in their input
    order, so predictions collected from the batches line up row by row with
    the input data.

    Args:
        data: List of examples; ``example[1]`` is the document length.
        batch_size: Maximum number of examples per batch.
        shuffle: Whether to shuffle and bucket by length (see above).
        noise: Half-width of the uniform noise added to the lengths before
            sorting; only used when ``shuffle`` is true.
    """
    if shuffle:
        np.random.shuffle(data)
        noisy_lengths = [-(example[1] + np.random.uniform(-noise, noise)) for example in data]
        ordered = [data[i] for i in np.argsort(noisy_lengths).tolist()]
    else:
        ordered = data

    batched_data = list(batch_slice(ordered, batch_size))

    if shuffle:
        np.random.shuffle(batched_data)

    for batch in batched_data:
        yield batch

In [17]:
# some function
from sklearn.metrics import f1_score, precision_score, recall_score


def get_score(y_ture, y_pred):
    """Compute macro-averaged precision, recall and F1 as percentages.

    Returns:
        A pair ``(summary, f1)`` where ``summary`` is the string form of the
        tuple ``(precision, recall, f1)`` and every value is rounded to two
        decimals.
    """
    truth = np.array(y_ture)
    predicted = np.array(y_pred)

    f1 = reformat(f1_score(truth, predicted, average='macro') * 100, 2)
    precision = reformat(precision_score(truth, predicted, average='macro') * 100, 2)
    recall = reformat(recall_score(truth, predicted, average='macro') * 100, 2)

    return str((precision, recall, f1)), f1


def reformat(num, n):
    """Return ``num`` rounded to ``n`` decimal places, as a float."""
    return float("{:0.{}f}".format(num, n))

In [18]:
# build trainer

import time
from sklearn.metrics import classification_report

# Maximum gradient norm passed to clip_grad_norm_.
clip = 5.0
# Number of training epochs.
epochs = 1
# Number of epochs without a dev-F1 improvement after which training stops.
early_stops = 3
# Log training statistics every `log_interval` batches.
log_interval = 50

# Batch sizes for evaluation/prediction and for training.
test_batch_size = 128
train_batch_size = 128

# File the best model state dict is saved to and loaded from.
save_model = '../output/cnn.bin'
# File the predicted labels are written to.
save_test = '../output/cnn.csv'
# NOTE(review): `save_pred` is not referenced anywhere else in the visible source.
save_pred = '../output/submitCNN.csv'
class Trainer():
    """Train, evaluate and run inference for the document classifier.

    Uses the module-level data sets (``train_data``, ``dev_data``,
    ``test_data``) and settings (batch sizes, ``epochs``, ``early_stops``,
    ``clip``, ``log_interval``, ``save_model``, ``save_test``).
    """

    def __init__(self, model, vocab):
        """Convert the data sets to examples and set up loss and optimizer."""
        self.model = model
        self.report = True

        self.train_data = get_examples(train_data, vocab)
        self.batch_num = int(np.ceil(len(self.train_data) / float(train_batch_size)))
        self.dev_data = get_examples(dev_data, vocab)
        self.test_data = get_examples(test_data, vocab)

        # criterion
        self.criterion = nn.CrossEntropyLoss()

        # label name
        self.target_names = vocab.target_names

        # optimizer
        self.optimizer = Optimizer(model.all_parameters)

        # count
        self.step = 0
        self.early_stop = -1
        self.best_train_f1, self.best_dev_f1 = 0, 0
        self.last_epoch = epochs

    def train(self):
        """Train for up to ``epochs`` epochs with early stopping on dev F1.

        The model state is saved to ``save_model`` whenever the dev F1 is at
        least as good as the best one seen so far.
        """
        logging.info('Start training...')
        for epoch in range(1, epochs + 1):
            train_f1 = self._train(epoch)

            dev_f1 = self._eval(epoch)

            if self.best_dev_f1 <= dev_f1:
                logging.info(
                    "Exceed history dev = %.2f, current dev = %.2f" % (self.best_dev_f1, dev_f1))
                torch.save(self.model.state_dict(), save_model)

                self.best_train_f1 = train_f1
                self.best_dev_f1 = dev_f1
                self.early_stop = 0
            else:
                self.early_stop += 1
                if self.early_stop == early_stops:
                    logging.info(
                        "Eearly stop in epoch %d, best train: %.2f, dev: %.2f" % (
                            epoch - early_stops, self.best_train_f1, self.best_dev_f1))
                    self.last_epoch = epoch
                    break

    def test(self):
        """Load the saved model and write test-set predictions to ``save_test``."""
        # map_location lets a checkpoint saved on GPU be loaded on a CPU-only host.
        self.model.load_state_dict(torch.load(save_model, map_location=device))
        self._eval(self.last_epoch + 1, test=True)

    def pred(self):
        """Load the saved model and write test-set predictions to ``save_test``."""
        self.model.load_state_dict(torch.load(save_model, map_location=device))
        # Inference must run in eval mode so dropout is disabled.
        self.model.eval()

        # batch_slice keeps the examples in their original order, so row i of
        # the output file belongs to row i of the test file.
        _, y_pred = self._predict(batch_slice(self.test_data, test_batch_size))

        df = pd.DataFrame({'label': y_pred})
        df.to_csv(save_test, index=False)

    def _predict(self, batches):
        """Run the model over ``batches`` without gradients.

        Returns:
            ``(y_true, y_pred)`` as flat lists of label ids, in batch order.
        """
        y_pred = []
        y_true = []
        with torch.no_grad():
            for batch_data in batches:
                torch.cuda.empty_cache()
                batch_inputs, batch_labels = self.batch2tensor(batch_data)
                batch_outputs = self.model(batch_inputs)
                y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
                y_true.extend(batch_labels.cpu().numpy().tolist())
        return y_true, y_pred

    def _train(self, epoch):
        """Run one training epoch and return the training macro F1."""
        self.optimizer.zero_grad()
        self.model.train()

        start_time = time.time()
        epoch_start_time = time.time()
        overall_losses = 0
        losses = 0
        batch_idx = 1
        y_pred = []
        y_true = []
        for batch_data in data_iter(self.train_data, train_batch_size, shuffle=True):
            torch.cuda.empty_cache()
            batch_inputs, batch_labels = self.batch2tensor(batch_data)
            batch_outputs = self.model(batch_inputs)
            loss = self.criterion(batch_outputs, batch_labels)
            loss.backward()

            loss_value = loss.detach().cpu().item()
            losses += loss_value
            overall_losses += loss_value

            y_pred.extend(torch.max(batch_outputs, dim=1)[1].cpu().numpy().tolist())
            y_true.extend(batch_labels.cpu().numpy().tolist())

            # Clip, then update parameters, advance schedulers and clear gradients.
            nn.utils.clip_grad_norm_(self.optimizer.all_params, max_norm=clip)
            self.optimizer.step()

            self.step += 1

            if batch_idx % log_interval == 0:
                elapsed = time.time() - start_time

                lrs = self.optimizer.get_lr()
                logging.info(
                    '| epoch {:3d} | step {:3d} | batch {:3d}/{:3d} | lr{} | loss {:.4f} | s/batch {:.2f}'.format(
                        epoch, self.step, batch_idx, self.batch_num, lrs,
                        losses / log_interval,
                        elapsed / log_interval))

                losses = 0
                start_time = time.time()

            batch_idx += 1

        overall_losses /= self.batch_num
        during_time = time.time() - epoch_start_time

        # reformat
        overall_losses = reformat(overall_losses, 4)
        score, f1 = get_score(y_true, y_pred)

        logging.info(
            '| epoch {:3d} | score {} | f1 {} | loss {:.4f} | time {:.2f}'.format(epoch, score, f1,
                                                                                  overall_losses,
                                                                                  during_time))
        # The report is produced only when every true class was also predicted.
        if set(y_true) == set(y_pred) and self.report:
            report = classification_report(y_true, y_pred, digits=4, target_names=self.target_names)
            logging.info('\n' + report)

        return f1

    def _eval(self, epoch, test=False):
        """Evaluate on the dev set, or predict the test set when ``test`` is true.

        With ``test=True`` the predictions are written to ``save_test``; the
        returned F1 is then computed against the test set's placeholder labels
        and carries no meaning.

        Returns:
            The macro F1 of the predictions.
        """
        self.model.eval()
        start_time = time.time()

        if test:
            # Test predictions must stay in input order to match the test file.
            batches = batch_slice(self.test_data, test_batch_size)
        else:
            batches = data_iter(self.dev_data, test_batch_size, shuffle=False)

        y_true, y_pred = self._predict(batches)

        score, f1 = get_score(y_true, y_pred)

        during_time = time.time() - start_time

        if test:
            df = pd.DataFrame({'label': y_pred})
            df.to_csv(save_test, index=False, sep=',')
        else:
            logging.info(
                '| epoch {:3d} | dev | score {} | f1 {} | time {:.2f}'.format(epoch, score, f1,
                                                                          during_time))
            if set(y_true) == set(y_pred) and self.report:
                report = classification_report(y_true, y_pred, digits=4, target_names=self.target_names)
                logging.info('\n' + report)

        return f1

    def batch2tensor(self, batch_data):
        """Pad a batch of examples into dense tensors.

        Args:
            batch_data: List of examples of the form
                ``[label, doc_len, [[sent_len, word_ids, extword_ids], ...]]``.

        Returns:
            ``((word_ids, extword_ids, masks), labels)`` where the first three
            tensors have shape batch x max_doc_len x max_sent_len (masks are 1
            on real tokens and 0 on padding) and ``labels`` has shape batch.
        """
        batch_size = len(batch_data)
        doc_labels = [doc_data[0] for doc_data in batch_data]
        doc_lens = [doc_data[1] for doc_data in batch_data]

        max_doc_len = max(doc_lens)
        max_sent_len = max(sent_data[0] for doc_data in batch_data for sent_data in doc_data[2])

        shape = (batch_size, max_doc_len, max_sent_len)
        batch_inputs1 = torch.zeros(shape, dtype=torch.int64)
        batch_inputs2 = torch.zeros(shape, dtype=torch.int64)
        batch_masks = torch.zeros(shape, dtype=torch.float32)
        batch_labels = torch.LongTensor(doc_labels)

        for b, doc_data in enumerate(batch_data):
            for sent_idx in range(doc_lens[b]):
                sent_len, word_ids, extword_ids = doc_data[2][sent_idx]
                # Copy a whole sentence at once instead of one element at a time.
                batch_inputs1[b, sent_idx, :sent_len] = torch.tensor(word_ids[:sent_len], dtype=torch.int64)
                batch_inputs2[b, sent_idx, :sent_len] = torch.tensor(extword_ids[:sent_len], dtype=torch.int64)
                batch_masks[b, sent_idx, :sent_len] = 1

        if use_cuda:
            batch_inputs1 = batch_inputs1.to(device)
            batch_inputs2 = batch_inputs2.to(device)
            batch_masks = batch_masks.to(device)
            batch_labels = batch_labels.to(device)

        return (batch_inputs1, batch_inputs2, batch_masks), batch_labels

In [19]:
# train
# Build the trainer; this converts the train, dev and test sets into examples.
trainer = Trainer(model, vocab)

2020-07-31 13:29:48,908 INFO: Total 180000 docs.
2020-07-31 13:31:44,762 INFO: Total 20000 docs.
2020-07-31 13:36:29,563 INFO: Total 50000 docs.


In [20]:
# Run the training loop (with per-epoch dev evaluation).
trainer.train()

2020-07-31 13:36:29,569 INFO: Start training...
2020-07-31 13:39:59,827 INFO: | epoch   1 | step  50 | batch  50/1407 | lr 0.00020 | loss 2.2970 | s/batch 4.21
2020-07-31 13:43:06,401 INFO: | epoch   1 | step 100 | batch 100/1407 | lr 0.00020 | loss 1.8791 | s/batch 3.73
2020-07-31 13:46:02,659 INFO: | epoch   1 | step 150 | batch 150/1407 | lr 0.00020 | loss 1.2586 | s/batch 3.53
2020-07-31 13:48:50,489 INFO: | epoch   1 | step 200 | batch 200/1407 | lr 0.00020 | loss 1.0432 | s/batch 3.36
2020-07-31 13:51:48,847 INFO: | epoch   1 | step 250 | batch 250/1407 | lr 0.00020 | loss 0.9036 | s/batch 3.57
2020-07-31 13:54:26,530 INFO: | epoch   1 | step 300 | batch 300/1407 | lr 0.00020 | loss 0.8021 | s/batch 3.15
2020-07-31 13:57:27,729 INFO: | epoch   1 | step 350 | batch 350/1407 | lr 0.00020 | loss 0.7360 | s/batch 3.62
2020-07-31 14:00:44,995 INFO: | epoch   1 | step 400 | batch 400/1407 | lr 0.00020 | loss 0.6489 | s/batch 3.95
2020-07-31 14:04:03,171 INFO: | epoch   1 | step 450 | b

In [26]:
# test
#trainer.test()

In [None]:
# Load the saved model and write the test-set predictions to disk.
trainer.pred()

参考：https://mlwhiz.com/blog/2019/03/09/deeplearning_architectures_text_classification/