# TextCNN
TextCNN利用CNN（卷积神经网络）进行文本特征抽取：不同大小的卷积核分别抽取n-gram特征，卷积计算出的特征图经过MaxPooling保留最大的特征值，然后将这些特征值拼接成一个向量，作为文本的表示。

这里我们基于TextCNN原始论文的设定，对大小为2、3、4的卷积核各采用100个，最后得到的文本向量大小为100*3=300维。

In [1]:
!pip install transformers scikit-learn --user

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 15.1MB/s eta 0:00:01
Collecting requests (from transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/45/1e/0c169c6a5381e241ba7404532c16a21d86ab872c9bed8bdcd4c423954103/requests-2.24.0-py2.py3-none-any.whl (61kB)
[K     |████████████████████████████████| 71kB 17.0MB/s eta 0:00:01
[?25hCollecting filelock (from transformers)
  Downloading https://files.pythonhosted.org/packages/93/83/71a2ee6158bb9f39a90c0dea1637f81d5eef866e188e1971a1b1ab01a35a/filelock-3.0.12-py3-none-any.whl
Collecting sacremoses (from transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 23.8

In [2]:
!df -h

Filesystem               Size  Used Avail Use% Mounted on
overlay                  8.0G  179M  7.9G   3% /
tmpfs                     64M     0   64M   0% /dev
tmpfs                     15G     0   15G   0% /sys/fs/cgroup
/dev/mapper/ubuntu-root  150G   51G  100G  34% /dev/init
:/export/smodsoz9        4.9G   20M  4.6G   1% /storage
:/export/datasets        2.0T  673G  1.3T  36% /datasets
shm                       12G     0   12G   0% /dev/shm
tmpfs                     15G   12K   15G   1% /proc/driver/nvidia
tmpfs                    3.0G  314M  2.7G  11% /run/nvidia-persistenced/socket
udev                      15G     0   15G   0% /dev/nvidia0
tmpfs                     15G     0   15G   0% /proc/asound
tmpfs                     15G     0   15G   0% /proc/acpi
tmpfs                     15G     0   15G   0% /proc/scsi
tmpfs                     15G     0   15G   0% /sys/firmware


In [3]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:            29G        1.4G         16G        314M         11G         27G
Swap:            0B          0B          0B


In [8]:
import logging
import random

import numpy as np
import torch
# Configure log output: timestamp, level and message at INFO level
logging.basicConfig(
    format='%(asctime)-15s %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Seed value shared by every random number generator seeded below
seed = 666

# Python's built-in random module
random.seed(seed)
# NumPy's global random state
np.random.seed(seed)
# CUDA generator of the current GPU
torch.cuda.manual_seed(seed)
# PyTorch default generator
torch.manual_seed(seed)

# Index of the GPU to use; a negative value disables CUDA
gpu = 0
# CUDA is used only when a GPU index was requested and CUDA is available
use_cuda = gpu >= 0 and torch.cuda.is_available()

# Pick the compute device: CPU fallback first, otherwise the selected GPU
if not use_cuda:
    device = torch.device("cpu")
else:
    torch.cuda.set_device(gpu)
    device = torch.device("cuda", gpu)

# Log which compute resource was selected
logging.info("Use cuda: %s, gpu id: %d.", use_cuda, gpu)

2020-07-30 02:05:56,926 INFO: Use cuda: True, gpu id: 0.


In [16]:
# Number of folds the data is split into
fold_num = 10

# Path of the training data file
data_file = '../input/train_set.csv'

import pandas as pd

In [44]:
# Split all the data into folds
def all_data2fold(fold_num, num=10000, data_path=None):
    """Split the first ``num`` training rows into ``fold_num`` stratified folds.

    The rows are shuffled and grouped by label, and every label's rows are
    dealt out to the folds in contiguous, non-overlapping slices so that each
    fold gets roughly the same label distribution. The folds are then levelled
    to exactly ``total // fold_num`` rows: surplus rows of larger folds are
    moved into smaller folds, and surplus rows that no fold needs are dropped.

    Args:
        fold_num: Number of folds to create; must be positive.
        num: Maximum number of rows taken from the top of the file.
        data_path: Tab-separated, UTF-8 encoded CSV file with ``text`` and
            ``label`` columns. Defaults to the module-level ``data_file``.

    Returns:
        A list of ``fold_num`` dicts ``{'label': [...], 'text': [...]}``, one
        per fold, with the rows of each fold in shuffled order.

    Raises:
        ValueError: If ``fold_num`` is not positive.
    """
    if fold_num <= 0:
        raise ValueError("fold_num must be a positive integer")

    # Read the tab-separated file and keep only the first `num` rows.
    source = data_file if data_path is None else data_path
    frame = pd.read_csv(source, sep='\t', encoding='UTF-8')
    texts = frame['text'].tolist()[:num]
    labels = frame['label'].tolist()[:num]
    total = len(labels)

    # Shuffle once so the per-label position lists below are in random order.
    order = list(range(total))
    np.random.shuffle(order)
    all_texts = [texts[i] for i in order]
    all_labels = [labels[i] for i in order]

    # Map every label to the positions (in the shuffled data) that carry it.
    label2id = {}
    for position, label in enumerate(all_labels):
        label2id.setdefault(str(label), []).append(position)

    # Deal each label's positions out to the folds. A running offset keeps the
    # slices disjoint: the first `extra` folds take one more row than the
    # others, and no row is used twice or skipped.
    all_index = [[] for _ in range(fold_num)]
    for positions in label2id.values():
        base_size, extra = divmod(len(positions), fold_num)
        offset = 0
        for fold in range(fold_num):
            size = base_size + 1 if fold < extra else base_size
            all_index[fold].extend(positions[offset:offset + size])
            offset += size

    # Level every fold to the same size. Fold sizes never increase with the
    # fold number (the extra rows always go to the lowest-numbered folds), so
    # the surplus pool is filled before any later fold needs to draw from it.
    batch_size = total // fold_num
    spare_texts = []
    spare_labels = []
    start = 0  # first unused position in the surplus pool

    fold_data = []
    for fold in range(fold_num):
        fold_texts = [all_texts[i] for i in all_index[fold]]
        fold_labels = [all_labels[i] for i in all_index[fold]]
        fold_size = len(fold_labels)

        if fold_size > batch_size:
            # Too many rows: move everything past batch_size into the pool.
            spare_texts.extend(fold_texts[batch_size:])
            spare_labels.extend(fold_labels[batch_size:])
            fold_texts = fold_texts[:batch_size]
            fold_labels = fold_labels[:batch_size]
        elif fold_size < batch_size:
            # Too few rows: top the fold up from the pool.
            end = start + batch_size - fold_size
            fold_texts = fold_texts + spare_texts[start:end]
            fold_labels = fold_labels + spare_labels[start:end]
            start = end

        # Internal invariant: every fold ends up with exactly batch_size rows.
        assert batch_size == len(fold_labels)

        # Shuffle inside the fold so rows are no longer grouped by label.
        order = list(range(batch_size))
        np.random.shuffle(order)
        fold_data.append({
            'label': [fold_labels[i] for i in order],
            'text': [fold_texts[i] for i in order],
        })

    logging.info("Fold lens %s", str([len(data['label']) for data in fold_data]))
    return fold_data

# Build the folds; the fold count is passed as the literal 10 and `num` keeps its default
fold_data = all_data2fold(10)

2020-07-30 05:38:01,993 INFO: Fold lens [1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]


In [None]:
# Build the training, development and test datasets

# Index of the fold used as the development set
fold_id = 9

# Development data: the fold at index fold_id
dev_data = fold_data[fold_id]

# Training data: texts and labels of every fold before fold_id
train_texts = []
train_labels = []
for i in range(fold_id):
    data = fold_data[i]
    train_texts += data['text']
    train_labels += data['label']

# Combine labels and texts into the training dataset
train_data = dict(label=train_labels, text=train_texts)

# Test file: tab-separated, UTF-8 encoded
test_data_file = '../input/test_a.csv'
f = pd.read_csv(test_data_file, sep='\t', encoding='UTF-8')

# All texts of the test file
texts = f['text'].tolist()

# Test dataset: one label initialised to 0 per text
test_data = dict(label=[0 for _ in texts], text=texts)

In [None]:
# build vocab
import transformers
from collections import Counter
from transformers import BasicTokenizer

# BasicTokenizer instance from transformers, constructed with its default arguments
basic_tokenizer = BasicTokenizer()


class Vocab:
    """Word, external-embedding-word and label vocabularies for a dataset.

    Ids 0 and 1 of both word vocabularies are reserved for ``[PAD]`` and
    ``[UNK]``. The word vocabulary holds every whitespace-separated token that
    occurs at least ``min_count`` times in the training texts; the label
    vocabulary holds the labels observed in the training data.

    Args:
        train_data: Dict with a ``'text'`` list of whitespace-tokenised
            strings and a ``'label'`` list of labels.
    """

    def __init__(self, train_data):
        # Words seen fewer than min_count times are left out of the vocabulary
        self.min_count = 5
        self.pad = 0
        self.unk = 1
        self._id2word = ['[PAD]', '[UNK]']
        self._id2extword = ['[PAD]', '[UNK]']

        self._id2label = []
        self.target_names = []

        self.build_vocab(train_data)

        self._word2id = self._reverse(self._id2word)
        self._label2id = self._reverse(self._id2label)
        # Defined up front so extword2id() is usable before embeddings are loaded
        self._extword2id = self._reverse(self._id2extword)

        logging.info("Build vocab: words %d, labels %d.", self.word_size, self.label_size)

    @staticmethod
    def _reverse(items):
        """Return a dict mapping each item of ``items`` to its index."""
        return {item: idx for idx, item in enumerate(items)}

    def _lookup(self, mapping, xs):
        """Map ``xs`` (one key or a list of keys) to ids, using ``self.unk`` for misses."""
        if isinstance(xs, list):
            return [mapping.get(x, self.unk) for x in xs]
        return mapping.get(xs, self.unk)

    def build_vocab(self, data):
        """Collect word counts and labels from ``data`` and fill the vocabularies.

        Args:
            data: Dict with a ``'text'`` list of whitespace-tokenised strings
                and a ``'label'`` list of labels.
        """
        self.word_counter = Counter()
        for text in data['text']:
            self.word_counter.update(text.split())

        # most_common() yields words by descending count
        self._id2word.extend(
            word for word, count in self.word_counter.most_common()
            if count >= self.min_count
        )

        label2name = {0: '科技', 1: '股票', 2: '体育', 3: '娱乐', 4: '时政', 5: '社会', 6: '教育', 7: '财经',
                      8: '家居', 9: '游戏', 10: '房产', 11: '时尚', 12: '彩票', 13: '星座'}

        self.label_counter = Counter(data['label'])

        # Iterate over the labels actually observed rather than range(len(...)):
        # a label missing from the sample must not shift or replace the others.
        for label in sorted(self.label_counter):
            self._id2label.append(label)
            # Labels without a known display name fall back to their string form
            self.target_names.append(label2name.get(label, str(label)))

    def load_pretrained_embs(self, embfile):
        """Load pretrained word vectors from a text file.

        The first line holds the word count and the embedding dimension; every
        following non-blank line holds a word followed by its vector values.
        Loaded words are appended to the external-word vocabulary.

        Args:
            embfile: Path of the UTF-8 encoded embedding file.

        Returns:
            A float array with one row per external-vocabulary id. The
            ``[PAD]`` row is zero, the ``[UNK]`` row is the sum of the loaded
            vectors divided by the header word count, and the whole matrix is
            divided by its standard deviation.
        """
        with open(embfile, encoding='utf-8') as f:
            header = f.readline().split()
            word_count, embedding_dim = int(header[0]), int(header[1])

            index = len(self._id2extword)
            embeddings = np.zeros((word_count + index, embedding_dim))
            for line in f:
                values = line.split()
                if not values:
                    # Skip blank lines (e.g. a trailing newline at end of file)
                    continue
                self._id2extword.append(values[0])
                vector = np.array(values[1:], dtype='float64')
                embeddings[self.unk] += vector
                embeddings[index] = vector
                index += 1

        embeddings[self.unk] = embeddings[self.unk] / word_count
        embeddings = embeddings / np.std(embeddings)

        self._extword2id = self._reverse(self._id2extword)

        # Every external word must be unique, otherwise ids would be ambiguous
        assert len(set(self._id2extword)) == len(self._id2extword)

        return embeddings

    def word2id(self, xs):
        """Return the vocabulary id(s) of a word or list of words."""
        return self._lookup(self._word2id, xs)

    def extword2id(self, xs):
        """Return the external-vocabulary id(s) of a word or list of words."""
        return self._lookup(self._extword2id, xs)

    def label2id(self, xs):
        """Return the id(s) of a label or list of labels.

        NOTE: an unknown label falls back to ``self.unk`` (1), which is also
        the id of the second known label.
        """
        return self._lookup(self._label2id, xs)

    @property
    def word_size(self):
        """Number of entries in the word vocabulary, including [PAD] and [UNK]."""
        return len(self._id2word)

    @property
    def extword_size(self):
        """Number of entries in the external-word vocabulary, including [PAD] and [UNK]."""
        return len(self._id2extword)

    @property
    def label_size(self):
        """Number of distinct labels."""
        return len(self._id2label)


vocab = Vocab(train_data)

0
1
2
3
4
5
6
7
8
