## 基于卷积神经网络的文本分类实验实验

In [1]:
# coding=utf-8
import numpy as np
import re
import itertools
import ast
import json
import os
import codecs
import jieba

from collections import Counter

import tensorflow as tf
import numpy as np

import time
import datetime
from tensorflow.contrib import learn
import csv

from sklearn.utils import shuffle

In [2]:
#category = ['星座', '股票', '房产', '时尚', '体育', '社会', '家居', '游戏', '彩票', '科技', '教育', '时政', '娱乐', '财经']
category = ['体育', '股票', '科技']

In [3]:
def split_data_with_label(corpus, lens=14):
    """
    将数据划分为训练数据和样本标签
    :param corpus: 
    :return: 
    """
    input_x = []
    input_y = []

    tag = []
    if os.path.isfile(corpus):
        with codecs.open(corpus, 'r') as f:
            for line in f:
                tag.append(re.sub('[\xa0\n\r\t]+', '' , line))
                
    else:
        for docs in corpus:
            for doc in docs:
                tag.append(doc)
    tag = shuffle(tag)
    for doc in tag:
        index = doc.find(' ')
        tag= doc[:index]
        tag = re.sub('__label__', '', tag)
        y_ = np.zeros(lens)
        i = category.index(tag)
        y_[i] = 1
        input_y.append(y_)
        
        input_x.append(doc[index + 1 :])
    
    return [input_x, input_y]

In [4]:
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    针对训练数据，组合batch iter
    :param data:
    :param batch_size: the size of each batch
    :param num_epochs: total of epochs
    :param shuffle: 是否需要打乱数据
    :return:
    """
    data = np.array(data)
    # 样本数量
    data_size = len(data)
    # 根据batch size 计算一个epoch中的batch 数量
    num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1
    # generates iter for dataset.
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]

In [5]:
class TextCNN(object):
    """
    CNN 文本分类的网络构建
    包括embedding layer, Convolutional layer max-pooling, softmax layer
    """

    def __init__(self, seq_length, num_classes, vocab_size, embedding_size, filter_sizes, num_filters,
                 l2_reg_lambda=0.0):
        """

        :param seq_length: 
        :param num_classses: 
        :param vocab_size: 
        :param embedding_size: 
        :param filter_sizes: 
        :param num_filters: 
        :param l2_reg_lambda: 
        """
        # 定义输入，输出，和dropout的参数
        # [样本个数，每个样本的词个数]
        self.input_x = tf.placeholder(tf.int32, [None, seq_length], name='input_x')
        # [样本个数， 类别个数]
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name='input_y')
        # dropout probability
        self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')

        # l2 正则 损失
        l2_loss = tf.constant(0.0)

        # embedding layer
        with tf.device('/cpu:0'), tf.name_scope('embedding'):
            # embedding 权重
            self.W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1., 1.),
                name="W")
            # look_up embedding 后得到一个三维的tensor [None,seq_length, embedding_size]
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            # 将embedded_chars 向量扩充一维成一个四维向量[None,seq_length, embedding_size, 1] ,这是卷积核
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)

        # 卷积操作和最大池化操作
        pooled_output = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope('conv-maxpool-%s' % filter_size):
                # 卷积层
                # 卷积核的维度
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name='b')
                conv = tf.nn.conv2d(self.embedded_chars_expanded,
                                    W,
                                    strides=[1, 1, 1, 1],
                                    padding='VALID',
                                    name='conv'
                                    )
                # 使用ReLU非线性激活函数得到一个feature map
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name='relu')

                # 对刚才卷积生成的feature map 进行max-pooling，得到最大的
                pooled = tf.nn.max_pool(h,
                                        ksize=[1, seq_length - filter_size + 1, 1, 1],  # 池化窗口大小 第二个参数的意思height
                                        # 直接对featrue map 的所有进行查找最大值
                                        strides=[1, 1, 1, 1],  # 窗口在每一个维度上滑动的步长
                                        padding='VALID',
                                        name='pool')
                # 当前filter 的feature maps的池化结果拼到一起
                pooled_output.append(pooled)

        # 组合所有的feature maps 的池化结果，总个数一共是filter_size * 不同filter的个数
        # 卷积核的总个数
        num_filters_total = num_filters * len(filter_sizes)
        self.h_pool = tf.concat(pooled_output, 3)
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])

        # dropout layer
        with tf.name_scope('dropout'):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)

        # 计算输出的概率
        with tf.name_scope('output'):
            W = tf.get_variable('W',
                                shape=[num_filters_total, num_classes],  #
                                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes], name='b'))
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)

            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name='scores')
            # using softmax to normolize output
            # self.scores = tf.nn.softmax(logits=self.scores)
            # 找到概率最大的类别
            self.predictions = tf.argmax(self.scores, 1, name='predictons')

        # 计算损失
        with tf.name_scope('loss'):
            # 计算交叉熵损失
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            # 总共的损失为交叉熵均值 和 l2正则损失
            self.loss = tf.reduce_mean(losses) + l2_loss * l2_reg_lambda

        # 计算正确率
        with tf.name_scope('accuracy'):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32), name='accuracy')

In [6]:
# Parameters
# ==================================================

# Data loading params
tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")
tf.flags.DEFINE_string("file_path", "thu_data_3class_3w", "Data source.")
tf.flags.DEFINE_integer("num_classes", 3, "number classes of datasets.")

# Model Hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 200, "Dimensionality of character embedding (default: 128)")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes (default: '3,4,5')")
tf.flags.DEFINE_integer("num_filters", 200, "Number of filters per filter size (default: 128)")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda (default: 0.0)")

# Training parameters
tf.flags.DEFINE_integer("batch_size", 32, "Batch Size (default: 64)")
tf.flags.DEFINE_integer("num_epochs", 100, "Number of training epochs (default: 200)")
tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate model on dev set after this many steps (default: 100)")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save model after this many steps (default: 100)")
tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to store (default: 5)")
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print('\nParameters')
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")


Parameters
ALLOW_SOFT_PLACEMENT=True
BATCH_SIZE=32
CHECKPOINT_EVERY=100
DEV_SAMPLE_PERCENTAGE=0.1
DROPOUT_KEEP_PROB=0.5
EMBEDDING_DIM=200
EVALUATE_EVERY=100
FILE_PATH=thu_data_3class_3w
FILTER_SIZES=3,4,5
L2_REG_LAMBDA=0.0
LOG_DEVICE_PLACEMENT=False
NUM_CHECKPOINTS=5
NUM_CLASSES=3
NUM_EPOCHS=100
NUM_FILTERS=200



In [7]:
# 数据准备
print('load data...')
text_x, y = split_data_with_label(FLAGS.file_path, FLAGS.num_classes)

# build vocabulary
max_doc_length = 400#max([len(x.split(' ')) for x in text_x])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_doc_length)
# x是训练数据，是一个[样本数量,句子长度] 的二维数组
x = np.array(list(vocab_processor.fit_transform(text_x)))
y = np.array(y)
# Randomly shuffle data 随机打乱数据
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set  切分训练和测试数据集
# TODO: This is very crude, should use cross-validation 需要改成交叉验证的方式
# 简单的设置训练数据和测试数据的比例
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
# 选取后dev_samle_index个为测试数据，前面为训练数据
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
# 输出训练数据和测试数据的样本信息，数据集信息
print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))

load data...
Vocabulary Size: 350081
Train/Dev split: 81000/9000


In [None]:
with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # 构建cnn 节点
        cnn = TextCNN(
            seq_length=x_train.shape[1],
            num_classes=y_train.shape[1],
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
            l2_reg_lambda=FLAGS.l2_reg_lambda)

        # Define Training procedure
        global_step = tf.Variable(0, name="global_step", trainable=False)
        # 优化算法
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)

        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

        # Keep track of gradient values and sparsity (optional)
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name),
                                                     tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)

        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))

        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

        # Train Summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

        # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

        # Write vocabulary
        vocab_processor.save(os.path.join(out_dir, "vocab"))

        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        def train_step(x_batch, y_batch):
            """
            A single training step
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
            }
            # 执行 节点操作
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)

            time_str = datetime.datetime.now().isoformat()
            if step % 20 == 0:
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)

        def dev_step(x_batch, y_batch, writer=None):
            """
            Evaluates model on a dev set
            """
            feed_dict = {
              cnn.input_x: x_batch,
              cnn.input_y: y_batch,
              cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)
            return loss, accuracy
        # Generate batches
        batches = batch_iter(
            list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
        # Training loop. For each batch...
        best_acc = 0.0
        best_step = 0
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            # 更新全局步数
            current_step = tf.train.global_step(sess, global_step)
            # 计算评估结果
            if current_step % FLAGS.evaluate_every == 0:
                print("\nEvaluation:")
                loss_, accuracy_ = dev_step(x_dev, y_dev, writer=dev_summary_writer)
                if accuracy_ > best_acc:
                    best_acc = accuracy_
                    best_step = current_step
                print("")
            # 保存模型计算结果
            if current_step % FLAGS.checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))

        print('\nBset dev at {}, accuray {:g}'.format(best_step, best_acc))

INFO:tensorflow:Summary name embedding/W:0/grad/hist is illegal; using embedding/W_0/grad/hist instead.
INFO:tensorflow:Summary name embedding/W:0/grad/sparsity is illegal; using embedding/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/hist is illegal; using conv-maxpool-3/W_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-3/W:0/grad/sparsity is illegal; using conv-maxpool-3/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/hist is illegal; using conv-maxpool-3/b_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-3/b:0/grad/sparsity is illegal; using conv-maxpool-3/b_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-4/W:0/grad/hist is illegal; using conv-maxpool-4/W_0/grad/hist instead.
INFO:tensorflow:Summary name conv-maxpool-4/W:0/grad/sparsity is illegal; using conv-maxpool-4/W_0/grad/sparsity instead.
INFO:tensorflow:Summary name conv-maxpool-4/b:0/grad/hist is illegal; using 

2017-12-12T10:45:01.882649: step 1300, loss 0.423205, acc 0.9375

Evaluation:
2017-12-12T10:45:25.587125: step 1300, loss 0.109129, acc 0.966889

Saved model checkpoint to /home/dmml1/git/textCNN/runs/1513043440/checkpoints/model-1300

2017-12-12T10:46:12.348527: step 1320, loss 0.10329, acc 0.9375
2017-12-12T10:46:58.211606: step 1340, loss 0.946616, acc 0.75
2017-12-12T10:47:43.815007: step 1360, loss 0.158843, acc 0.9375
2017-12-12T10:48:29.495078: step 1380, loss 0.32011, acc 0.875
2017-12-12T10:49:14.874329: step 1400, loss 0.154785, acc 0.9375

Evaluation:
2017-12-12T10:49:38.490221: step 1400, loss 0.113806, acc 0.967333

Saved model checkpoint to /home/dmml1/git/textCNN/runs/1513043440/checkpoints/model-1400

2017-12-12T10:50:25.280522: step 1420, loss 0.208574, acc 0.96875
2017-12-12T10:51:10.879004: step 1440, loss 0.224755, acc 0.96875
2017-12-12T10:51:56.187593: step 1460, loss 0.36077, acc 0.90625
2017-12-12T10:52:41.413304: step 1480, loss 0.246259, acc 0.96875
2017-12-12

2017-12-12T11:54:16.970811: step 2940, loss 0.0428275, acc 1
2017-12-12T11:55:02.275402: step 2960, loss 0.140087, acc 0.96875
2017-12-12T11:55:47.553151: step 2980, loss 0.0898246, acc 0.96875
2017-12-12T11:56:32.923586: step 3000, loss 0.162173, acc 0.96875

Evaluation:
2017-12-12T11:56:56.568308: step 3000, loss 0.102341, acc 0.970778

Saved model checkpoint to /home/dmml1/git/textCNN/runs/1513043440/checkpoints/model-3000

2017-12-12T11:57:42.936454: step 3020, loss 0.0381542, acc 0.96875
2017-12-12T11:58:28.234287: step 3040, loss 0.0782641, acc 0.96875
2017-12-12T11:59:13.441237: step 3060, loss 0.0601453, acc 0.9375
2017-12-12T11:59:58.793481: step 3080, loss 0.0120661, acc 1
2017-12-12T12:00:44.345719: step 3100, loss 0.148714, acc 0.90625

Evaluation:
2017-12-12T12:01:08.099256: step 3100, loss 0.0786221, acc 0.978111

Saved model checkpoint to /home/dmml1/git/textCNN/runs/1513043440/checkpoints/model-3100

2017-12-12T12:01:54.575748: step 3120, loss 0.0154333, acc 1
2017-12-1

Saved model checkpoint to /home/dmml1/git/textCNN/runs/1513043440/checkpoints/model-4600

2017-12-12T13:04:52.845266: step 4620, loss 0.150601, acc 0.9375
2017-12-12T13:05:38.040642: step 4640, loss 0.0451296, acc 0.96875
2017-12-12T13:06:23.182460: step 4660, loss 0.0518525, acc 0.96875
2017-12-12T13:07:08.544435: step 4680, loss 0.0367042, acc 0.96875
2017-12-12T13:07:54.063378: step 4700, loss 0.117453, acc 0.96875

Evaluation:
2017-12-12T13:08:17.796303: step 4700, loss 0.0826418, acc 0.978667

Saved model checkpoint to /home/dmml1/git/textCNN/runs/1513043440/checkpoints/model-4700

2017-12-12T13:09:04.253666: step 4720, loss 0.351754, acc 0.9375
2017-12-12T13:09:49.672141: step 4740, loss 0.0875857, acc 0.96875
2017-12-12T13:10:34.963035: step 4760, loss 0.0271093, acc 1
2017-12-12T13:11:20.120201: step 4780, loss 0.00830471, acc 1
2017-12-12T13:12:05.334681: step 4800, loss 0.00305103, acc 1

Evaluation:
2017-12-12T13:12:29.053127: step 4800, loss 0.0797222, acc 0.979778

Saved m

2017-12-12T14:15:05.671815: step 6300, loss 0.00165608, acc 1

Evaluation:
2017-12-12T14:15:29.350633: step 6300, loss 0.11378, acc 0.979778

Saved model checkpoint to /home/dmml1/git/textCNN/runs/1513043440/checkpoints/model-6300

2017-12-12T14:16:15.926692: step 6320, loss 0.0829345, acc 0.96875
2017-12-12T14:17:01.232980: step 6340, loss 2.32561e-05, acc 1
2017-12-12T14:17:46.444902: step 6360, loss 0.00636644, acc 1
2017-12-12T14:18:31.647341: step 6380, loss 0.001327, acc 1
2017-12-12T14:19:17.035504: step 6400, loss 0.272757, acc 0.96875

Evaluation:
2017-12-12T14:19:40.781335: step 6400, loss 0.101085, acc 0.980333

Saved model checkpoint to /home/dmml1/git/textCNN/runs/1513043440/checkpoints/model-6400

2017-12-12T14:20:27.402373: step 6420, loss 0.0728991, acc 0.9375
2017-12-12T14:21:12.857030: step 6440, loss 0.0768924, acc 0.96875
2017-12-12T14:21:57.962236: step 6460, loss 4.37738e-05, acc 1
2017-12-12T14:22:43.155774: step 6480, loss 0.00233367, acc 1
2017-12-12T14:23:28.7

2017-12-12T15:24:49.329952: step 7960, loss 0.155248, acc 0.96875
2017-12-12T15:25:34.413577: step 7980, loss 0.00340103, acc 1
2017-12-12T15:26:19.689577: step 8000, loss 0.00498521, acc 1

Evaluation:
2017-12-12T15:26:43.522927: step 8000, loss 0.0811574, acc 0.983333

Saved model checkpoint to /home/dmml1/git/textCNN/runs/1513043440/checkpoints/model-8000

2017-12-12T15:27:29.625732: step 8020, loss 0.00138427, acc 1
2017-12-12T15:28:14.745066: step 8040, loss 0.00613927, acc 1
2017-12-12T15:29:00.134568: step 8060, loss 0.0126915, acc 1
2017-12-12T15:29:45.201820: step 8080, loss 0.578137, acc 0.90625
2017-12-12T15:30:30.427926: step 8100, loss 0.00108275, acc 1

Evaluation:
2017-12-12T15:30:54.036686: step 8100, loss 0.085702, acc 0.983556

Saved model checkpoint to /home/dmml1/git/textCNN/runs/1513043440/checkpoints/model-8100

2017-12-12T15:31:40.139261: step 8120, loss 0.000100688, acc 1
2017-12-12T15:32:25.458836: step 8140, loss 0.149451, acc 0.96875
2017-12-12T15:33:10.55882