In [1]:
import tensorflow as tf
import numpy as np
import random
import math
import collections

In [26]:
def load_data(data_path):
    """
    载入数据
    """
    data= []
    labels = []
    max_sentence_len = 0
    with open(data_path, 'r') as f:
        for line in f:
            line_list = line.split('\t')
            one_data = line_list[1].split(' ')
            tmp_len = len(one_data)
            if tmp_len > max_sentence_len:
                max_sentence_len = tmp_len
            data.append(one_data)
            labels.append(int(line_list[2]))
        f.close()
    print("max sentence length: ", max_sentence_len)
    return data, labels

data_path = '../data/seg_sample_train.txt'
data, labels = load_data(data_path)

max sentence length:  20420


In [6]:
from itertools import groupby

def show_text_len_distribution(data):
    len_list = [len(text) for text in data]
#     print(len_list[1:100])
    step = 500
    for k, g in groupby(sorted(len_list), key=lambda x: (x-1)//step):
    #    dic['{}-{}'.format(k*step+1, (k+1)*step)] = len(list(g))
        print('{}-{}'.format(k*step+1, (k+1)*step)+":"+str(len(list(g))))
show_text_len_distribution(data)

1-500:6015
501-1000:2406
1001-1500:647
1501-2000:303
2001-2500:206
2501-3000:125
3001-3500:70
3501-4000:55
4001-4500:37
4501-5000:33
5001-5500:19
5501-6000:17
6001-6500:9
6501-7000:7
7001-7500:5
7501-8000:6
8001-8500:7
8501-9000:4
9001-9500:7
9501-10000:5
10501-11000:2
11501-12000:1
12001-12500:2
12501-13000:3
14501-15000:3
15001-15500:1
16001-16500:1
17001-17500:1
17501-18000:2
20001-20500:1


In [22]:
def build_voabulary(data, vocabulary_size=50000):
    """
    基于所有数据构建词表
    """
    count = [['UNK', -1]]
    words = []
    for line in data:
        words.extend(line)
    for line in data:
        words.extend(line)
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dict_word2index = dict()
    for word, _ in count:
        dict_word2index[word] = len(dict_word2index)
    dict_index2word = dict(zip(dict_word2index.values(), dict_word2index.keys()))
    
    return  count, dict_word2index, dict_index2word

count, dict_word2index, dict_index2word = build_voabulary(data, vocabulary_size=100000)

In [10]:
print("Most 10 common words: ", count[:20])

Most 10 common words:  [['UNK', -1], ('，', 1035242), ('。', 478414), ('、', 428540), ('的', 356810), ('被告人', 279828), ('月', 222592), ('年', 206542), ('某某', 190734), ('日', 156288), ('在', 112568), ('2014', 111140), ('了', 104408), ('被', 102574), ('元', 100762), ('）', 84050), ('（', 84018), ('人民检察院', 72434), ('于', 72304), ('公诉', 67326)]


In [45]:
def build_dataset(data, labels, dict_word2index, max_sentence_len=1000, label_size=8):
    """
    基于词表构建数据集（数值化）
    """
    dataset = []
    indices = np.arange(len(labels))
    np.random.shuffle(indices)
    new_labels = []
    for i in indices:
        one_label = [0] * label_size
        one_label[labels[i]-1] = 1
        new_labels.append(one_label) 
        new_line = []
        for word in data[i]:
            if word in dict_word2index:
                index = dict_word2index[word]
            else:
                index = 0    # UNK
            new_line.append(index)
        
        zero_num = max_sentence_len - len(new_line)
        while zero_num > 0:
            new_line.append(0)
            zero_num -= 1
        dataset.append(new_line[:max_sentence_len])
#     return dataset, new_labels
    return np.array(dataset, dtype=np.int32), np.array(new_labels, dtype=np.int32)

train_data, train_labels = build_dataset(data, labels, dict_word2index, max_sentence_len=1000)

In [40]:
print(len(train_data[4]))
print(train_labels[4])

2000
[0 0 0 0 0 0 1 0]


In [46]:
def split_data(data, radio=0.7):
    """
    将训练集分给为训练集和检验集
    """
    split_index = int(len(data) * 0.7)
    new_data1 = data[ : split_index]
    new_data2 = data[split_index : ]
    return new_data1, new_data2

train_X, valid_X = split_data(train_data)
train_y, valid_y = split_data(train_labels)
print(train_X.shape)
print(train_y.shape)

(7000, 1000)
(7000, 8)


In [42]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

In [47]:
batch_size = 16
embedding_size = 300 # Dimension of the embedding vector.

# num_sampled = 4 # Number of negative examples to sample.
max_sentence_len = 1000
vocabulary_size = 100000
label_size = 8
graph = tf.Graph()
    
with graph.as_default(), tf.device('/cpu:0'):

  # Input data.
  tf_train_dataset = tf.placeholder(tf.int32, shape=[batch_size, max_sentence_len])
  tf_train_labels = tf.placeholder(tf.int32, shape=[batch_size,label_size])
  tf_valid_dataset = tf.constant(valid_X, dtype=tf.int32)
  
  # Variables.
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  softmax_weights = tf.Variable(
    tf.truncated_normal([label_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([label_size]))
  
  def model(data): 
    # Model.
    # Look up embeddings for inputs.
    # embedding_lookup()用法: http://blog.csdn.net/u013041398/article/details/60955847
    embed = tf.nn.embedding_lookup(embeddings, data)
    # Compute the softmax loss, using a sample of the negative labels each time.
    sentence_embed = tf.reduce_mean(embed, axis=1)
    
    return tf.matmul(sentence_embed, tf.transpose(softmax_weights)) + softmax_biases


  logits = model(tf_train_dataset)
    
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))

  # Optimizer.
  global_step = tf.Variable(0, trainable=False)
  learning_rate = tf.train.exponential_decay(0.01, global_step, 1000, 0.9, staircase=True)
  optimizer = tf.contrib.layers.optimize_loss(loss, global_step=global_step,learning_rate=learning_rate, optimizer="Adam")

  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
#   test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [48]:
num_steps = 2001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()

  for step in range(num_steps):
    offset = (step * batch_size) % (train_y.shape[0] - batch_size)
    batch_data = train_X[offset:(offset + batch_size)]
    batch_labels = train_y[offset:(offset + batch_size)]

    feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
    _, l,tp = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
    #print("loss:",l,"acc:",a,"label:",batch_labels,"prediction:",p)
    if (step % 100 == 0):
        train_acc = accuracy(tp, batch_labels)
        print("loss:", l, "train accuracy:", train_acc)
        if step > num_steps * 0.7:
            print( "valid accuracy:",accuracy(valid_prediction.eval(), valid_y))

loss: 2.14567 train accuracy: 0.0
loss: 1.86938 train accuracy: 25.0
loss: 1.4429 train accuracy: 56.25
loss: 1.83364 train accuracy: 37.5
loss: 1.55431 train accuracy: 37.5
loss: 1.69195 train accuracy: 31.25
loss: 1.68388 train accuracy: 37.5
loss: 1.0841 train accuracy: 43.75
loss: 1.307 train accuracy: 50.0
loss: 0.893233 train accuracy: 56.25
loss: 0.795051 train accuracy: 81.25
loss: 0.812872 train accuracy: 75.0
loss: 0.565707 train accuracy: 81.25
loss: 0.473443 train accuracy: 87.5
loss: 0.498563 train accuracy: 81.25
loss: 0.424316 train accuracy: 87.5
valid accuracy: 37.9333333333
loss: 0.239726 train accuracy: 93.75
valid accuracy: 39.2
loss: 0.212022 train accuracy: 100.0
valid accuracy: 37.9333333333
loss: 0.266965 train accuracy: 93.75
valid accuracy: 37.0333333333
loss: 0.157774 train accuracy: 100.0
valid accuracy: 38.2
loss: 0.219489 train accuracy: 93.75
valid accuracy: 37.6


## 经验总结(仅仅针对采样数据，由于量较少，可能不适用甚至相反)
- 适当增加迭代次数可以提高准确度 （1001 -> 2001)
- max_sentence_len: 1000 -> 2000 , acc: 40.3% -> 39.8%
- embedding_size: 128 -> 300, acc: 40.3% -> 37%  特征表示变强，训练集准确度变高，但由于数据集过少，过拟合了