# fastText

In [1]:
import tensorflow as tf
import numpy as np
import random
import math
import csv
import collections

In [2]:
def load_data(data_path):
    """
    Load a raw CSV dataset and whitespace-tokenize every row.

    Each CSV row is read as: column 0 = integer class label, columns 1 and 2 =
    two text fields. Both text fields are split on whitespace and concatenated
    (field 1 tokens first) into a single token list for that sample.

    Args:
        data_path: path to a UTF-8 encoded CSV file.

    Returns:
        (data, labels) where ``data`` is a list of token lists and ``labels``
        is a list of ints, both in file order.

    Raises:
        IndexError: if a row has fewer than three columns.
        ValueError: if column 0 cannot be parsed as an int.

    Side effect: prints the length (in tokens) of the longest sample.
    """
    data = []
    labels = []
    max_sentence_len = 0
    # `with` guarantees the handle is closed (the reader used to keep an
    # anonymous open file alive); newline='' is what the csv module expects
    # so that quoted fields containing line breaks are parsed correctly.
    with open(data_path, encoding='utf-8', newline='') as csv_file:
        for line in csv.reader(csv_file):
            one_data = line[1].split()
            one_data.extend(line[2].split())
            max_sentence_len = max(max_sentence_len, len(one_data))
            data.append(one_data)
            labels.append(int(line[0]))
    print("max sentence length: ", max_sentence_len)
    return data, labels

# Load and tokenize both splits; each load_data call prints the longest
# sentence length (in tokens) found in that file.
train_data, train_labels = load_data("./data/ag_news_csv/train.csv")
test_data, test_labels = load_data("./data/ag_news_csv/test.csv")
# Sanity check: show the first three tokenized samples with their labels,
# then the number of samples in each split.
print(train_data[:3])
print(train_labels[:3])
print("Length of train data: ", len(train_data))
print("Length of test data: ", len(test_data))

max sentence length:  177
max sentence length:  137
[['Wall', 'St.', 'Bears', 'Claw', 'Back', 'Into', 'the', 'Black', '(Reuters)', 'Reuters', '-', 'Short-sellers,', 'Wall', "Street's", 'dwindling\\band', 'of', 'ultra-cynics,', 'are', 'seeing', 'green', 'again.'], ['Carlyle', 'Looks', 'Toward', 'Commercial', 'Aerospace', '(Reuters)', 'Reuters', '-', 'Private', 'investment', 'firm', 'Carlyle', 'Group,\\which', 'has', 'a', 'reputation', 'for', 'making', 'well-timed', 'and', 'occasionally\\controversial', 'plays', 'in', 'the', 'defense', 'industry,', 'has', 'quietly', 'placed\\its', 'bets', 'on', 'another', 'part', 'of', 'the', 'market.'], ['Oil', 'and', 'Economy', 'Cloud', "Stocks'", 'Outlook', '(Reuters)', 'Reuters', '-', 'Soaring', 'crude', 'prices', 'plus', 'worries\\about', 'the', 'economy', 'and', 'the', 'outlook', 'for', 'earnings', 'are', 'expected', 'to\\hang', 'over', 'the', 'stock', 'market', 'next', 'week', 'during', 'the', 'depth', 'of', 'the\\summer', 'doldrums.']]
[3, 3, 3]


In [3]:
def build_voabulary(train_data, test_data, vocabulary_size=50000):
    """
    Build the word <-> index vocabulary from all tokenized sentences.

    Index 0 is reserved for the unknown-word token 'UNK'; the remaining
    ``vocabulary_size - 1`` slots go to the most frequent words, in
    descending frequency order.

    NOTE: the vocabulary is built from the training AND the test sentences,
    exactly as the notebook originally did.

    Args:
        train_data: iterable of token lists.
        test_data: iterable of token lists.
        vocabulary_size: maximum number of entries, including 'UNK'.

    Returns:
        (count, dict_word2index, dict_index2word) where ``count`` is
        ``[['UNK', -1], (word, frequency), ...]`` (the UNK frequency is a
        placeholder and is never updated), ``dict_word2index`` maps each word
        to its position in ``count`` and ``dict_index2word`` is the inverse.
    """
    unk_token = 'UNK'
    # Count incrementally instead of first materializing one giant word list.
    word_counter = collections.Counter()
    for line in train_data:
        word_counter.update(line)
    for line in test_data:
        word_counter.update(line)
    # A literal 'UNK' in the corpus must not get a second entry: it would
    # overwrite the reserved index 0 and make the following word share its
    # index. Such tokens simply map to the reserved UNK slot.
    word_counter.pop(unk_token, None)

    count = [[unk_token, -1]]
    count.extend(word_counter.most_common(vocabulary_size - 1))
    dict_word2index = {word: index for index, (word, _) in enumerate(count)}
    dict_index2word = {index: word for word, index in dict_word2index.items()}

    return count, dict_word2index, dict_index2word

# Build a 100000-entry vocabulary (index 0 = 'UNK') over the train and test
# sentences, then show the head of the frequency table as a sanity check.
count, dict_word2index, dict_index2word = build_voabulary(train_data, test_data, vocabulary_size=100000)
print("Most 10 common words: ", count[:10])

Most 10 common words:  [['UNK', -1], ('the', 188763), ('to', 125072), ('a', 104353), ('of', 103519), ('in', 97687), ('and', 72345), ('on', 58606), ('for', 51494), ('-', 41545)]


In [4]:
def build_dataset(data, labels, dict_word2index, max_sentence_len=200, label_size=4):
    """
    Turn tokenized sentences into a shuffled, fixed-width numeric dataset.

    Samples are shuffled with ``np.random.shuffle`` (global NumPy RNG; seed it
    beforehand for reproducible order). Every sentence becomes a row of
    exactly ``max_sentence_len`` word indices: longer sentences are truncated,
    shorter ones are right-padded with 0. Index 0 is also the 'UNK' index, so
    padding and unknown words are indistinguishable downstream.

    Args:
        data: sequence of token lists.
        labels: sequence of 1-based integer class labels, aligned with
            ``data``; the number of samples is taken from ``len(labels)``.
        dict_word2index: mapping word -> integer index; words missing from it
            map to 0 (UNK).
        max_sentence_len: width of every output row.
        label_size: number of classes (width of the one-hot label rows).

    Returns:
        (dataset, new_labels): int32 arrays of shape
        ``(n, max_sentence_len)`` and ``(n, label_size)``; row ``k`` of both
        arrays belongs to the same (shuffled) sample.

    Raises:
        IndexError: if a label is larger than ``label_size``.
    """
    indices = np.arange(len(labels))
    np.random.shuffle(indices)

    # Pre-allocated zeros double as the padding, and keep the output
    # two-dimensional even when there are no samples.
    dataset = np.zeros((len(indices), max_sentence_len), dtype=np.int32)
    new_labels = np.zeros((len(indices), label_size), dtype=np.int32)

    for row, i in enumerate(indices):
        # Labels are 1-based in the raw data; shift to a 0-based column.
        new_labels[row, labels[i] - 1] = 1
        # Truncate so an over-long sentence cannot produce a ragged row.
        tokens = data[i][:max_sentence_len]
        if tokens:
            dataset[row, :len(tokens)] = [dict_word2index.get(word, 0) for word in tokens]

    return dataset, new_labels

# Numericalize both splits with the shared vocabulary.
# NOTE: train_labels / test_labels are rebound here from the raw integer
# lists to the one-hot arrays returned by build_dataset, so re-run the
# loading cell before re-running this one.
train_dataset, train_labels = build_dataset(train_data, train_labels, dict_word2index)
test_dataset, test_labels = build_dataset(test_data, test_labels, dict_word2index)

# Sanity check on the resulting array shapes.
print(train_dataset.shape)
print(train_labels.shape)

(120000, 200)
(120000, 4)


In [5]:
def split_data(data, radio=0.7):
    """
    Split a sequence into a leading and a trailing part (train / validation).

    No shuffling happens here; the split is purely positional.

    Args:
        data: any sliceable sequence supporting ``len`` (list, ndarray, ...).
        radio: fraction of the items that goes into the first part (the
            parameter name is kept as-is for backward compatibility). It was
            previously ignored in favour of a hard-coded 0.7.

    Returns:
        (first_part, second_part) where ``first_part`` holds the first
        ``int(len(data) * radio)`` items and ``second_part`` the rest.
    """
    split_index = int(len(data) * radio)
    return data[:split_index], data[split_index:]

# Split features and labels at the same position so rows stay aligned
# (both arrays come from the same build_dataset call).
train_X, valid_X = split_data(train_dataset)
train_y, valid_y = split_data(train_labels)
print(train_X.shape)
print(train_y.shape)

(84000, 200)
(84000, 4)


In [6]:
def accuracy(predictions, labels):
    """
    Percentage of rows whose arg-max column agrees between the two arrays.

    Args:
        predictions: 2-D array of per-class scores, one row per sample.
        labels: 2-D array of the same layout (e.g. one-hot targets).

    Returns:
        Accuracy in percent (0.0 - 100.0).
    """
    predicted_classes = np.argmax(predictions, axis=1)
    true_classes = np.argmax(labels, axis=1)
    num_correct = np.sum(predicted_classes == true_classes)
    return 100.0 * num_correct / predictions.shape[0]

### 下面开始构建计算图并进行训练，通过实验发现：
- 将num_steps从1001提高到2001，对模型效果有显著提升
- 在该数据集下，将vocabulary_size分别设置为50000和100000对检验集上的accuracy影响不大

In [7]:
# Hyper-parameters for the baseline (unregularized) fastText-style classifier.
batch_size = 16
embedding_size = 128 # Dimension of the embedding vector.

# num_sampled = 4 # Number of negative examples to sample.
# max_sentence_len / vocabulary_size / label_size mirror the values used when
# the dataset and vocabulary were built above (200 / 100000 / 4).
max_sentence_len = 200
vocabulary_size = 100000
label_size = 4
graph = tf.Graph()
    
with graph.as_default(), tf.device('/cpu:0'):

  # Input data.
  # Training batches are fed through fixed-size placeholders; the whole
  # validation set is baked into the graph as a constant.
  tf_train_dataset = tf.placeholder(tf.int32, shape=[batch_size, max_sentence_len])
  tf_train_labels = tf.placeholder(tf.int32, shape=[batch_size,label_size])
  tf_valid_dataset = tf.constant(valid_X, dtype=tf.int32)
  
  # Variables.
  # One embedding row per vocabulary index, plus a linear softmax layer
  # stored as [label_size, embedding_size] (transposed before use in model()).
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  softmax_weights = tf.Variable(
    tf.truncated_normal([label_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([label_size]))
  
  def model(data): 
    """Return class logits for a batch of word-index rows.

    Looks up an embedding for every index, averages them over axis 1
    (the sentence positions, padding included) and applies the linear
    softmax layer. The same variables are shared by every call.
    """
    # Model.
    # Look up embeddings for inputs.
    # embedding_lookup() usage: http://blog.csdn.net/u013041398/article/details/60955847
    embed = tf.nn.embedding_lookup(embeddings, data)
    # Average the word embeddings of each sentence into one sentence vector.
    sentence_embed = tf.reduce_mean(embed, axis=1)
    
    return tf.matmul(sentence_embed, tf.transpose(softmax_weights)) + softmax_biases


  logits = model(tf_train_dataset)
    
  # Full softmax cross-entropy averaged over the batch.
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
  # The bare string below is not executed as code: it keeps a disabled
  # NCE-loss variant for reference. Its first line notes that label_size=4 is
  # small, so hierarchical softmax is not needed.
  """
  由于label_size=4，比较小，不需要进行Hierarchical softmax
  loss = tf.reduce_mean( #inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward activations of the input network.
                tf.nn.nce_loss(weights=softmax_weights,  #[embed_size, label_size]--->[label_size,embed_size]. nce_weights:A `Tensor` of shape `[num_classes, dim].O.K.
                               biases=softmax_biases,                 #[label_size]. nce_biases:A `Tensor` of shape `[num_classes]`.
                               labels=train_labels,                 #[batch_size,1]. train_labels, # A `Tensor` of type `int64` and shape `[batch_size,num_true]`. The target classes.
                               inputs=sentence_embed,# [None,self.embed_size] #A `Tensor` of shape `[batch_size, dim]`.  The forward activations of the input network.
                               num_sampled=num_sampled,  #scalar. 100
                               num_classes=label_size,partition_strategy="div"))
  """   
  # Optimizer.
  # Adam with a staircase exponential decay: start at 0.01 and multiply by
  # 0.9 every 1000 global steps. optimize_loss returns the training op.
  global_step = tf.Variable(0, trainable=False)
  learning_rate = tf.train.exponential_decay(0.01, global_step, 1000, 0.9, staircase=True)
  optimizer = tf.contrib.layers.optimize_loss(loss, global_step=global_step,learning_rate=learning_rate, optimizer="Adam")

  # Class probabilities for the current training batch and for the whole
  # validation set (second model() call reuses the same variables).
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
#   test_prediction = tf.nn.softmax(model(tf_test_dataset))
#   test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [8]:
# Train the baseline graph built in the previous cell.
num_steps = 2001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()

  for step in range(num_steps):
    # Walk through the training set in consecutive batches; the modulo wraps
    # the offset so that a full batch always fits inside train_X.
    offset = (step * batch_size) % (train_y.shape[0] - batch_size)
    batch_data = train_X[offset:(offset + batch_size)]
    batch_labels = train_y[offset:(offset + batch_size)]

    feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
    # One optimization step; also fetch the batch loss and batch predictions.
    _, l,tp = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
    # Report every 100 steps. Accuracy here is measured on the current
    # mini-batch only, so it is a noisy estimate.
    if (step % 100 == 0):
        train_acc = accuracy(tp, batch_labels)
        print("loss:", l, "train accuracy:", train_acc)
        # Evaluate on the whole validation set only during the last 30% of
        # the run.
        if step > num_steps * 0.7:
            print( "valid accuracy:",accuracy(valid_prediction.eval(), valid_y))

loss: 1.41933 train accuracy: 25.0
loss: 1.34674 train accuracy: 37.5
loss: 1.08823 train accuracy: 62.5
loss: 0.557154 train accuracy: 75.0
loss: 0.476505 train accuracy: 87.5
loss: 0.476347 train accuracy: 75.0
loss: 0.410188 train accuracy: 87.5
loss: 0.553505 train accuracy: 75.0
loss: 0.3854 train accuracy: 93.75
loss: 0.210102 train accuracy: 93.75
loss: 0.105567 train accuracy: 100.0
loss: 0.338261 train accuracy: 81.25
loss: 0.393003 train accuracy: 81.25
loss: 0.253228 train accuracy: 93.75
loss: 0.26827 train accuracy: 87.5
loss: 0.544425 train accuracy: 93.75
valid accuracy: 88.6194444444
loss: 0.185084 train accuracy: 93.75
valid accuracy: 87.0861111111
loss: 0.15021 train accuracy: 93.75
valid accuracy: 90.1055555556
loss: 0.35806 train accuracy: 93.75
valid accuracy: 88.3388888889
loss: 0.51087 train accuracy: 81.25
valid accuracy: 90.2694444444
loss: 0.266149 train accuracy: 87.5
valid accuracy: 90.5388888889


### 下面尝试使用L2 loss对模型进行regularization
- 当同时惩罚参数embeddings和softmax_weights时，即使regularization_rate设置很小0.001，训练结果仍不理想。
    - 猜想：应该是embeddings维度过高，导致惩罚过大
- 若只惩罚softmax_weights，当regularization_rate=0.001时，效果与未正则化类似。
    - 猜想：训练数据并未过拟合，由于batch_size*num_steps=16*2001=32016，还不到训练集规模（84000）的一半
- 只惩罚softmax_weights，当regularization_rate=0.001，batch_size=64，其余不变，效果提升（0.897到0.915）
    - 猜想：效果提升可能只与提高batch_size有关，或者也与正则化有关

In [9]:
# Hyper-parameters for the L2-regularized variant. Compared with the
# baseline graph cell, batch_size is 64 and regularization_rate is added.
batch_size = 64
embedding_size = 128 # Dimension of the embedding vector.

# num_sampled = 4 # Number of negative examples to sample.
max_sentence_len = 200
vocabulary_size = 100000
label_size = 4
# Weight of the L2 penalty added to the cross-entropy loss below.
regularization_rate = 0.001
graph = tf.Graph()
    
with graph.as_default(), tf.device('/cpu:0'):

  # Input data.
  # Training batches are fed through fixed-size placeholders; the whole
  # validation set is baked into the graph as a constant.
  tf_train_dataset = tf.placeholder(tf.int32, shape=[batch_size, max_sentence_len])
  tf_train_labels = tf.placeholder(tf.int32, shape=[batch_size,label_size])
  tf_valid_dataset = tf.constant(valid_X, dtype=tf.int32)
  
  # Variables.
  # One embedding row per vocabulary index, plus a linear softmax layer
  # stored as [label_size, embedding_size] (transposed before use in model()).
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  softmax_weights = tf.Variable(
    tf.truncated_normal([label_size, embedding_size],
                         stddev=1.0 / math.sqrt(embedding_size)))
  softmax_biases = tf.Variable(tf.zeros([label_size]))
  
  def model(data): 
    """Return class logits for a batch of word-index rows.

    Looks up an embedding for every index, averages them over axis 1
    (the sentence positions, padding included) and applies the linear
    softmax layer. The same variables are shared by every call.
    """
    # Model.
    # Look up embeddings for inputs.
    # embedding_lookup() usage: http://blog.csdn.net/u013041398/article/details/60955847
    embed = tf.nn.embedding_lookup(embeddings, data)
    # Average the word embeddings of each sentence into one sentence vector.
    sentence_embed = tf.reduce_mean(embed, axis=1)
    
    return tf.matmul(sentence_embed, tf.transpose(softmax_weights)) + softmax_biases

  logits = model(tf_train_dataset)
  # L2 penalty on the softmax weights only; the embeddings are not penalized.
  l2_loss = tf.nn.l2_loss(softmax_weights)
  # Batch-averaged softmax cross-entropy plus the weighted L2 term.
  loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits)) \
    + regularization_rate * l2_loss

  # Optimizer.
  # Adam with a staircase exponential decay: start at 0.01 and multiply by
  # 0.9 every 1000 global steps. optimize_loss returns the training op.
  global_step = tf.Variable(0, trainable=False)
  learning_rate = tf.train.exponential_decay(0.01, global_step, 1000, 0.9, staircase=True)
  optimizer = tf.contrib.layers.optimize_loss(loss, global_step=global_step,learning_rate=learning_rate, optimizer="Adam")

  # Class probabilities for the current training batch and for the whole
  # validation set (second model() call reuses the same variables).
  train_prediction = tf.nn.softmax(logits)
  valid_prediction = tf.nn.softmax(model(tf_valid_dataset))
#   test_prediction = tf.nn.softmax(model(tf_test_dataset))

In [10]:
# Train the L2-regularized graph built in the previous cell.
num_steps = 2001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()

  for step in range(num_steps):
    # Walk through the training set in consecutive batches; the modulo wraps
    # the offset so that a full batch always fits inside train_X.
    offset = (step * batch_size) % (train_y.shape[0] - batch_size)
    batch_data = train_X[offset:(offset + batch_size)]
    batch_labels = train_y[offset:(offset + batch_size)]

    feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
    # One optimization step; also fetch the batch loss (cross-entropy plus
    # L2 term) and the batch predictions.
    _, l,tp = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
    # Report every 100 steps. Accuracy here is measured on the current
    # mini-batch only, so it is a noisy estimate.
    if (step % 100 == 0):
        train_acc = accuracy(tp, batch_labels)
        print("loss:", l, "train accuracy:", train_acc)
        # Evaluate on the whole validation set only during the last 30% of
        # the run.
        if step > num_steps * 0.7:
            print( "valid accuracy:",accuracy(valid_prediction.eval(), valid_y))

loss: 1.47381 train accuracy: 23.4375
loss: 1.11784 train accuracy: 59.375
loss: 0.680266 train accuracy: 78.125
loss: 0.588923 train accuracy: 84.375
loss: 0.417439 train accuracy: 85.9375
loss: 0.47924 train accuracy: 89.0625
loss: 0.227059 train accuracy: 98.4375
loss: 0.455392 train accuracy: 82.8125
loss: 0.37587 train accuracy: 89.0625
loss: 0.327902 train accuracy: 90.625
loss: 0.286586 train accuracy: 95.3125
loss: 0.2923 train accuracy: 92.1875
loss: 0.291932 train accuracy: 90.625
loss: 0.289714 train accuracy: 93.75
loss: 0.216585 train accuracy: 96.875
loss: 0.279919 train accuracy: 93.75
valid accuracy: 91.5027777778
loss: 0.236183 train accuracy: 93.75
valid accuracy: 91.7083333333
loss: 0.250211 train accuracy: 95.3125
valid accuracy: 91.3583333333
loss: 0.325249 train accuracy: 87.5
valid accuracy: 91.6777777778
loss: 0.243435 train accuracy: 95.3125
valid accuracy: 91.3388888889
loss: 0.242334 train accuracy: 96.875
valid accuracy: 91.5277777778
