# 词向量

## Step1: 导入需要的包

In [1]:
import paddle
import paddle.fluid as fluid
import six
import numpy as np
import math

## Step2: 定义神经网络

### 2.1 定义参数

In [2]:
# 设置embedding维度
EMBED_SIZE = 32

# 设置隐层大小
HIDDEN_SIZE = 256

# 设置ngram大小
# 表示在PaddlePaddle训练时，每条数据的前4个词用来预测第5个词。
N = 5

# 设置batch大小
BATCH_SIZE = 100

# 设置训练轮数
EPOCH_NUM = 100

In [3]:
# 设置是否使用GPU训练
use_cuda = False

- 更大的BATCH_SIZE将使得训练更快收敛，但也会消耗更多内存。
- 词向量计算规模较大，如果环境允许，开启使用GPU进行训练，能更快得到结果。
- PaddlePaddle提供了一个内置的方法fluid.embedding，可以直接用它来构造 N-gram 神经网络。

In [4]:
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)

- 这里使用Penn Treebank （PTB）（经Tomas Mikolov预处理过的版本）数据集。
- PTB数据集较小，训练速度快，应用于Mikolov的公开语言模型训练工具中
- PaddlePaddle提供了对应PTB数据集的python包`paddle.dataset.imikolov`，自动做数据的下载与预处理。
- 每个输入会按其单词次在字典里的位置，转化成整数的索引序列，作为PaddlePaddle的输入。

In [5]:
print(dict_size)

2074


In [6]:
word_dict[b'among']

212

In [7]:
word_dict[b'a']

7

In [8]:
word_dict[b'group']

97

In [9]:
word_dict[b'of']

5

In [10]:
word_dict

{b'the': 0,
 b'<unk>': 1,
 '<e>': 2,
 '<s>': 3,
 b'N': 4,
 b'of': 5,
 b'to': 6,
 b'a': 7,
 b'in': 8,
 b'and': 9,
 b"'s": 10,
 b'for': 11,
 b'that': 12,
 b'$': 13,
 b'is': 14,
 b'it': 15,
 b'said': 16,
 b'on': 17,
 b'at': 18,
 b'by': 19,
 b'as': 20,
 b'from': 21,
 b'million': 22,
 b'with': 23,
 b'mr.': 24,
 b'was': 25,
 b'be': 26,
 b'its': 27,
 b'are': 28,
 b'he': 29,
 b'but': 30,
 b'has': 31,
 b'an': 32,
 b"n't": 33,
 b'have': 34,
 b'will': 35,
 b'new': 36,
 b'or': 37,
 b'company': 38,
 b'they': 39,
 b'this': 40,
 b'which': 41,
 b'year': 42,
 b'would': 43,
 b'about': 44,
 b'market': 45,
 b'says': 46,
 b'more': 47,
 b'were': 48,
 b'had': 49,
 b'billion': 50,
 b'his': 51,
 b'their': 52,
 b'up': 53,
 b'one': 54,
 b'u.s.': 55,
 b'than': 56,
 b'stock': 57,
 b'who': 58,
 b'been': 59,
 b'some': 60,
 b'also': 61,
 b'other': 62,
 b'share': 63,
 b'not': 64,
 b'we': 65,
 b'corp.': 66,
 b'when': 67,
 b'if': 68,
 b'shares': 69,
 b'last': 70,
 b'all': 71,
 b'i': 72,
 b'president': 73,
 b'years': 74,

### 2.2 构建网络

- 构建N-gram 神经网络结构，这个结构在训练和预测中都会使用到。
- 因为词向量比较稀疏，传入参数 is_sparse == True, 可以加速稀疏矩阵的更新。

In [11]:
def predict_program(words, is_sparse):
    
    embed_first = fluid.embedding(
        input = words[0],
        size = [dict_size, EMBED_SIZE],
        dtype = 'float32',
        is_sparse= is_sparse,
        param_attr = 'shared_w')
    
    embed_second = fluid.embedding(
        input = words[1],
        size = [dict_size, EMBED_SIZE],
        dtype = 'float32',
        is_sparse = is_sparse,
        param_attr = 'shared_w')
    
    embed_third = fluid.embedding(
        input = words[2],
        size = [dict_size, EMBED_SIZE],
        dtype = 'float32',
        is_sparse = is_sparse,
        param_attr = 'shared_w')
    
    embed_fourth = fluid.embedding(
        input = words[3],
        size = [dict_size, EMBED_SIZE],
        dtype = 'float32',
        is_sparse = is_sparse,
        param_attr = 'shared_w')
    
    concat_embed = fluid.layers.concat(
        input = [embed_first, embed_second, embed_third, embed_fourth], axis=1)
    
    hidden_1 = fluid.layers.fc(input = concat_embed,
                                size = HIDDEN_SIZE,
                                act = 'sigmoid')
    
    predict_word = fluid.layers.fc(input=hidden_1, size = dict_size, act = 'softmax')
    return predict_word

## Step3: 模型训练

基于前面构建的神经网络结构，定义训练方法

In [12]:
def train_program(predict_word):
    # next_word 的定义必须要在 predict_program 的声明之后
    # 否则train program输入数据的顺序就变成了[next_word, firstw, secondw, thirdw, fourthw], 这是不正确的。
    next_word = fluid.data(name = 'nextw', shape = [None, 1], dtype = 'int64')
    cost = fluid.layers.cross_entropy(input = predict_word, label = next_word)
    avg_cost = fluid.layers.mean(cost)
    return avg_cost

In [13]:
def optimizer_func():
    return fluid.optimizer.AdagradOptimizer(
        learning_rate = 3e-3,
        regularization = fluid.regularizer.L2DecayRegularizer(8e-4))

- PaddlePaddle中有现成的训练和测试集：`paddle.dataset.imikolov.train()`和`paddle.dataset.imikolov.test()`。两者都会返回一个读取器。
- 在PaddlePaddle中，读取器是一个Python的函数，每次调用，会读取下一条数据。它是一个Python的generator。
- `paddle.batch` 会读入一个读取器，然后输出一个批次化了的读取器。

In [14]:
def train(if_use_cuda, params_dirname, is_sparse = True):
    place = fluid.CUDAPlace(0) if if_use_cuda else fluid.CPUPlace()
    
    train_reader = paddle.batch(
        paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
    
    test_reader = paddle.batch(
        paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)
    
    first_word = fluid.data(name = 'firstw', shape = [None, 1], dtype = 'int64')
    second_word = fluid.data(name = 'secondw', shape = [None, 1], dtype = 'int64')
    third_word = fluid.data(name = 'thirdw', shape = [None, 1], dtype = 'int64')
    fourth_word = fluid.data(name = 'fourthw', shape = [None, 1], dtype = 'int64')
    next_word = fluid.data(name = 'nextw', shape = [None, 1], dtype = 'int64')
    
    word_list = [first_word, second_word, third_word, fourth_word, next_word]
    feed_order = ['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']
    
    main_program = fluid.default_main_program()
    start_program = fluid.default_startup_program()
    
    predict_word = predict_program(word_list, is_sparse)
    avg_cost = train_program(predict_word)
    test_program = main_program.clone(for_test = True)
    
    sgd_optimizer = optimizer_func()
    sgd_optimizer.minimize(avg_cost)
    
    exe = fluid.Executor(place)
    

    def train_test(program, reader):
        count = 0
        feed_var_list = [
            program.global_block().var(var_name) for var_name in feed_order
        ]

        feeder_test = fluid.DataFeeder(feed_list = feed_var_list, place = place)
        test_exe = fluid.Executor(place)
        accumulated = len([avg_cost]) * [0]

        for test_data in reader():
            avg_cost_np = test_exe.run(
                program = program,
                feed = feeder_test.feed(test_data),
                fetch_list = [avg_cost])

            accumulated = [
                x[0] + x[1][0] for x in zip(accumulated, avg_cost_np)
            ]

            count += 1

            return [x / count for x in accumulated]
    
    
    def train_loop():
        step = 0
        feed_var_list_loop = [
            main_program.global_block().var(var_name) for var_name in feed_order
        ]

        feeder = fluid.DataFeeder(feed_list = feed_var_list_loop, place = place)
        exe.run(start_program)

        for epoch_id in range(EPOCH_NUM):
            for data in train_reader():
                avg_cost_np = exe.run(
                    main_program, feed = feeder.feed(data), fetch_list = [avg_cost])

                if step % 10 == 0:
                    outs = train_test(test_program, test_reader)

                    print("Step %d: , Average Cost: %f" % (step, outs[0]))

                    if outs[0] < 5.8:
                        if params_dirname is not None:
                            fluid.io.save_inference_model(params_dirname,
                                                         ['firstw', 'secondw', 'thirdw', 'fourthw'],
                                                         [predict_word],
                                                         exe)
                        return    
                step += 1
                if math.isnan(float(avg_cost_np[0])):
                    sys.exit("Got NaN loss, training failed.")

        raise AssertionError("Cost is too large {0: 2.2}".format(avg_cost_np[0]))
        
    train_loop()

In [15]:
# train(use_cuda, "word2vec.inference.model")

## Step4: 模型应用

用之前训练过的模型，在得知之前的 N-gram 后，预测下一个词。

In [16]:
def predict(use_cuda, params_dirname=None):
    place = fluid.CUADPlace(0) if use_cuda else fluid.CPUPlace()
    
    exe = fluid.Executor(place)
    
    prediction_scope = fluid.core.Scope()
    
    with fluid.scope_guard(prediction_scope):
        # 使用fluid.io.load_inference_model获取predict program，
        # feed变量的名称feed_target_names和从scope中fetch的对象fetch_targets
        [predictor, feed_target_names, fetch_targets] = fluid.io.load_inference_model(params_dirname, exe)
        
        # 设置输入，用四个LoDTensor来表示4个词语。这里每个词都是一个id，
        # 用来查询embedding表获取对应的词向量，因此其形状大小是[1]。
        # recursive_sequence_lengths设置的是基于长度的LoD，因此都应该设为[[1]]
        # 注意recursive_sequence_lengths是列表的列表
        data_1 = np.asarray([[212]], dtype = np.int64)  # 'among'
        data_2 = np.asarray([[7]],dtype = np.int64)  # 'a'
        data_3 = np.asarray([[97]], dtype = np.int64)  # 'group'
        data_4 = np.asarray([[5]], dtype = np.int64)  # 'of'
        lod = np.asarray([[1]], dtype = np.int64)
        
        first_word = fluid.create_lod_tensor(data_1, lod, place)
        second_word = fluid.create_lod_tensor(data_2, lod, place)
        third_word = fluid.create_lod_tensor(data_3, lod, place)
        fourth_word = fluid.create_lod_tensor(data_4, lod, place)
        
        assert feed_target_names[0] == 'firstw'
        assert feed_target_names[1] == 'secondw'
        assert feed_target_names[2] == 'thirdw'
        assert feed_target_names[3] == 'fourthw'
        
        
        # 构造feed词典 {feed_target_name: feed_target_data}
        # 预测结果包含在results之中
        results = exe.run(
            predictor,
            feed = {
                feed_target_names[0]: first_word,
                feed_target_names[1]: second_word,
                feed_target_names[2]: third_word,
                feed_target_names[3]: fourth_word
            },
            fetch_list = fetch_targets,
            return_numpy = False)
        
        print()
        print(np.array(results[0]))
        
        most_possible_word_index = np.argmax(results[0])
        print(most_possible_word_index)
        
        print([str(key,encoding='utf-8') for key, value in six.iteritems(word_dict) 
               if value == most_possible_word_index][0])

整个程序的入口

In [17]:
def main(use_cuda, is_sparse):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    
    params_dirname = 'word2vec.inference.model'
    
    train(
        if_use_cuda=use_cuda,
        params_dirname=params_dirname,
        is_sparse=is_sparse)
    
    predict(use_cuda=use_cuda, params_dirname=params_dirname)

In [18]:
main(use_cuda=use_cuda, is_sparse=True)

Step 0: , Average Cost: 7.231980
Step 10: , Average Cost: 5.767432

[[0.01591288 0.01158787 0.00956775 ... 0.00028631 0.00035623 0.01320103]]
0
the


其中第一行表示预测词在词典上的概率分布，第二行表示概率最大的词对应的id，第三行表示概率最大的词。