In [1]:
import numpy as np
import tensorflow as tf

# 交叉熵计算

In [5]:
# Sparse form: each label is the integer index of the correct class, one label
# per row of logits.
word_labels = tf.constant([2, 0])
# Unnormalised scores for 2 examples over 3 classes.
predict_logits = tf.constant([[2.0, -1.0, 3.0], [1.0, 0.0, -0.5]])

# Per-example cross entropy computed directly from the logits and index labels.
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=word_labels,
                                                      logits=predict_logits)

# Evaluate the graph and print one loss value per example.
with tf.Session() as sess:
    print(sess.run(loss))

[0.32656264 0.4643688 ]


In [7]:
# Discard the graph built by the previous cell before defining new ops.
tf.reset_default_graph()

# Dense form: each label row is a full probability distribution over the
# classes. These rows are one-hot at indices 2 and 0, i.e. the same targets as
# the sparse example.
word_prob_distribution = tf.constant([[0.0, 0.0, 1.0], [1.0, 0.0, 0.0]])
predict_logits = tf.constant([[2.0, -1.0, 3.0], [1.0, 0.0, -0.5]])

# Per-example cross entropy between the label distributions and the logits.
loss = tf.nn.softmax_cross_entropy_with_logits(labels=word_prob_distribution,
                                               logits=predict_logits)

# Evaluate the graph and print one loss value per example.
with tf.Session() as sess:
    print(sess.run(loss))

[0.32656264 0.4643688 ]


# PTB数据集预处理

In [2]:
import codecs
import collections
from operator import itemgetter

RAW_DATA = "../dataset/simple-examples/data/ptb.train.txt"
VOCAB_OUTPUT = "ptb.vocab"

# Tally how often every whitespace-separated token occurs in the raw
# training text; Counter.update adds one per token in the given list.
counter = collections.Counter()
with codecs.open(RAW_DATA, "r", "utf-8") as f:
    for line in f:
        counter.update(line.strip().split())

In [3]:
# Display the word -> frequency table built above.
# NOTE(review): this renders the whole Counter; counter.most_common(10) would
# keep the saved output short.
counter

Counter({'creativity': 11,
         'summary': 7,
         'computer': 420,
         '26-week': 5,
         'pilson': 30,
         'medium': 8,
         'hispanics': 7,
         'inside': 43,
         'mcdonough': 18,
         'complains': 13,
         'labor-management': 27,
         'singled': 8,
         'utah': 20,
         'lynn': 7,
         'arrogant': 6,
         'distorted': 6,
         'crush': 12,
         'die': 20,
         'counterparts': 16,
         'shoot': 6,
         'fda': 26,
         'lesson': 13,
         'phoenix': 29,
         'pit': 11,
         'preclude': 6,
         'helpful': 9,
         'lend': 20,
         'u.s.a': 12,
         'illegal': 45,
         'median': 8,
         'powerhouse': 7,
         'fanfare': 6,
         'insurance': 401,
         'killer': 6,
         'regarded': 20,
         'bolster': 25,
         'temple': 15,
         'exception': 16,
         'lawsuits': 44,
         'las': 32,
         'designs': 17,
         'news': 333,
        

In [4]:
# Rank words from most to least frequent. most_common() with no argument
# returns every (word, count) pair in descending count order, with ties kept
# in first-seen order.
sorted_word_to_cnt = counter.most_common()
# The vocabulary is just the words, in that ranking.
sorted_words = [word for word, count in sorted_word_to_cnt]

In [5]:
# Put the sentence-end marker "<eos>" at the front of the vocabulary.
# Any "<eos>" already present is dropped first so the cell is idempotent:
# re-running it used to prepend another "<eos>" each time, duplicating the
# entry in the vocabulary file and shifting every word's line index.
sorted_words = ["<eos>"] + [w for w in sorted_words if w != "<eos>"]

# Save the vocabulary to VOCAB_OUTPUT, one word per line.
with codecs.open(VOCAB_OUTPUT, "w", "utf-8") as file_output:
    for word in sorted_words:
        file_output.write(word + "\n")

In [6]:
# Convert the training/test text files to word ids using the vocabulary file.
VOCAB = "ptb.vocab"
OUTPUT_DATA = "ptb.train"

# Read the vocabulary back: one word per line, surrounding whitespace removed.
with codecs.open(VOCAB, "r", "utf-8") as f_vocab:
    vocab = [entry.strip() for entry in f_vocab.readlines()]

# A word's id is its position (line index) in the vocabulary list.
word_to_id = {word: idx for idx, word in enumerate(vocab)}

In [7]:
# Words that are not in the vocabulary (e.g. removed low-frequency words) are
# replaced by the id of "<unk>".
def get_id(word):
    """Return the vocabulary id of `word`, or the id of "<unk>" if unknown."""
    if word in word_to_id:
        return word_to_id[word]
    return word_to_id["<unk>"]

In [10]:
def convert_to_ids(raw_path, output_path):
    """Rewrite a raw text file as lines of space-separated word ids.

    Every input line is split on whitespace, "<eos>" is appended as the
    sentence terminator, and each token is mapped through get_id().

    Args:
        raw_path: path of the UTF-8 text file to read.
        output_path: path of the id file to create or overwrite.
    """
    # Context managers close both files even if get_id() or a write raises;
    # the previous explicit close() calls were skipped on any error.
    with codecs.open(raw_path, "r", "utf-8") as fin, \
            codecs.open(output_path, "w", "utf-8") as fout:
        for line in fin:
            words = line.strip().split() + ["<eos>"]  # append sentence-end marker
            fout.write(' '.join([str(get_id(w)) for w in words]) + '\n')


# Later cells read "ptb.train", "ptb.valid" and "ptb.test", so produce all
# three here instead of editing the paths and re-running the cell per split.
# Assumes ptb.valid.txt sits next to the train/test files -- TODO confirm.
# "test" is last so RAW_DATA / OUTPUT_DATA end with the values they had before.
for split in ("train", "valid", "test"):
    OUTPUT_DATA = "ptb." + split
    RAW_DATA = "../dataset/simple-examples/data/ptb." + split + ".txt"
    convert_to_ids(RAW_DATA, OUTPUT_DATA)

In [25]:
# Preview the first ten lines of the encoded training file.
with codecs.open("ptb.train", "r", "utf-8") as f:
    for i, line in enumerate(f, start=1):
        # Each line keeps its trailing newline, so print() leaves a blank line.
        print(line)
        if i == 10:
            break

9999 9994 9985 9973 9987 9992 9983 9979 9977 9970 9989 9986 9991 9974 9993 9997 9988 9972 9980 9975 9996 9981 9976 9998 0

9273 2 3 72 393 33 2123 1 146 19 6 9143 275 408 3 0

23 2 13 142 4 2 5340 1 3136 1591 96 0

7331 2 3 72 393 8 337 142 4 2500 659 2191 955 24 522 6 9143 275 4 39 303 438 3675 0

6 944 4 3214 498 263 5 137 5969 4227 6135 30 995 6 240 757 4 1014 2770 212 6 96 4 427 4097 5 14 45 55 3 72 195 1236 220 0

1 3214 7519 2 13 4052 2 498 14 6915 1 2 22 113 2674 8376 5 14 2503 5245 10 463 52 3029 466 1236 15 0

2 80 1 167 4 35 2613 2 65 10 559 5969 3631 1891 665 2 7 27 2 4227 6135 7 3 0

367 1960 3205 46 220 45 55 6 40 195 1 467 342 1298 7 325 9 35 1503 919 4 3193 6 8658 371 5 1156 35 1415 5 1 433 0

6 2 2 15 39 13 31 393 1364 0

64 277 1922 43 72 195 157 1451 2371 4 3214 718 106 5754 1306 0



In [11]:
# Preview the first ten lines of the encoded validation file.
with codecs.open("ptb.valid", "r", "utf-8") as f:
    for i, line in enumerate(f, start=1):
        # Each line keeps its trailing newline, so print() leaves a blank line.
        print(line)
        if i == 10:
            break

1132 93 358 5 329 51 9591 6 326 2490 5 1 661 384 0

2 2 2937 2195 9 382 1071 2334 89 99 843 198 2 11 1 3396 1126 7 3 72 20 211 346 36 258 2 2 0

75 421 195 3911 4 249 1805 2 579 3528 892 2420 6 3 296 11 2739 16 1185 2 250 0

8 1 35 9917 3715 463 710 2992 2039 3911 135 6051 11 494 5967 16 1 130 272 9 463 0

9959 731 503 30 640 6 35 6715 7 2 8 759 9958 26 6601 5 6333 1 6455 0

1414 3911 93 1553 2 22 1 503 8 2 1 361 0

29 383 99 9958 26 7367 10 3911 56 26 3277 9234 52 6 879 4 323 93 335 118 51 2 350 2 8 1335 2 0

64 578 58 508 6 580 4 103 7 641 747 1900 5 661 359 108 44 5327 5968 71 4 791 9959 41 7464 503 11 179 2195 1257 8 1805 9 579 1496 0

22 1 9805 4 1 759 47 144 171 1381 13 735 11 6 228 5 188 3911 45 9637 0

54 4 1 818 1123 1 2426 269 4 3 1621 13 791 9959 6 795 817 4 2187 140 1017 95 8 140 731 82 3078 570 0



In [12]:
# Preview the first ten lines of the encoded test file.
with codecs.open("ptb.test", "r", "utf-8") as f:
    for i, line in enumerate(f, start=1):
        # Each line keeps its trailing newline, so print() leaves a blank line.
        print(line)
        if i == 10:
            break

102 14 24 32 753 382 0

29 120 1 35 91 60 111 143 32 616 3205 282 19 1 447 458 437 196 1626 3 394 90 4 14 7 1 1113 1465 14 3163 1852 5 1335 39 1079 4 7223 0

57 2192 4914 3858 78 1 522 3 1034 777 51 74 898 278 117 2274 5 4102 1 399 3834 7 179 149 8 287 0

1 3 60 2579 365 16 1 129 146 1023 1 847 8 2888 4 69 3144 56 46 3043 78 1 3 1034 498 554 79 32 2432 1 399 842 0

129 145 248 2064 5 1204 52 5 1 5462 5 414 1 8748 1023 278 17 362 129 4233 4 60 278 117 0

773 399 4 738 82 886 9 3992 216 287 7 482 2 2808 149 3026 0

915 129 146 149 407 3037 3622 6320 3965 180 7839 1669 1982 8 687 5706 96 1898 77 8 574 5698 0

1 2 30 293 2362 0

1 620 47 24 2 0

498 554 1 2184 46 63 585 5 2432 1 6484 16 1 1023 4 1 35 91 60 111 15 2991 2 444 231 71 18 2 125 584 0



# PTB数据的batching方法

In [26]:
TRAIN_DATA = "ptb.train"   # encoded training file passed to read_data()
TRAIN_BATCH_SIZE = 20      # number of rows (parallel sequences) per batch
TRAIN_NUM_STEP = 35        # number of time steps (columns) per batch

def read_data(file_path):
    """Read a file of whitespace-separated integer ids into one flat list.

    Line boundaries are ignored: ids from all lines are concatenated in file
    order.

    Args:
        file_path: path of the text file to read.

    Returns:
        list of int, one entry per token in the file.
    """
    id_list = []
    with open(file_path, "r") as fin:
        for row in fin.readlines():
            # split() with no argument also discards the newline and padding.
            id_list.extend(int(token) for token in row.split())
    return id_list

def make_batches(id_list, batch_size, num_step):
    """Cut a flat id sequence into aligned (input, label) batches.

    The sequence is laid out as batch_size rows and sliced column-wise into
    windows of num_step ids. Labels are the same layout shifted one position
    to the right in the sequence, i.e. the next id after each input id.

    Args:
        id_list: flat list of integer word ids.
        batch_size: number of rows in every batch.
        num_step: number of columns (time steps) in every batch.

    Returns:
        A list of (data, label) tuples of numpy arrays, each of shape
        [batch_size, num_step]. The list is empty when id_list holds fewer
        than batch_size * num_step + 1 ids.

    Raises:
        ZeroDivisionError: if batch_size * num_step is 0.
    """
    ids_per_batch = batch_size * num_step
    # One id is held back because every input position needs a following label.
    num_batches = (len(id_list) - 1) // ids_per_batch
    if num_batches <= 0:
        # Not enough ids for a single batch. Without this guard the
        # reshape/np.split below failed with an unrelated-looking error.
        return []

    usable = num_batches * ids_per_batch
    row_len = num_batches * num_step

    # Arrange the inputs as [batch_size, num_batches * num_step], then slice
    # along the second axis into num_batches pieces.
    data = np.reshape(np.array(id_list[:usable]), [batch_size, row_len])
    data_batches = np.split(data, num_batches, axis=1)

    # Same layout, shifted by one id, gives the next-word labels.
    label = np.reshape(np.array(id_list[1:usable + 1]), [batch_size, row_len])
    label_batches = np.split(label, num_batches, axis=1)

    return list(zip(data_batches, label_batches))

In [27]:
# Load the encoded training ids and cut them into (input, label) batches of
# TRAIN_BATCH_SIZE rows by TRAIN_NUM_STEP columns.
train_batches = make_batches(read_data(TRAIN_DATA),
                             TRAIN_BATCH_SIZE, TRAIN_NUM_STEP)

In [30]:
# Inspect the first (input, label) batch pair.
train_batches[:1]

[(array([[9999, 9994, 9985, 9973, 9987, 9992, 9983, 9979, 9977, 9970, 9989,
          9986, 9991, 9974, 9993, 9997, 9988, 9972, 9980, 9975, 9996, 9981,
          9976, 9998,    0, 9273,    2,    3,   72,  393,   33, 2123,    1,
           146,   19],
         [  13, 1511,   18, 1441,    1,  846,  234,    1, 1380,    5, 1281,
             7, 1643, 1082, 3870,   17,  380, 1352,    4,  207,    0,    1,
          2610,    4,    1,  261,   13,    5,  335,    1,    2,   16,  764,
          1502,   10],
         [   1, 1122,  644,   46,   20,    2, 1060,   82, 1092,  473,    6,
          1916,    7,    2,    2,    8, 7588,   80,    6,    2, 2130,    7,
          1933,    0, 5717,   82, 9028,  559,  549,    2,   22, 8662,    8,
           537,    2],
         [ 387,   14,   93,   25, 1019,    5,  254,  170,   10,  207,    0,
            54, 1453, 1260,   22, 1661,   15,    1,  468,   42,   45,   55,
          1846,    1,   37,    9,  207,    4,  513,   12,    3,   48,    0,
            14,   5

# 嵌入矩阵

In [None]:
# Illustrative snippet: create an embedding matrix with one row per vocabulary
# word, then look up the rows for a tensor of word ids.
# NOTE(review): EMB_SIZE and input_data are not defined anywhere in the visible
# notebook, and VOCAB_SIZE is only assigned in a later cell, so this cell
# presumably cannot run as-is -- confirm.
embedding = tf.get_variable("embedding", [VOCAB_SIZE, EMB_SIZE])
"""
tf.nn.embedding_lookup函数的用法主要是选取一个张量里面索引对应的元素。
tf.nn.embedding_lookup（tensor, id）:tensor就是输入张量，id就是张量对应的索引，
"""
# (The string above is a note in Chinese: tf.nn.embedding_lookup selects the
# elements of a tensor at the given indices; its arguments are the input
# tensor and the indices to pick.)
input_embedding = tf.nn.embedding_lookup(embedding, input_data)

# 语言模型

In [17]:
TRAIN_DATA = "ptb.train"          # Path to the training data.
EVAL_DATA = "ptb.valid"           # Path to the validation data.
TEST_DATA = "ptb.test"            # Path to the test data.
HIDDEN_SIZE = 300                 # Size of the hidden layer.
NUM_LAYERS = 2                    # Number of stacked LSTM layers in the deep RNN.
VOCAB_SIZE = 10000                # Vocabulary size.
TRAIN_BATCH_SIZE = 20             # Batch size for training data.
TRAIN_NUM_STEP = 35               # Truncation length (time steps) for training data.

EVAL_BATCH_SIZE = 1               # Batch size for evaluation/test data.
EVAL_NUM_STEP = 1                 # Truncation length (time steps) for evaluation/test data.
NUM_EPOCH = 5                     # Number of passes over the training data.
LSTM_KEEP_PROB = 0.9              # Probability that an LSTM output is kept (not dropped out).
EMBEDDING_KEEP_PROB = 0.9         # Probability that a word-embedding value is kept (not dropped out).
MAX_GRAD_NORM = 5                 # Upper bound on the gradient norm, used to limit gradient explosion.
SHARE_EMB_AND_SOFTMAX = True      # Share parameters between the softmax layer and the embedding layer.

In [48]:
class PTBModel(object):
    """Graph for an LSTM language model over word ids.

    Input ids are embedded, fed through NUM_LAYERS stacked LSTM cells for
    num_steps time steps, and projected to VOCAB_SIZE logits. The summed
    cross entropy against the targets, divided by batch_size, is exposed as
    `cost`.

    Attributes set by __init__:
        batch_size, num_steps: the values passed in.
        input_data, targets: int32 placeholders of shape [batch_size, num_steps].
        initial_state: zero state of the stacked cell for batch_size rows.
        cost: scalar loss described above.
        final_state: cell state after the last time step.
        train_op: gradient-descent update op; only set when is_training is true.
    """

    def __init__(self, is_training, batch_size, num_steps):
        """Build the model graph.

        Args:
            is_training: when true, dropout is applied and the training op
                is created; when false, the graph stops after cost/final_state.
            batch_size: number of sequences per batch.
            num_steps: number of time steps the recurrent network is unrolled for.
        """
        # Remember the batch size and truncation length in use.
        self.batch_size = batch_size
        self.num_steps = num_steps
        
        # Inputs and expected outputs for every step; both are
        # [batch_size, num_steps].
        self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self.targets = tf.placeholder(tf.int32, [batch_size, num_steps])
        
        # Deep recurrent network built from LSTM cells wrapped with dropout.
        # Outside training the keep probability is 1.0, i.e. no dropout.
        dropout_keep_prob = LSTM_KEEP_PROB if is_training else 1.0
        lstm_cells = [
            tf.nn.rnn_cell.DropoutWrapper(
                tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE),
                output_keep_prob=dropout_keep_prob)
            for _ in range(NUM_LAYERS)]     
        cell = tf.nn.rnn_cell.MultiRNNCell(lstm_cells)            
        
        # Starting state, i.e. all-zero vectors. run_epoch() evaluates it once
        # per epoch for the first batch and afterwards feeds the carried state.
        self.initial_state = cell.zero_state(batch_size, tf.float32)

        # Word-embedding matrix. Its width is HIDDEN_SIZE so that it can also
        # serve, transposed, as the softmax weight below.
        embedding = tf.get_variable("embedding", [VOCAB_SIZE, HIDDEN_SIZE])
        
        # Turn input word ids into embedding vectors.
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)
        
        # Apply dropout to the embeddings only while training.
        if is_training:
            inputs = tf.nn.dropout(inputs, EMBEDDING_KEEP_PROB)
        
        # Collect the LSTM output of every time step first, then hand them to
        # the softmax layer together.
        outputs = []
        state = self.initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                # After the first step, reuse the cell variables instead of
                # creating new ones.
                if time_step > 0: tf.get_variable_scope().reuse_variables()
                cell_output, state = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output) 
        # Concatenate the per-step outputs to [batch, hidden_size * num_steps],
        # then reshape to [batch * num_steps, hidden_size].
        output = tf.reshape(tf.concat(outputs, 1), [-1, HIDDEN_SIZE])
        
        # Softmax layer: map the RNN output at every position to one logit per
        # vocabulary word.
        if SHARE_EMB_AND_SOFTMAX:
            # Reuse the transposed embedding matrix as the projection weight.
            weight = tf.transpose(embedding)
        else:
            weight = tf.get_variable("weight", [HIDDEN_SIZE, VOCAB_SIZE])
        bias = tf.get_variable("bias", [VOCAB_SIZE])
        logits = tf.matmul(output, weight) + bias
        
        # Cross-entropy loss per position; cost is their sum divided by
        # batch_size.
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=tf.reshape(self.targets, shape=[-1]),
            logits=logits)
        self.cost = tf.reduce_sum(loss) / batch_size
        self.final_state = state
        
        # Back-propagation ops are only defined for the training model.
        if not is_training: return
        
        # Clip gradients by global norm, then define the optimizer and the
        # training step.
        trainable_variables = tf.trainable_variables()
        
        grads, _ = tf.clip_by_global_norm(
                tf.gradients(self.cost, trainable_variables), MAX_GRAD_NORM)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
        self.train_op = optimizer.apply_gradients(zip(grads, trainable_variables))

In [49]:
def run_epoch(session, model, batches, train_op, output_log, step):
    """Run `model` over every batch once and report the perplexity.

    The recurrent state returned for one batch is fed in as the initial state
    of the next batch.

    Args:
        session: object whose run(fetches, feed_dict) evaluates the graph.
        model: object exposing initial_state, input_data, targets, cost,
            final_state and num_steps.
        batches: iterable of (inputs, targets) pairs.
        train_op: op evaluated together with the cost for every batch.
        output_log: when true, print the running perplexity every 100 steps.
        step: global step counter value to start from.

    Returns:
        (step, perplexity): the counter advanced by the number of batches,
        and exp(accumulated cost / accumulated time steps).
    """
    cost_sum = 0.0
    steps_seen = 0
    carried_state = session.run(model.initial_state)

    for inputs, targets in batches:
        feed = {
            model.input_data: inputs,
            model.targets: targets,
            model.initial_state: carried_state,
        }
        batch_cost, carried_state, _ = session.run(
            [model.cost, model.final_state, train_op], feed)
        cost_sum += batch_cost
        steps_seen += model.num_steps

        if output_log and step % 100 == 0:
            running_pplx = np.exp(cost_sum / steps_seen)
            print("After {} steps, perplexity is {}".format(step, running_pplx))

        step += 1

    return step, np.exp(cost_sum / steps_seen)

def read_data(file_path):
    """Read a file of whitespace-separated integer ids into one flat list.

    Line boundaries are ignored: ids from all lines are concatenated in file
    order.

    Args:
        file_path: path of the text file to read.

    Returns:
        list of int, one entry per token in the file.
    """
    id_list = []
    with open(file_path, "r") as fin:
        for row in fin.readlines():
            # split() with no argument also discards the newline and padding.
            id_list.extend(int(token) for token in row.split())
    return id_list

def make_batches(id_list, batch_size, num_step):
    """Cut a flat id sequence into aligned (input, label) batches.

    The sequence is laid out as batch_size rows and sliced column-wise into
    windows of num_step ids. Labels are the same layout shifted one position
    to the right in the sequence, i.e. the next id after each input id.

    Args:
        id_list: flat list of integer word ids.
        batch_size: number of rows in every batch.
        num_step: number of columns (time steps) in every batch.

    Returns:
        A list of (data, label) tuples of numpy arrays, each of shape
        [batch_size, num_step]. The list is empty when id_list holds fewer
        than batch_size * num_step + 1 ids.

    Raises:
        ZeroDivisionError: if batch_size * num_step is 0.
    """
    ids_per_batch = batch_size * num_step
    # One id is held back because every input position needs a following label.
    num_batches = (len(id_list) - 1) // ids_per_batch
    if num_batches <= 0:
        # Not enough ids for a single batch. Without this guard the
        # reshape/np.split below failed with an unrelated-looking error.
        return []

    usable = num_batches * ids_per_batch
    row_len = num_batches * num_step

    # Arrange the inputs as [batch_size, num_batches * num_step], then slice
    # along the second axis into num_batches pieces.
    data = np.reshape(np.array(id_list[:usable]), [batch_size, row_len])
    data_batches = np.split(data, num_batches, axis=1)

    # Same layout, shifted by one id, gives the next-word labels.
    label = np.reshape(np.array(id_list[1:usable + 1]), [batch_size, row_len])
    label_batches = np.split(label, num_batches, axis=1)

    return list(zip(data_batches, label_batches))

In [50]:
def main():
    """Build the train/eval models, train for NUM_EPOCH epochs and report perplexity.

    Two PTBModel instances are created in the same "language_model" variable
    scope (the second with reuse=True) so the evaluation model reads the
    variables of the training model. After every epoch the validation
    perplexity is printed; the test perplexity is printed once at the end.
    """
    initializer = tf.random_uniform_initializer(-0.05, 0.05)

    with tf.variable_scope("language_model", reuse=None, initializer=initializer):
        train_model = PTBModel(True, TRAIN_BATCH_SIZE, TRAIN_NUM_STEP)

    with tf.variable_scope("language_model", reuse=True, initializer=initializer):
        eval_model = PTBModel(False, EVAL_BATCH_SIZE, EVAL_NUM_STEP)

    # Create the do-nothing op used for evaluation once. Calling tf.no_op()
    # inside the epoch loop added a new node to the graph on every evaluation.
    eval_op = tf.no_op()

    # Train the model.
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        train_batches = make_batches(read_data(TRAIN_DATA), TRAIN_BATCH_SIZE, TRAIN_NUM_STEP)
        eval_batches = make_batches(read_data(EVAL_DATA), EVAL_BATCH_SIZE, EVAL_NUM_STEP)
        test_batches = make_batches(read_data(TEST_DATA), EVAL_BATCH_SIZE, EVAL_NUM_STEP)

        step = 0
        for i in range(NUM_EPOCH):
            print("In iteration: {}".format(i + 1))
            # The training step counter carries over from epoch to epoch.
            step, train_pplx = run_epoch(sess, train_model, train_batches,
                                         train_model.train_op, True, step)
            print("Epoch: {} Train Perplexity: {}".format(i + 1, train_pplx))

            # Evaluation does not log and does not advance the training step.
            _, eval_pplx = run_epoch(sess, eval_model, eval_batches,
                                     eval_op, False, 0)
            print("Epoch: {} Eval Perplexity: {}".format(i + 1, eval_pplx))

        _, test_pplx = run_epoch(sess, eval_model, test_batches,
                                 eval_op, False, 0)
        print("Test Perplexity: {}".format(test_pplx))

# Clear the default graph before building the model (presumably so that
# re-running this cell does not clash with variables from an earlier run),
# then build and train.
if __name__ == "__main__":
    tf.reset_default_graph()
    main()

In iteration: 1
After 0 steps, perplexity is 9986.092218903921


KeyboardInterrupt: 