## 参考https://zhuanlan.zhihu.com/p/27234078 通过LSTM实现文本生成

## 分成四个部分，分别是数据预处理、构建模型图、训练模型、生成文本

In [16]:
import tensorflow as tf
import numpy as np
import time
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
# Read the entire training corpus into memory as one string.
with open('./data/anna.txt', 'r') as f:
    text = f.read()
# The vocabulary is the set of distinct characters occurring in the corpus
# (the model works on characters, not words).
vocab = set(text)


## 这时候我才发现原来作者实现的是面向字符的模型，也就是以一个个字母为单位。这时候我就产生了一个疑问：如何保证生成的单词是正确的？当然，之后会尝试中文的，这也是比赛中 nn 的预处理要做的。


In [9]:
# Build the character <-> integer-id lookup tables.
# A set of strings has no stable iteration order across interpreter runs
# (string hashing is randomized per process), so enumerating `vocab` directly
# would give a different mapping in every session and make saved checkpoints
# decode to the wrong characters. Sorting makes the mapping reproducible.
sorted_vocab = sorted(vocab)
vocab_to_int = {c: i for i, c in enumerate(sorted_vocab)}
int_to_vocab = dict(enumerate(sorted_vocab))

# Encode the whole text as an array of integer character ids.
encoded = np.array([vocab_to_int[c] for c in text], dtype=np.int32)

## 接下来划分数据集，使用 mini-batch 的方法。RNN 的 mini-batch 和 DNN 的不太一样，因为 RNN 涉及到记忆，所以多了一个序列长度（sequence_length）。假设序列长度为 M，每个 batch 有 N 个序列，那么一个 batch 就有 N * M 个字符，总共可以划分出 count / (N * M) 个 batch（count 为文本的字符总数）。

In [47]:
def mini_batch(arr, n_seqs, n_size):
    """Yield (inputs, targets) mini-batches for next-character prediction.

    The array is truncated to a whole number of batches, laid out as
    ``n_seqs`` rows of consecutive characters, and cut into windows of
    ``n_size`` columns.

    arr: 1-D NumPy array of encoded characters to split.
    n_seqs: number of sequences (rows) per batch.
    n_size: number of steps (columns) per sequence.

    Yields pairs ``(x, y)`` of shape ``(n_seqs, n_size)``, where ``y[r, t]``
    is the character that follows ``x[r, t]`` in ``arr``. Nothing is yielded
    when ``arr`` is shorter than one full batch.
    """
    batch_size = n_seqs * n_size
    n_batch = len(arr) // batch_size
    if n_batch == 0:
        return

    usable = batch_size * n_batch
    inputs = arr[:usable]

    # The target of every position is simply the next character of the text.
    # np.roll returns a new array, so `arr` itself is never modified.
    targets = np.roll(inputs, -1)
    if usable < len(arr):
        # A real successor exists just past the truncation point; use it
        # instead of the wrapped-around first character.
        targets[-1] = arr[usable]

    inputs = inputs.reshape((n_seqs, -1))
    targets = targets.reshape((n_seqs, -1))

    for start in range(0, inputs.shape[1], n_size):
        stop = start + n_size
        yield inputs[:, start:stop], targets[:, start:stop]

In [67]:
# Input layer
def build_inputs(num_seqs, num_size):
    """Create the graph placeholders fed on every step.

    num_seqs: number of sequences per batch.
    num_size: number of steps per sequence.

    Returns the int32 `inputs` and `targets` placeholders, both of shape
    [num_seqs, num_size] (the shape of one mini_batch result), and the
    float32 `keep_prob` placeholder.
    """
    batch_shape = [num_seqs, num_size]
    inputs = tf.placeholder(tf.int32, shape=batch_shape, name='inputs')
    targets = tf.placeholder(tf.int32, shape=batch_shape, name='targets')

    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    return inputs, targets, keep_prob

In [68]:
# Build the LSTM layers
def build_lstm(lstm_size, num_layers, batch_size, keep_prob):
    """Build a stack of LSTM cells with dropout on each cell's output.

    lstm_size: number of units in each LSTM cell.
    num_layers: number of stacked LSTM layers.
    batch_size: batch size used to create the zero initial state.
    keep_prob: keep probability passed to each DropoutWrapper.

    Returns the stacked cell and its float32 zero initial state.
    """
    # One dropout-wrapped BasicLSTMCell per layer.
    layers = [
        tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.BasicLSTMCell(lstm_size),
            output_keep_prob=keep_prob,
        )
        for _ in range(num_layers)
    ]
    # Stack the layers into a single multi-layer cell.
    stacked = tf.contrib.rnn.MultiRNNCell(layers)
    return stacked, stacked.zero_state(batch_size, tf.float32)

In [69]:
# Build the output layer
def build_output(lstm_output, in_size, output_size):
    """Project the LSTM outputs onto the vocabulary and apply softmax.

    lstm_output: output of the LSTM layers.
    in_size: width the LSTM output is reshaped to (rows of `in_size` values).
    output_size: number of columns of the projection (number of classes).

    Returns (predictions, logits): the softmax of the logits, and the logits.
    """
    # Flatten to one row per (sequence, step) so a single matmul covers all.
    flat = tf.reshape(tf.concat(lstm_output, axis=1), [-1, in_size])

    with tf.variable_scope("softmax"):
        weights = tf.Variable(tf.truncated_normal([in_size, output_size], stddev=0.1))
        bias = tf.Variable(tf.zeros(output_size))

    logits = tf.matmul(flat, weights) + bias
    predictions = tf.nn.softmax(logits, name='predictions')
    return predictions, logits

In [70]:
def build_loss(logits, targets, num_classes):
    """Return the mean softmax cross-entropy between logits and targets.

    logits: unnormalized class scores.
    targets: integer class ids; one-hot encoded and reshaped to the static
        shape of `logits` before the loss is computed.
    num_classes: depth of the one-hot encoding.
    """
    one_hot_targets = tf.reshape(tf.one_hot(targets, num_classes), logits.get_shape())
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=one_hot_targets)
    return tf.reduce_mean(cross_entropy)

In [71]:


def build_optimizer(loss, learning_rate, grad_clip):
    '''
    Build the training op: Adam applied to globally clipped gradients.

    loss: loss tensor to minimize
    learning_rate: learning rate passed to AdamOptimizer
    grad_clip: clip norm passed to tf.clip_by_global_norm

    Returns the op that applies the clipped gradients.
    '''
    trainable = tf.trainable_variables()
    raw_grads = tf.gradients(loss, trainable)
    # Clip by global norm before the gradients are applied.
    clipped_grads, _ = tf.clip_by_global_norm(raw_grads, grad_clip)
    adam = tf.train.AdamOptimizer(learning_rate)
    return adam.apply_gradients(zip(clipped_grads, trainable))

In [72]:
class CharRNN:
    """Character-level stacked-LSTM language model (TensorFlow 1.x graph).

    Constructing an instance resets the default graph and builds a new one.

    num_classes: number of distinct characters (one-hot depth / output size).
    batch_size: number of sequences per batch.
    num_steps: number of steps per sequence.
    lstm_size: number of units in each LSTM layer.
    num_layers: number of stacked LSTM layers.
    learning_rate: learning rate for the optimizer.
    grad_clip: clip norm for gradient clipping.
    sampling: if True, build the graph for one character at a time
        (batch_size and num_steps are both forced to 1).

    Attributes: inputs, targets, keep_prob (placeholders), initial_state,
    final_state, prediction, logits, loss, optimizer.
    """

    def __init__(self, num_classes, batch_size=64, num_steps=50, lstm_size=128, num_layers=2,
                learning_rate=0.01, grad_clip=5, sampling=False):
        # When sampling, characters are fed one by one, so the graph is built
        # for a single sequence of a single step.
        if sampling:
            batch_size, num_steps = 1, 1

        tf.reset_default_graph()
        # Input layer
        self.inputs, self.targets, self.keep_prob = build_inputs(batch_size, num_steps)
        # LSTM layers
        cell, self.initial_state = build_lstm(lstm_size, num_layers, batch_size, self.keep_prob)
        # One-hot encode the character ids before feeding them to the LSTM
        x_one_hot = tf.one_hot(self.inputs, num_classes)

        outputs, state = tf.nn.dynamic_rnn(cell, x_one_hot, initial_state=self.initial_state)
        self.final_state = state
        # Output layer
        self.prediction, self.logits = build_output(outputs, lstm_size, num_classes)
        # Backward-compatible alias: the attribute used to be misspelled.
        self.prediciton = self.prediction

        self.loss = build_loss(self.logits, self.targets, num_classes)
        self.optimizer = build_optimizer(self.loss, learning_rate, grad_clip)

        

In [77]:
# Training hyper-parameters.
# Number of sequences per batch.
batch_size = 100
# Number of steps (characters) per sequence.
num_steps = 100
# Number of units in each LSTM layer.
lstm_size = 512
# Number of stacked LSTM layers.
num_layers = 2
# Learning rate for the optimizer.
learning_rate = 0.001 
# Value fed to the model's keep_prob placeholder during training.
keep_prob = 0.5 

# Number of passes over the encoded text.
epochs = 20
# A checkpoint is written every `save_every_n` training steps.
save_every_n = 200
# Build the training graph (grad_clip and sampling keep their defaults).
model = CharRNN(len(vocab), batch_size=batch_size, num_steps=num_steps,
                lstm_size=lstm_size, num_layers=num_layers, 
                learning_rate=learning_rate)

In [78]:
# Train the model, printing progress every 100 steps and writing a checkpoint
# every `save_every_n` steps plus once more when training finishes.
saver = tf.train.Saver(max_to_keep=100)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    counter = 0
    for epoch in range(epochs):
        # Start every epoch from the zero LSTM state.
        new_state = sess.run(model.initial_state)
        for x, y in mini_batch(encoded, batch_size, num_steps):
            counter += 1
            start = time.time()
            # Feed the previous batch's final state back in as the initial
            # state so the state carries across consecutive batches.
            feed = {model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state
                   }
            batch_loss, new_state, _ = sess.run([model.loss, model.final_state, model.optimizer], feed_dict=feed)
            end = time.time()

            if counter % 100 == 0:
                # `epoch` is the loop variable (the original referenced an
                # undefined name `e` here).
                print('轮数: {}/{}... '.format(epoch+1, epochs),
                      '训练步数: {}... '.format(counter),
                      '训练误差: {:.4f}... '.format(batch_loss),
                      '{:.4f} sec/batch'.format((end-start)))

            if (counter % save_every_n == 0):
                saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))

    saver.save(sess, "checkpoints/i{}_l{}.ckpt".format(counter, lstm_size))

KeyboardInterrupt: 

In [79]:
def pick_top_k(preds, vocab_size, top_n=5):
    """Sample a character id from the `top_n` most probable predictions.

    preds: array of prediction probabilities; squeezed to 1-D before use.
        It is not modified.
    vocab_size: number of classes to sample from; must equal the length of
        the squeezed `preds`.
    top_n: number of highest-probability candidates to keep.

    Returns the sampled class id.
    """
    # np.squeeze returns a view, so work on a copy to leave the caller's
    # array untouched when the low-probability entries are zeroed.
    p = np.squeeze(preds).astype(np.float64)
    # Zero everything except the top_n largest probabilities.
    p[np.argsort(p)[:-top_n]] = 0
    # Renormalize so the remaining probabilities sum to 1.
    p = p / np.sum(p)
    c = np.random.choice(vocab_size, 1, p=p)[0]
    return c

In [None]:
def sample(checkpoint, n_samples, lstm_size, vocab_size, prime="The "):
    """
    Generate new text from a trained checkpoint.

    checkpoint: path of the checkpoint file to restore
    n_samples: number of sampling steps run after the first generated
        character; the result holds len(prime) + n_samples + 1 characters
    lstm_size: number of units per LSTM layer (must match the checkpoint)
    vocab_size: number of distinct characters the model was built with
    prime: non-empty starting text used to warm up the LSTM state

    Relies on the module-level `vocab_to_int` / `int_to_vocab` tables.
    Raises ValueError if `prime` is empty.
    """
    if not prime:
        raise ValueError("prime must contain at least one character")

    samples = list(prime)
    # sampling=True builds the graph for a 1 x 1 batch (one character a step).
    model = CharRNN(vocab_size, lstm_size=lstm_size, sampling=True)
    # CharRNN historically exposed the softmax output under the misspelled
    # attribute `prediciton`; accept either spelling.
    prediction_op = getattr(model, 'prediction', None)
    if prediction_op is None:
        prediction_op = model.prediciton

    saver = tf.train.Saver()
    with tf.Session() as sess:
        # Load the trained parameters.
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)
        x = np.zeros((1, 1), dtype=np.int32)

        def run_step(char_id, state):
            # Feed a single character and return (predictions, next state).
            x[0, 0] = char_id
            feed = {model.inputs: x,
                    model.keep_prob: 1.,
                    model.initial_state: state}
            return sess.run([prediction_op, model.final_state], feed_dict=feed)

        # Warm up the state on the prime text; only the prediction after its
        # last character is used.
        for ch in prime:
            preds, new_state = run_step(vocab_to_int[ch], new_state)

        c = pick_top_k(preds, vocab_size)
        samples.append(int_to_vocab[c])

        # Keep feeding the last generated character back in.
        for _ in range(n_samples):
            preds, new_state = run_step(c, new_state)
            c = pick_top_k(preds, vocab_size)
            samples.append(int_to_vocab[c])

    return ''.join(samples)