In [13]:
import os
import re
import string
import requests
import numpy as np
import collections
import random
import pickle
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.framework import ops
ops.reset_default_graph()

In [14]:
sess = tf.Session()

# RNNのパラメータを設定
num_layers = 3 # RNNの層の数
min_word_freq = 5 # 出現頻度がこの値以下の単語を除外
rnn_size = 128 # RNNモデルのサイズ（埋め込みサイズに等しい）
epochs = 10 # データを処理する回数
batch_size = 100 # 一度にトレーニングするサンプルの数
learning_rate = 0.001 # 学習率（収束パラメータ）
training_seq_len = 50 # 前後（左右）の単語の数（左右に25単語ずつ）
embedding_size = rnn_size # 埋め込みサイズ（rnn_sizeに等しい）
save_every = 500 # モデルを保存する頻度（500回おき）
eval_every = 50 # テスト文を評価する頻度
# テストのリスト
prime_texts = ['thou art more', 'to be or not to', 'wherefore art thou']

In [15]:
# シェイクスピアのテキストデータをダウンロードして格納
data_dir = 'temp'
data_file = 'shakespeare.txt'
model_path = 'shakespeare_model'
full_model_dir = os.path.join(data_dir, model_path)

# ハイフンとアポストロフィ以外の句読点を削除
punctuation = string.punctuation
punctuation = ''.join([x for x in punctuation if x not in ['-', "'"]])

In [16]:
# モデルフォルダを作成
if not os.path.exists(full_model_dir):
    os.makedirs(full_model_dir)

# データフォルダを作成
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    
print('Loading Shakespeare Data')
# ファイルがダウンロードされているかどうかを確認
if not os.path.isfile(os.path.join(data_dir, data_file)):
    print('Not found, downloading Shakespeare texts from www.gutenberg.org')
    shakespeare_url = 'http://www.gutenberg.org/cache/epub/100/pg100.txt'
    # シェイクスピアのテキストデータを取得
    response = requests.get(shakespeare_url)
    shakespeare_file = response.content
    # バイナリデータを文字列にデコード
    s_text = shakespeare_file.decode('utf-8')
    # 最初の部分に含まれている説明用の段落を削除
    s_text = s_text[7675:]
    # 改行を削除
    s_text = s_text.replace('\r\n', '')
    s_text = s_text.replace('\n', '')
    
    # ファイルに保存
    with open(os.path.join(data_dir, data_file), 'w') as out_conn:
        out_conn.write(s_text)
else:
    # ファイルがすでに保存されている場合は、そのファイルからデータを読み込む
    with open(os.path.join(data_dir, data_file), 'r') as file_conn:
        s_text = file_conn.read().replace('\n', '')

Loading Shakespeare Data


In [17]:
s_text = re.sub(r'[{}]'.format(punctuation), ' ', s_text)
s_text = re.sub('\s+', ' ', s_text ).strip().lower()

# 文字ごとに分割
char_list = list(s_text)

In [18]:
def build_vocab(characters):
    character_counts = collections.Counter(characters)
    # 文字からインデックスへのマッピングを作成
    chars = character_counts.keys()
    vocab_to_ix_dict = {key:(ix+1) for ix, key in enumerate(chars)}
    # 不明なキーのインデックスとして0を追加
    vocab_to_ix_dict['unknown']=0
    # インデックスから語彙へのマッピングを作成
    ix_to_vocab_dict = {val:key for key,val in vocab_to_ix_dict.items()}
    
    return(ix_to_vocab_dict, vocab_to_ix_dict)

# シェイクスピアの語彙を作成
ix2vocab, vocab2ix = build_vocab(char_list)
vocab_size = len(ix2vocab)

In [19]:
# テキストを単語ベクトルに変換
s_text_ix = []
for x in char_list:
    try:
        s_text_ix.append(vocab2ix[x])
    except:
        s_text_ix.append(0)
s_text_ix = np.array(s_text_ix)

In [20]:
# LSTM RNN モデルを作成
class LSTM_Model():
    # モデルのすべての変数と演算を定義
    def __init__(self, embedding_size, rnn_size, num_layers, batch_size, learning_rate,
                 training_seq_len, vocab_size, infer_sample=False):
        self.embedding_size = embedding_size
        self.rnn_size = rnn_size
        self.num_layers = num_layers
        self.vocab_size = vocab_size
        self.infer_sample = infer_sample
        self.learning_rate = learning_rate
        
        if infer_sample:
            self.batch_size = 1
            self.training_seq_len = 1
        else:
            self.batch_size = batch_size
            self.training_seq_len = training_seq_len
        
        self.lstm_cell = tf.contrib.rnn.BasicLSTMCell(self.rnn_size)
        self.lstm_cell = tf.contrib.rnn.MultiRNNCell([self.lstm_cell for _ in range(self.num_layers)])
        self.initial_state = self.lstm_cell.zero_state(self.batch_size, tf.float32)
        
        self.x_data = tf.placeholder(tf.int32, [self.batch_size, self.training_seq_len])
        self.y_output = tf.placeholder(tf.int32, [self.batch_size, self.training_seq_len])
        
        with tf.variable_scope('lstm_vars'):
            # ソフトマックスの出力の重み
            W = tf.get_variable('W', [self.rnn_size, self.vocab_size], tf.float32, tf.random_normal_initializer())
            b = tf.get_variable('b', [self.vocab_size], tf.float32, tf.constant_initializer(0.0))
        
            # 埋め込みを定義
            embedding_mat = tf.get_variable('embedding_mat', [self.vocab_size, self.embedding_size],
                                            tf.float32, tf.random_normal_initializer())
                                            
            embedding_output = tf.nn.embedding_lookup(embedding_mat, self.x_data)
            rnn_inputs = tf.split(axis=1, num_or_size_splits=self.training_seq_len, value=embedding_output)
            rnn_inputs_trimmed = [tf.squeeze(x, [1]) for x in rnn_inputs]
                
        decoder = tf.contrib.legacy_seq2seq.rnn_decoder
        outputs, last_state = decoder(rnn_inputs_trimmed,
                                      self.initial_state,
                                      self.lstm_cell)
        # 推測されていない出力
        output = tf.reshape(tf.concat(axis=1, values=outputs), [-1, self.rnn_size])
        # ロジットと出力
        self.logit_output = tf.matmul(output, W) + b
        self.model_output = tf.nn.softmax(self.logit_output)
        
        loss_fun = tf.contrib.legacy_seq2seq.sequence_loss_by_example
        loss = loss_fun([self.logit_output],[tf.reshape(self.y_output, [-1])],
                [tf.ones([self.batch_size * self.training_seq_len])],
                self.vocab_size)
        self.cost = tf.reduce_sum(loss) / (self.batch_size * self.training_seq_len)
        self.final_state = last_state
        gradients, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tf.trainable_variables()), 4.5)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.train_op = optimizer.apply_gradients(zip(gradients, tf.trainable_variables()))
        
    # サンプル（単語）をループ処理してテキストを生成
    def sample(self, sess, words=ix2vocab, vocab=vocab2ix, num=20, prime_text='thou art'):
        state = sess.run(self.lstm_cell.zero_state(1, tf.float32))
        char_list = list(prime_text)
        for char in char_list[:-1]:
            x = np.zeros((1, 1))
            x[0, 0] = vocab[char]
            feed_dict = {self.x_data: x, self.initial_state:state}
            [state] = sess.run([self.final_state], feed_dict=feed_dict)

        out_sentence = prime_text
        char = char_list[-1]
        for n in range(num):
            x = np.zeros((1, 1))
            x[0, 0] = vocab[char]
            feed_dict = {self.x_data: x, self.initial_state:state}
            [model_output, state] = sess.run([self.model_output, self.final_state], feed_dict=feed_dict)
            sample = np.argmax(model_output[0])
            if sample == 0:
                break
            char = words[sample]
            out_sentence = out_sentence + char
        return(out_sentence)

In [21]:
# LSTM モデルを定義
lstm_model = LSTM_Model(embedding_size, rnn_size, num_layers, batch_size, learning_rate,
                        training_seq_len, vocab_size)

# テストモデル定義。このスコープをテストに再利用する
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    test_lstm_model = LSTM_Model(embedding_size, rnn_size, num_layers, batch_size, learning_rate,
                                 training_seq_len, vocab_size, infer_sample=True)

In [22]:
# モデルの保存関数を定義
saver = tf.train.Saver(tf.global_variables())

# エポックごとにバッチを作成
num_batches = int(len(s_text_ix)/(batch_size * training_seq_len)) + 1
# テキストインデックスを同じサイズの部分配列に分割
batches = np.array_split(s_text_ix, num_batches)
# それらの部分配列の形状を [batch_size, training_seq_len] に変更
batches = [np.resize(x, [batch_size, training_seq_len]) for x in batches]

# すべての変数を初期化
init = tf.global_variables_initializer()
sess.run(init)

In [None]:
train_loss = []
iteration_count = 1
for epoch in range(epochs):
    # 単語インデックスをシャッフル
    random.shuffle(batches)
    # シャッフルしたバッチから目的値を作成
    targets = [np.roll(x, -1, axis=1) for x in batches]
    # エポックの実行を開始
    print('Starting Epoch #{} of {}.'.format(epoch+1, epochs))
    # LSTMの初期状態をエポックごとにリセット
    state = sess.run(lstm_model.initial_state)
    for ix, batch in enumerate(batches):
        training_dict = {lstm_model.x_data: batch, lstm_model.y_output: targets[ix]}
        # 各RNNセルの初期状態を更新する
        for i, (c, h) in enumerate(lstm_model.initial_state):
                    training_dict[c] = state[i].c
                    training_dict[h] = state[i].h
        
        temp_loss, state, _ = sess.run([lstm_model.cost, lstm_model.final_state, lstm_model.train_op],
                                       feed_dict=training_dict)
        train_loss.append(temp_loss)
        
        # 10回おきにステータスを出力
        if iteration_count % 10 == 0:
            summary_nums = (iteration_count, epoch+1, ix+1, num_batches+1, temp_loss)
            print('Iteration: {}, Epoch: {}, Batch: {} out of {}, Loss: {:.2f}'.format(*summary_nums))
        
        # モデルと語彙を保存
        if iteration_count % save_every == 0:
            # モデルを保存
            model_file_name = os.path.join(full_model_dir, 'model')
            saver.save(sess, model_file_name, global_step = iteration_count)
            print('Model Saved To: {}'.format(model_file_name))
            # 語彙を保存
            dictionary_file = os.path.join(full_model_dir, 'vocab.pkl')
            with open(dictionary_file, 'wb') as dict_file_conn:
                pickle.dump([vocab2ix, ix2vocab], dict_file_conn)
        
        if iteration_count % eval_every == 0:
            for sample in prime_texts:
                print(test_lstm_model.sample(sess, ix2vocab, vocab2ix, num=10, prime_text=sample))
                
        iteration_count += 1

Starting Epoch #1 of 10.
Iteration: 10, Epoch: 1, Batch: 10 out of 950, Loss: 3.00
Iteration: 20, Epoch: 1, Batch: 20 out of 950, Loss: 2.88
Iteration: 30, Epoch: 1, Batch: 30 out of 950, Loss: 2.87
Iteration: 40, Epoch: 1, Batch: 40 out of 950, Loss: 2.86
Iteration: 50, Epoch: 1, Batch: 50 out of 950, Loss: 2.82
thou art more  o oo  oo
to be or not to  ee e ae 
wherefore art thou   oo o o 
Iteration: 60, Epoch: 1, Batch: 60 out of 950, Loss: 2.79
Iteration: 70, Epoch: 1, Batch: 70 out of 950, Loss: 2.82
Iteration: 80, Epoch: 1, Batch: 80 out of 950, Loss: 2.75
Iteration: 90, Epoch: 1, Batch: 90 out of 950, Loss: 2.71
Iteration: 100, Epoch: 1, Batch: 100 out of 950, Loss: 2.64
thou art more o oo the 
to be or not to he oe to 
wherefore art thou it ho o o
Iteration: 110, Epoch: 1, Batch: 110 out of 950, Loss: 2.60
Iteration: 120, Epoch: 1, Batch: 120 out of 950, Loss: 2.50
Iteration: 130, Epoch: 1, Batch: 130 out of 950, Loss: 2.50
Iteration: 140, Epoch: 1, Batch: 140 out of 950, Loss: 

Iteration: 1080, Epoch: 2, Batch: 131 out of 950, Loss: 1.86
Iteration: 1090, Epoch: 2, Batch: 141 out of 950, Loss: 1.84
Iteration: 1100, Epoch: 2, Batch: 151 out of 950, Loss: 1.91
thou art more the so th
to be or not to the so th
wherefore art thou shall the
Iteration: 1110, Epoch: 2, Batch: 161 out of 950, Loss: 1.80
Iteration: 1120, Epoch: 2, Batch: 171 out of 950, Loss: 1.83
Iteration: 1130, Epoch: 2, Batch: 181 out of 950, Loss: 1.81
Iteration: 1140, Epoch: 2, Batch: 191 out of 950, Loss: 1.82
Iteration: 1150, Epoch: 2, Batch: 201 out of 950, Loss: 1.86
thou art more and the c
to be or not to the come 
wherefore art thou shall the
Iteration: 1160, Epoch: 2, Batch: 211 out of 950, Loss: 1.88
Iteration: 1170, Epoch: 2, Batch: 221 out of 950, Loss: 1.80
Iteration: 1180, Epoch: 2, Batch: 231 out of 950, Loss: 1.81
Iteration: 1190, Epoch: 2, Batch: 241 out of 950, Loss: 1.77
Iteration: 1200, Epoch: 2, Batch: 251 out of 950, Loss: 1.85
thou art more the see t
to be or not to the see t

Iteration: 2140, Epoch: 3, Batch: 242 out of 950, Loss: 1.75
Iteration: 2150, Epoch: 3, Batch: 252 out of 950, Loss: 1.65
thou art more the see t
to be or not to the see t
wherefore art thou are the s
Iteration: 2160, Epoch: 3, Batch: 262 out of 950, Loss: 1.69
Iteration: 2170, Epoch: 3, Batch: 272 out of 950, Loss: 1.65
Iteration: 2180, Epoch: 3, Batch: 282 out of 950, Loss: 1.73
Iteration: 2190, Epoch: 3, Batch: 292 out of 950, Loss: 1.81
Iteration: 2200, Epoch: 3, Batch: 302 out of 950, Loss: 1.78
thou art more the will 
to be or not to the will 
wherefore art thou art the w
Iteration: 2210, Epoch: 3, Batch: 312 out of 950, Loss: 1.71
Iteration: 2220, Epoch: 3, Batch: 322 out of 950, Loss: 1.77
Iteration: 2230, Epoch: 3, Batch: 332 out of 950, Loss: 1.73
Iteration: 2240, Epoch: 3, Batch: 342 out of 950, Loss: 1.75
Iteration: 2250, Epoch: 3, Batch: 352 out of 950, Loss: 1.72
thou art more the see t
to be or not to the see t
wherefore art thou art the s
Iteration: 2260, Epoch: 3, Batc

Iteration: 3200, Epoch: 4, Batch: 353 out of 950, Loss: 1.63
thou art more the world
to be or not to the world
wherefore art thou art the w
Iteration: 3210, Epoch: 4, Batch: 363 out of 950, Loss: 1.64
Iteration: 3220, Epoch: 4, Batch: 373 out of 950, Loss: 1.55
Iteration: 3230, Epoch: 4, Batch: 383 out of 950, Loss: 1.66
Iteration: 3240, Epoch: 4, Batch: 393 out of 950, Loss: 1.55
Iteration: 3250, Epoch: 4, Batch: 403 out of 950, Loss: 1.65
thou art more the seek 
to be or not to the seek 
wherefore art thou have he h
Iteration: 3260, Epoch: 4, Batch: 413 out of 950, Loss: 1.61
Iteration: 3270, Epoch: 4, Batch: 423 out of 950, Loss: 1.59
Iteration: 3280, Epoch: 4, Batch: 433 out of 950, Loss: 1.67
Iteration: 3290, Epoch: 4, Batch: 443 out of 950, Loss: 1.67
Iteration: 3300, Epoch: 4, Batch: 453 out of 950, Loss: 1.64
thou art more the sent 
to be or not to the sent 
wherefore art thou shall be 
Iteration: 3310, Epoch: 4, Batch: 463 out of 950, Loss: 1.60
Iteration: 3320, Epoch: 4, Batc

thou art more that i sh
to be or not to the some 
wherefore art thou had the s
Iteration: 4260, Epoch: 5, Batch: 464 out of 950, Loss: 1.61
Iteration: 4270, Epoch: 5, Batch: 474 out of 950, Loss: 1.61
Iteration: 4280, Epoch: 5, Batch: 484 out of 950, Loss: 1.69
Iteration: 4290, Epoch: 5, Batch: 494 out of 950, Loss: 1.63
Iteration: 4300, Epoch: 5, Batch: 504 out of 950, Loss: 1.70
thou art more the compl
to be or not to the serva
wherefore art thou shall be 
Iteration: 4310, Epoch: 5, Batch: 514 out of 950, Loss: 1.62
Iteration: 4320, Epoch: 5, Batch: 524 out of 950, Loss: 1.59
Iteration: 4330, Epoch: 5, Batch: 534 out of 950, Loss: 1.66
Iteration: 4340, Epoch: 5, Batch: 544 out of 950, Loss: 1.60
Iteration: 4350, Epoch: 5, Batch: 554 out of 950, Loss: 1.51
thou art more that i wi
to be or not to marry the
wherefore art thou art the s
Iteration: 4360, Epoch: 5, Batch: 564 out of 950, Loss: 1.58
Iteration: 4370, Epoch: 5, Batch: 574 out of 950, Loss: 1.63
Iteration: 4380, Epoch: 5, Batc

wherefore art thou shall be 
Iteration: 5310, Epoch: 6, Batch: 565 out of 950, Loss: 1.53
Iteration: 5320, Epoch: 6, Batch: 575 out of 950, Loss: 1.56
Iteration: 5330, Epoch: 6, Batch: 585 out of 950, Loss: 1.59
Iteration: 5340, Epoch: 6, Batch: 595 out of 950, Loss: 1.49
Iteration: 5350, Epoch: 6, Batch: 605 out of 950, Loss: 1.53
thou art more the sent 
to be or not to the sent 
wherefore art thou shall be 
Iteration: 5360, Epoch: 6, Batch: 615 out of 950, Loss: 1.61
Iteration: 5370, Epoch: 6, Batch: 625 out of 950, Loss: 1.59
Iteration: 5380, Epoch: 6, Batch: 635 out of 950, Loss: 1.54
Iteration: 5390, Epoch: 6, Batch: 645 out of 950, Loss: 1.54
Iteration: 5400, Epoch: 6, Batch: 655 out of 950, Loss: 1.68
thou art more that the 
to be or not to the stran
wherefore art thou shall be 
Iteration: 5410, Epoch: 6, Batch: 665 out of 950, Loss: 1.52
Iteration: 5420, Epoch: 6, Batch: 675 out of 950, Loss: 1.60
Iteration: 5430, Epoch: 6, Batch: 685 out of 950, Loss: 1.61
Iteration: 5440, Epo

Iteration: 6360, Epoch: 7, Batch: 666 out of 950, Loss: 1.67
Iteration: 6370, Epoch: 7, Batch: 676 out of 950, Loss: 1.59
Iteration: 6380, Epoch: 7, Batch: 686 out of 950, Loss: 1.63
Iteration: 6390, Epoch: 7, Batch: 696 out of 950, Loss: 1.50
Iteration: 6400, Epoch: 7, Batch: 706 out of 950, Loss: 1.59
thou art more than the 
to be or not to the stran
wherefore art thou shalt tho
Iteration: 6410, Epoch: 7, Batch: 716 out of 950, Loss: 1.42
Iteration: 6420, Epoch: 7, Batch: 726 out of 950, Loss: 1.43
Iteration: 6430, Epoch: 7, Batch: 736 out of 950, Loss: 1.57
Iteration: 6440, Epoch: 7, Batch: 746 out of 950, Loss: 1.56
Iteration: 6450, Epoch: 7, Batch: 756 out of 950, Loss: 1.61
thou art more that i wi
to be or not to the stree
wherefore art thou shalt the
Iteration: 6460, Epoch: 7, Batch: 766 out of 950, Loss: 1.53
Iteration: 6470, Epoch: 7, Batch: 776 out of 950, Loss: 1.48
Iteration: 6480, Epoch: 7, Batch: 786 out of 950, Loss: 1.56
Iteration: 6490, Epoch: 7, Batch: 796 out of 950,

Iteration: 7420, Epoch: 8, Batch: 777 out of 950, Loss: 1.47
Iteration: 7430, Epoch: 8, Batch: 787 out of 950, Loss: 1.55
Iteration: 7440, Epoch: 8, Batch: 797 out of 950, Loss: 1.54
Iteration: 7450, Epoch: 8, Batch: 807 out of 950, Loss: 1.52
thou art more the world
to be or not to the world
wherefore art thou hast the 
Iteration: 7460, Epoch: 8, Batch: 817 out of 950, Loss: 1.58
Iteration: 7470, Epoch: 8, Batch: 827 out of 950, Loss: 1.49
Iteration: 7480, Epoch: 8, Batch: 837 out of 950, Loss: 1.53
Iteration: 7490, Epoch: 8, Batch: 847 out of 950, Loss: 1.47
Iteration: 7500, Epoch: 8, Batch: 857 out of 950, Loss: 1.46
Model Saved To: temp/shakespeare_model/model
thou art more than the 
to be or not to the compl
wherefore art thou hast thou
Iteration: 7510, Epoch: 8, Batch: 867 out of 950, Loss: 1.51
Iteration: 7520, Epoch: 8, Batch: 877 out of 950, Loss: 1.61
Iteration: 7530, Epoch: 8, Batch: 887 out of 950, Loss: 1.54
Iteration: 7540, Epoch: 8, Batch: 897 out of 950, Loss: 1.46
Iter

Iteration: 8480, Epoch: 9, Batch: 888 out of 950, Loss: 1.43
Iteration: 8490, Epoch: 9, Batch: 898 out of 950, Loss: 1.39
Iteration: 8500, Epoch: 9, Batch: 908 out of 950, Loss: 1.53


In [None]:
plt.plot(train_loss, 'k-')
plt.title('Sequence to Sequence Loss')
plt.xlabel('Generation')
plt.ylabel('Loss')
plt.show()