In [1]:
from keras.layers import Dense,Activation,SimpleRNN
from keras.models import Sequential
import numpy as np
import codecs


Using TensorFlow backend.


In [3]:
# データは

# ファイルの読み込み
INPUT_FILE = "./data/alice_in_wonderland.txt"
with codecs.open(INPUT_FILE,"r",encoding="utf-8") as f:
    lines = []
    # すべての行について空の行でなければ
    # その行の端の空白を削除して小文字にしたものをlinesリストに追加
    for line in f:
        if len(line)!=0:
            lines.append(line.strip().lower())
text = " ".join(lines)

In [5]:
# textに出てくるすべての文字を集合で取得
chars = set(text)
nb_chars = len(chars)

# すべての文字列について索引を作成する
char2index = dict()
index2char = dict()
for i,c in enumerate(chars):
    char2index[c] = i
    index2char[i] = c
    


In [9]:
# 試しに文字からidを取得してみる
id_c = char2index["c"]
print(id_c)
#本当に正しいIDか確認
char_c = index2char[id_c]
print(char_c)

55
c


In [14]:
SEQLEN = 10
STEP   = 1

input_chars = []
label_chars = []
for i in range(0,len(text) - SEQLEN , STEP):
    input_chars.append(text[i:i+SEQLEN])  # i~i+SEQLEN-1番目のSEQLENの文字数テキストを取得
    label_chars.append(text[i + SEQLEN])  # SEQLEN番目のi文字のテキストを取得

In [15]:
# 入力、出力データをone-hotラベル化する

# まずは０で初期化した行列を作成する
X = np.zeros((len(input_chars),SEQLEN,nb_chars),dtype=np.bool)
y = np.zeros((len(input_chars),nb_chars),dtype=np.bool)

# 入力すべてのデータについて文字からidを取得しその列の値だけ1にする
for i , input_char in enumerate(input_chars):
    for j,ch in enumerate(input_char):
        X[i,j,char2index[ch]] = 1
    # その時の正解データの文字を取得しそこからidを取得しその列の値を1にする
    label_char = label_chars[i]
    y[i,char2index[label_char]] = 1

In [16]:
HIDDEN_SIZE = 128
BATCH_SIZE = 128
NUM_ITERATIONS = 25
NUM_EPOCHS_PER_ITERATION = 1
NUM_PREDS_PER_EPOCH = 100    

In [17]:
model = Sequential()
model.add(SimpleRNN(
    HIDDEN_SIZE,
    return_sequences=False,
    input_shape=(SEQLEN,nb_chars),
    unroll=True
))
model.add(Dense(nb_chars))
model.add(Activation("softmax"))
model.compile(loss="categorical_crossentropy",optimizer="rmsprop")

In [18]:
for iteration in range(NUM_ITERATIONS):
    print("=" * 50)
    print("Iteration #: {}".format(iteration))
    model.fit(X,y,batch_size=BATCH_SIZE,epochs=NUM_EPOCHS_PER_ITERATION)
    
    test_idx = np.random.randint(len(input_chars))
    test_chars = input_chars[test_idx]
    print("Generationg from seed: {}".format(test_chars))
    print(test_chars,end="")
    
    for i in range(NUM_PREDS_PER_EPOCH):
        Xtest = np.zeros((1,SEQLEN,nb_chars))
        for j,ch in enumerate(test_chars):
            Xtest[0,j,char2index[ch]] = 1
        
        pred = model.predict(Xtest,verbose=0)[0]
        ypred = index2char[np.argmax(pred)]
        print(ypred,end="")
        test_chars = test_chars[1:] + ypred
    print()

Iteration #: 0
Epoch 1/1
Generationg from seed: of all thi
of all thin the the the the the the the the the the the the the the the the the the the the the the the the th
Iteration #: 1
Epoch 1/1
Generationg from seed:  tortoise,
 tortoise, and the groplone the mast on the said the groplone the mast on the said the groplone the mast on th
Iteration #: 2
Epoch 1/1
Generationg from seed: k. she pit
k. she pither all as in the wast of the southen the wast of the southen the wast of the southen the wast of th
Iteration #: 3
Epoch 1/1
Generationg from seed: use this e
use this eree say she said the say her she seat the cat on the the the her her her her her her her her her her
Iteration #: 4
Epoch 1/1
Generationg from seed:  very unea
 very uneas and the sere har seat and the mouse the mouse the mouse the mouse the mouse the mouse the mouse th
Iteration #: 5
Epoch 1/1
Generationg from seed: ‘i keep th
‘i keep the could be the looked and the morken the more the more the more the more the more