# Chap3 の内容を1時間で Keras に置き換える

## 参考
* [Keras Documentation](https://keras.io/ja/)
  * Kerasの公式ドキュメントの翻訳
* [nzw0301/keras-examples (GitHub)](https://github.com/nzw0301/keras-examples/blob/master/CBoW.ipynb)
  * KerasでCBOWを実装した（らしい）例
* [oreilly-japan/deep-learning-from-scratch-2(GitHub)](https://github.com/oreilly-japan/deep-learning-from-scratch-2.git)
  * 「ゼロから作る Deep Learning 2」のサンプルコード

In [28]:
import numpy as np

In [56]:
from keras.utils.data_utils import get_file

path = get_file('alice.txt', origin='http://www.gutenberg.org/files/11/11-0.txt')
corpus = open(path).readlines()[:300]
#print("corpus = {cps}".format(cps=corpus))
print("corpus.shape={len}".format(len=np.array(corpus).shape))
corpus = [sentence for sentence in corpus if sentence.count(' ') >= 2]
print("corpus = {cps}".format(cps=corpus))
print("corpus.shape={len}".format(len=np.array(corpus).shape))

corpus.shape=(300,)
corpus = ['\ufeffProject Gutenberg’s Alice’s Adventures in Wonderland, by Lewis Carroll\n', 'This eBook is for the use of anyone anywhere at no cost and with\n', 'almost no restrictions whatsoever.  You may copy it, give it away or\n', 're-use it under the terms of the Project Gutenberg License included\n', 'with this eBook or online at www.gutenberg.org\n', 'Title: Alice’s Adventures in Wonderland\n', 'Author: Lewis Carroll\n', 'Posting Date: June 25, 2008 [EBook #11]\n', 'Release Date: March, 1994\n', 'Last Updated: October 6, 2016\n', 'Character set encoding: UTF-8\n', '*** START OF THIS PROJECT GUTENBERG EBOOK ALICE’S ADVENTURES IN WONDERLAND ***\n', 'ALICE’S ADVENTURES IN WONDERLAND\n', 'THE MILLENNIUM FULCRUM EDITION 3.0\n', 'CHAPTER I. Down the Rabbit-Hole\n', 'Alice was beginning to get very tired of sitting by her sister on the\n', 'bank, and of having nothing to do: once or twice she had peeped into the\n', 'book her sister was reading, but it had no pictu

In [57]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
corpus = tokenizer.texts_to_sequences(corpus)
print("corpus = {cps}".format(cps=corpus))
print("corpus.shape={len}".format(len=np.array(corpus).shape))

corpus = [[348, 349, 65, 115, 10, 116, 57, 196, 197], [22, 117, 66, 17, 1, 67, 8, 350, 351, 29, 38, 352, 4, 27], [146, 38, 353, 354, 16, 355, 356, 5, 198, 5, 147, 32], [357, 67, 5, 118, 1, 358, 8, 1, 199, 148, 359, 360], [27, 22, 117, 32, 361, 29, 362, 148, 363], [364, 65, 115, 10, 116], [365, 196, 197], [366, 200, 367, 368, 369, 117, 370], [371, 200, 372, 373], [374, 375, 376, 377, 378], [379, 201, 380, 381, 382], [383, 8, 22, 199, 148, 117, 65, 115, 10, 116], [65, 115, 10, 116], [1, 384, 385, 386, 387, 388], [202, 13, 15, 1, 39, 119], [11, 6, 389, 3, 58, 19, 203, 8, 390, 57, 12, 204, 20, 1], [391, 4, 8, 149, 68, 3, 59, 91, 32, 392, 2, 21, 393, 43, 1], [120, 12, 204, 6, 394, 18, 5, 21, 38, 150, 32, 395, 10], [5, 75, 44, 66, 1, 67, 8, 7, 120, 9, 49, 11, 396, 150, 32], [25, 2, 6, 205, 10, 12, 151, 206, 23, 92, 23, 2, 50, 17, 1], [152, 397, 207, 12, 208, 19, 209, 4, 210, 211, 1, 398], [8, 399, 7, 400, 401, 60, 24, 402, 1, 212, 8, 121, 34, 4], [403, 1, 404, 40, 93, 7, 94, 39, 27, 405, 153

In [58]:
# https://keras.io/ja/preprocessing/text/
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence

text = 'You say goodbye and I say hello.'
text = text.replace('.', ' .') # ピリオドを分割するため
print("text = {txt}".format(txt=text))

# 単語に分割するだけなら text_to_word_sequence 関数で実行できる
# ピリオドをフィルタされたくないので，デフォルト値からピリオドを除いた値を与える
corpus = text_to_word_sequence(text, filters='!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n')
print("corpus = {cps}".format(cps=corpus))

print("-"*20)

# ベクトル化のため，keras の Tokenizer クラスを使いたい
# 単語がアルファベットレベルまで分割されてしまう??
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
corpus2 = tokenizer.texts_to_sequences(text)
matrix = tokenizer.texts_to_matrix(text)
print("corpus = {cps}".format(cps=corpus2))
print("corpus.shape={len}".format(len=np.array(corpus2).shape))
print("matrix = {mtr}".format(mtr=matrix))


text = You say goodbye and I say hello .
corpus = ['you', 'say', 'goodbye', 'and', 'i', 'say', 'hello', '.']
--------------------
corpus = [[1], [2], [8], [], [4], [3], [1], [], [9], [2], [2], [5], [10], [1], [6], [], [3], [11], [5], [], [12], [], [4], [3], [1], [], [13], [6], [7], [7], [2], [], []]
corpus.shape=(33,)
matrix = [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 