# テキストデータの操作

In [6]:
# One-hot encoding of words and characters: word-level version
import numpy as np
samples = ['the cat sat on the mat.', 'the dog ate my homework.']

# Give every distinct whitespace-separated token a unique integer, in order of
# first appearance. Numbering starts at 1, so index 0 is never assigned.
token_index = {}
for sample in samples:
    for word in sample.split():
        # setdefault only inserts when the token has not been seen yet
        token_index.setdefault(word, len(token_index) + 1)

# Only the first max_length tokens of each sample are encoded
max_length = 10

# results[i, j, k] is 1 when the j-th token of sample i has index k
results = np.zeros(
    (len(samples), max_length, max(token_index.values()) + 1)
)
for i, sample in enumerate(samples):
    for j, word in enumerate(sample.split()[:max_length]):
        index = token_index[word]
        results[i, j, index] = 1.

In [7]:
# Character-level one-hot encoding
import string

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Every character in string.printable gets an index from 1 to len(characters);
# index 0 is left unassigned.
characters = string.printable
token_index = dict(zip(characters, range(1, len(characters) + 1)))

# Only the first max_length characters of each sample are encoded
max_length = 50

# results[i, j, k] is 1 when the j-th character of sample i has index k
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    for j, character in enumerate(sample[:max_length]):
        index = token_index.get(character)
        if index is None:
            # Character outside string.printable: keep its vector all zeros.
            # Indexing with None would be treated by NumPy as np.newaxis and
            # silently set the whole vector results[i, j] to 1.
            continue
        results[i, j, index] = 1.

In [8]:
# Word-level one-hot encoding using Keras
from keras.preprocessing.text import Tokenizer

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

# Create a tokenizer configured to handle only the 1,000 most frequent words
tokenizer = Tokenizer(num_words=1000)

# Build the word index from the samples
tokenizer.fit_on_texts(samples)

# Convert each string into a list of integer indices
sequences = tokenizer.texts_to_sequences(samples)

# The binary one-hot representation can also be obtained directly
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

# Retrieve the word index that the tokenizer built and report its size
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Using TensorFlow backend.


Found 9 unique tokens.


In [9]:
from keras.layers import Embedding

# The two required arguments, spelled out by keyword: the number of possible
# token indices (1000) and the size of each embedding vector (64).
embedding_layer = Embedding(input_dim=1000, output_dim=64)

In [10]:
from keras.datasets import imdb
from keras import preprocessing

# Number of words to consider as features
max_features = 10000

# Cut each text off after this many words
# (counting only the max_features most frequent words)
maxlen = 20

# Load the IMDB data, keeping only the max_features most frequent words
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Convert to 2D tensors (presumably of shape (samples, maxlen) -- TODO confirm)
x_train = preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)

In [11]:
from keras.models import Sequential
from keras.layers import Flatten, Dense

model = Sequential()

# The embedding's vocabulary size must match the num_words used when loading
# the data, so reuse max_features instead of repeating the literal 10000.
# input_length is given so the embedded output can be flattened afterwards;
# the layer outputs (samples, maxlen, 8).
model.add(Embedding(max_features, 8, input_length=maxlen))

# Flatten the 3D embedding output into a 2D tensor of shape (samples, maxlen * 8)
model.add(Flatten())

# Single sigmoid unit on top for binary classification
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

# Train for 10 epochs, holding out 20% of the training data for validation
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________
Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
