In [1]:
import os
import collections

from keras.callbacks import TensorBoard
from keras.layers import Dense, Dropout, Conv1D, Embedding, GlobalMaxPooling1D
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import nltk
import numpy as np
import codecs

# Seed NumPy's global random number generator.
np.random.seed(42)

# NOTE: "__file__" below is a string literal, not the module attribute, so
# os.path.dirname() returns "" and both paths end up relative to the current
# working directory.
_BASE_DIR = os.path.dirname("__file__")
INPUT_FILE = os.path.join(_BASE_DIR, "data/training.txt")
LOG_DIR = os.path.join(_BASE_DIR, "logs")

# Vocabulary / model hyper-parameters.
VOCAB_SIZE = 5000   # how many of the most frequent tokens receive a word id
EMBED_SIZE = 100    # output dimension given to the Embedding layer
NUM_FILTERS = 256   # `filters` argument of the Conv1D layer
NUM_WORDS = 3       # `kernel_size` argument of the Conv1D layer

# Training hyper-parameters passed to model.fit().
BATCH_SIZE = 64
NUM_EPOCHS = 20

Using TensorFlow backend.


In [2]:
# First pass over the corpus: count how often each lower-cased token occurs
# and record the token length of the longest sentence.
counter = collections.Counter()
maxlen = 0
with codecs.open(INPUT_FILE, "r", encoding="utf-8") as fin:
    for line in fin:
        # Each line is "<label>\t<sentence>"; the label is not needed here.
        _, sent = line.strip().split("\t")
        try:
            # Tokenize with NLTK (Natural Language Toolkit).
            words = [x.lower() for x in nltk.word_tokenize(sent)]
        except LookupError:
            # The tokenizer data is not installed yet: fetch it and retry.
            print("English tokenize does not downloaded. So download it.")
            nltk.download("punkt")
            words = [x.lower() for x in nltk.word_tokenize(sent)]
        maxlen = max(maxlen, len(words))
        counter.update(words)

# Build the lookup tables once, after all counts are final. (Rebuilding them
# for every token, as was done before, yields the same final tables but
# repeats the full ranking for each token in the corpus.)
# The VOCAB_SIZE most frequent tokens get ids 1..N in descending frequency;
# id 0 is what the defaultdict returns for any token without an id.
word2index = collections.defaultdict(int)
for wid, (token, _count) in enumerate(counter.most_common(VOCAB_SIZE), start=1):
    word2index[token] = wid
# +1 accounts for the reserved id 0.
vocab_sz = len(word2index) + 1
index2word = {v: k for k, v in word2index.items()}

In [3]:
# Second pass over the corpus: turn every "<label>\t<sentence>" line into an
# integer label and a list of word ids.
xs, ys = [], []
with codecs.open(INPUT_FILE, "r", encoding="utf-8") as fin:
    for line in fin:
        label, sent = line.strip().split("\t")
        ys.append(int(label))
        words = [x.lower() for x in nltk.word_tokenize(sent)]
        # Look ids up with .get() rather than []: indexing a defaultdict
        # inserts each unknown token as a new key, which would silently grow
        # the vocabulary. Tokens without an id still map to 0.
        wids = [word2index.get(word, 0) for word in words]
        xs.append(wids)

# Bring every id sequence to length `maxlen` and one-hot encode the labels.
X = pad_sequences(xs, maxlen=maxlen)
Y = np_utils.to_categorical(ys)

In [5]:
# Hold out 30% of the samples; the held-out part is used as validation data.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Network: embedding -> dropout -> 1D convolution -> global max pooling ->
# softmax over two output units.
# NOTE(review): the 2-unit output assumes the labels have two classes - confirm.
model = Sequential([
    Embedding(vocab_sz, EMBED_SIZE, input_length=maxlen),
    Dropout(0.2),
    Conv1D(filters=NUM_FILTERS, kernel_size=NUM_WORDS, activation="relu"),
    GlobalMaxPooling1D(),
    Dense(2, activation="softmax"),
])

# Compile and train.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
history = model.fit(
    Xtrain,
    Ytrain,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    # The TensorBoard callback is commented out because it raised an error:
    # callbacks=[TensorBoard(LOG_DIR)],
    validation_data=(Xtest, Ytest),
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 4960 samples, validate on 2126 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
