In [1]:
import collections

from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import nltk
import numpy as np
import codecs
import os

# Seed NumPy's global RNG so NumPy-based randomness is repeatable across runs.
np.random.seed(42)

# os.path.dirname() of the literal string "__file__" (it contains no path
# separator) is "", so every path below is relative to the current working
# directory. Computing it once keeps the three paths consistent.
BASE_DIR = os.path.dirname("__file__")
INPUT_FILE = os.path.join(BASE_DIR, "data/training.txt")
LOG_DIR = os.path.join(BASE_DIR, "logs")
GLOVE_MODEL = os.path.join(BASE_DIR, "data/glove.6B.100d.txt")
VOCAB_SIZE = 5000   # number of most frequent words kept in the vocabulary
EMBED_SIZE = 100    # width of one embedding vector (matches glove.6B.100d)
BATCH_SIZE = 64     # mini-batch size passed to model.fit
NUM_EPOCHS = 10     # number of epochs passed to model.fit

Using TensorFlow backend.


In [2]:
def _tokenize_lower(sentence):
    """Return the lower-cased NLTK word tokens of ``sentence``.

    If NLTK raises ``LookupError`` (tokenizer data not installed), the
    "punkt" resource is downloaded and the tokenization is retried once.
    """
    try:
        tokens = nltk.word_tokenize(sentence)
    except LookupError:
        print("English tokenize does not downloaded. So down load it.")
        nltk.download("punkt")
        tokens = nltk.word_tokenize(sentence)
    return [token.lower() for token in tokens]


# Single pass over the corpus: each line is "<label>\t<sentence>". The tokens
# are cached so the file is neither re-read nor re-tokenized further down.
print("reading data...")
counter = collections.Counter()
sentences, ys = [], []
maxlen = 0
with codecs.open(INPUT_FILE, "r", encoding="utf-8") as fin:
    for line in fin:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines (e.g. a trailing newline)
        # maxsplit=1 keeps any further tabs inside the sentence text.
        label, sent = line.split("\t", 1)
        words = _tokenize_lower(sent)
        ys.append(int(label))
        sentences.append(words)
        counter.update(words)
        maxlen = max(maxlen, len(words))

# Ids 1..VOCAB_SIZE go to the most frequent words; id 0 is shared by padding
# and out-of-vocabulary words and is shown as "_UNK_" in the reverse mapping.
print("creating vocabulary...")
word2index = collections.defaultdict(int)
for rank, (word, _count) in enumerate(counter.most_common(VOCAB_SIZE)):
    word2index[word] = rank + 1
vocab_sz = len(word2index) + 1
index2word = {v: k for k, v in word2index.items()}
index2word[0] = "_UNK_"

print("creating word sequences...")
# .get() instead of indexing: indexing a defaultdict would insert every
# unknown word as a new key and silently grow the vocabulary mapping.
ws = [[word2index.get(word, 0) for word in words] for words in sentences]

# Pad every id sequence to the longest sentence; one-hot encode the labels.
W = pad_sequences(ws, maxlen=maxlen)
Y = np_utils.to_categorical(ys)

reading data...
creating vocabulary...
creating word sequences...


In [3]:
# Parse the GloVe text file: the first whitespace-separated field of a line is
# the word, the remaining fields are its vector components. Words absent from
# the file fall back to the defaultdict's scalar 0, which broadcasts to an
# all-zero column when assigned below.
word2emb = collections.defaultdict(int)
with codecs.open(GLOVE_MODEL, "r", encoding="utf-8") as fglove:
    for raw_line in fglove:
        fields = raw_line.strip().split()
        word2emb[fields[0]] = np.array(fields[1:], dtype="float32")

# Represent each sentence by the sum of its word vectors: fill one column of E
# per position in the padded id sequence, then collapse the position axis.
num_rows = W.shape[0]
X = np.zeros((num_rows, EMBED_SIZE))
for row in range(num_rows):
    E = np.zeros((EMBED_SIZE, maxlen))
    tokens = [index2word[wid] for wid in W[row].tolist()]
    for col in range(maxlen):
        E[:, col] = word2emb[tokens[col]]
    X[row, :] = E.sum(axis=1)

In [4]:
# Hold out 30% of the samples; the fixed random_state keeps the split repeatable.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=42)

# One output unit per one-hot label column, so the softmax layer always matches
# the targets instead of assuming exactly two classes.
num_classes = Y.shape[1]

# Small feed-forward classifier on top of the summed sentence embeddings.
model = Sequential()
model.add(Dense(32, input_dim=EMBED_SIZE, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation="softmax"))

model.compile(optimizer="adam", loss="categorical_crossentropy",
              metrics=["accuracy"])
# The held-out split doubles as validation data, so the per-epoch validation
# metrics and the final evaluation below are computed on the same samples.
history = model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    validation_data=(Xtest, Ytest))

# score[0] is the loss, score[1] the accuracy metric requested in compile().
score = model.evaluate(Xtest, Ytest, verbose=1)
print("Test score: {:.3f}, accuracy: {:.3f}".format(score[0], score[1]))

Train on 4960 samples, validate on 2126 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test score: 0.102, accuracy: 0.969
