In [1]:
from keras.layers.core import Dense, Dropout, SpatialDropout1D
from keras.layers.convolutional import Conv1D
from keras.layers.embeddings import Embedding
from keras.layers.pooling import GlobalMaxPooling1D
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import collections
import matplotlib.pyplot as plt
import nltk
import numpy as np

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Seed NumPy's global random number generator so that NumPy-driven
# randomness is repeatable from one run to the next.
np.random.seed(42)

In [3]:
# Training corpus; each line is read as "<label>\t<sentence>".
INPUT_FILE = "data/umich-sentiment-train.txt"
# Upper bound on how many of the most frequent tokens get their own word id.
VOCAB_SIZE = 5000
# Width of each word embedding vector (second argument to Embedding).
EMBED_SIZE = 100
# Number of filters in the Conv1D layer.
NUM_FILTERS = 256
# Conv1D kernel_size: how many consecutive tokens each filter looks at.
NUM_WORDS = 3
# Mini-batch size passed to model.fit.
BATCH_SIZE = 64
# Number of training epochs passed to model.fit.
NUM_EPOCHS = 20

In [4]:
# First pass over the corpus: count how often each lower-cased token occurs
# and record the length (in tokens) of the longest sentence.
counter = collections.Counter()
maxlen = 0
# `with` guarantees the file is closed even if a line fails to parse.
with open(INPUT_FILE, "r") as fin:
    for line in fin:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the two-value unpack below.
            continue
        # Each record is "<label>\t<sentence>"; the label is not needed here.
        _label, sent = line.split("\t")
        words = [x.lower() for x in nltk.word_tokenize(sent)]
        maxlen = max(maxlen, len(words))
        counter.update(words)

In [5]:
# Assign ids 1..N to the N most frequent tokens (most frequent first).
# Id 0 is left unassigned: the defaultdict returns it for unknown tokens.
word2index = collections.defaultdict(int)
for rank, (token, _count) in enumerate(counter.most_common(VOCAB_SIZE), start=1):
    word2index[token] = rank
# One extra slot to account for the reserved id 0.
vocab_sz = len(word2index) + 1
# Reverse lookup: id -> token.
index2word = dict((idx, token) for token, idx in word2index.items())

In [6]:
# Second pass over the corpus: convert every sentence into a list of word
# ids and collect the matching integer labels.
xs, ys = [], []
# `with` guarantees the file is closed even if a line fails to parse.
with open(INPUT_FILE, "r") as fin:
    for line in fin:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        label, sent = line.split("\t")
        ys.append(int(label))
        words = [x.lower() for x in nltk.word_tokenize(sent)]
        # Use .get() rather than indexing: word2index is a defaultdict, so
        # word2index[word] would insert every out-of-vocabulary token as a
        # new key. Unknown tokens still map to id 0, exactly as before.
        wids = [word2index.get(word, 0) for word in words]
        xs.append(wids)
# Bring every id sequence to the same length (the longest sentence).
X = pad_sequences(xs, maxlen=maxlen)

In [7]:
# One-hot encode the integer labels collected in `ys`.
Y = np_utils.to_categorical(ys)

In [8]:
# Hold out 30% of the samples for validation/testing; random_state pins the
# shuffle so the same split is produced on every run.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, random_state=42)

In [9]:
# Standalone embedding layer, kept in its own variable so that its weights
# can be read back with get_weights() after it has been used in a model.
embedding_layer = Embedding(vocab_sz, EMBED_SIZE, input_length=maxlen)

In [10]:
# Throwaway model containing only the embedding layer, used to look at the
# vectors it produces for the training inputs. It is compiled but never
# fitted, so the output reflects the layer's initial weights.
model = Sequential([embedding_layer])
model.compile(optimizer='rmsprop', loss='mse')
output_array = model.predict(Xtrain)





In [11]:
# Display the embedding-only model's output for the training inputs.
output_array

array([[[ 0.03748308,  0.01016643,  0.04459797, ..., -0.04893805,
          0.00047791, -0.02263826],
        [ 0.03748308,  0.01016643,  0.04459797, ..., -0.04893805,
          0.00047791, -0.02263826],
        [ 0.03748308,  0.01016643,  0.04459797, ..., -0.04893805,
          0.00047791, -0.02263826],
        ...,
        [-0.00230595, -0.04273555,  0.02799037, ..., -0.01727477,
         -0.00382271,  0.0072687 ],
        [ 0.04931537, -0.01106163,  0.03705365, ..., -0.04269552,
          0.00466845,  0.04468817],
        [-0.04827228, -0.02407642, -0.04692221, ..., -0.026925  ,
          0.02328039, -0.00387763]],

       [[ 0.03748308,  0.01016643,  0.04459797, ..., -0.04893805,
          0.00047791, -0.02263826],
        [ 0.03748308,  0.01016643,  0.04459797, ..., -0.04893805,
          0.00047791, -0.02263826],
        [ 0.03748308,  0.01016643,  0.04459797, ..., -0.04893805,
          0.00047791, -0.02263826],
        ...,
        [-0.02031668, -0.01787167,  0.01170753, ..., -

In [12]:
# Read back the embedding layer's weights; get_weights() returns a list of
# arrays (a single weight matrix for this layer).
embeddings = embedding_layer.get_weights()

In [13]:
# Display the weight list retrieved from the embedding layer.
embeddings

[array([[ 0.03748308,  0.01016643,  0.04459797, ..., -0.04893805,
          0.00047791, -0.02263826],
        [ 0.00261544,  0.04760228,  0.01978265, ...,  0.03155864,
          0.02775392,  0.00534248],
        [ 0.03266538,  0.0100345 ,  0.04009962, ..., -0.01641399,
         -0.02212536, -0.04621423],
        ...,
        [ 0.01281841, -0.01239831,  0.02396257, ...,  0.04984382,
         -0.00712018,  0.0439778 ],
        [ 0.03896398,  0.04419916,  0.03258631, ...,  0.02866254,
         -0.00583298, -0.0156497 ],
        [-0.04864553,  0.00828662, -0.00148288, ..., -0.03226625,
         -0.01160889,  0.00142257]], dtype=float32)]

In [None]:
# Sentence classifier: embed the word ids, apply spatial dropout, run a 1D
# convolution over NUM_WORDS-token windows, max-pool over the whole
# sequence, and finish with a 2-way softmax.
model = Sequential([
    Embedding(vocab_sz, EMBED_SIZE, input_length=maxlen),
    SpatialDropout1D(0.2),
    Conv1D(filters=NUM_FILTERS, kernel_size=NUM_WORDS, activation="relu"),
    GlobalMaxPooling1D(),
    Dense(2, activation="softmax"),
])

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, and
# accuracy tracked under the "acc" metric name.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["acc"],
)
# Train, using the held-out split as validation data after each epoch; the
# returned History object carries the per-epoch metrics.
history = model.fit(
    Xtrain,
    Ytrain,
    validation_data=(Xtest, Ytest),
    epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
)

In [None]:
# Top panel: training vs. validation accuracy per epoch.
acc_ax = plt.subplot(211)
acc_ax.set_title("accuracy")
acc_ax.plot(history.history["acc"], color="r", label="train")
acc_ax.plot(history.history["val_acc"], color="b", label="validation")
acc_ax.legend(loc="best")

In [None]:
# Bottom panel: training vs. validation loss per epoch.
loss_ax = plt.subplot(212)
loss_ax.set_title("loss")
loss_ax.plot(history.history["loss"], color="r", label="train")
loss_ax.plot(history.history["val_loss"], color="b", label="validation")
loss_ax.legend(loc="best")

In [None]:
# Adjust subplot spacing on the current figure, then render it.
plt.tight_layout()
plt.show()

In [None]:
# Score the trained model on the held-out split; with one metric configured,
# evaluate() yields the loss first and the accuracy second.
score = model.evaluate(Xtest, Ytest, verbose=1)
test_loss, test_acc = score[0], score[1]
print("Test score: {:.3f}, accuracy: {:.3f}".format(test_loss, test_acc))