## Simple sentiment analysis with Keras and GloVe embeddings

In [1]:
import gensim
# Load the pre-trained 300-d GloVe vectors from a text file in word2vec format.
# NOTE(review): the ".w2v.txt" suffix suggests the raw GloVe file was converted
# to word2vec format beforehand — confirm how data/ was prepared.
g = gensim.models.KeyedVectors.load_word2vec_format("data/glove.6B.300d.w2v.txt")



In [2]:
# Sanity check of the loaded embeddings: list the nearest neighbours of "hello"
# as (word, similarity score) pairs.
g.most_similar("hello")

[('goodbye', 0.5937638282775879),
 ('hey', 0.5576733946800232),
 ('!', 0.5420621633529663),
 ('dolly', 0.4840484857559204),
 ('muddah', 0.47528818249702454),
 ('yeah', 0.46858829259872437),
 ('wow', 0.4671405553817749),
 ('greeting', 0.4478132128715515),
 ('thank', 0.4400608241558075),
 ('kitty', 0.4307236075401306)]

In [3]:
import nltk
# English stopwords, stored as a set because the review filter below tests
# membership once per token.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded already (nltk.download("stopwords")) — confirm for fresh setups.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [4]:
import numpy as np

In [5]:
# Build one list of tokens per review (one review per line of the file).
# Each token is lowercased *before* it is filtered: the stopword set and the
# glove.6B vocabulary are lowercase, so testing the original-case token (as the
# previous version did) let capitalised stopwords such as "The" through the
# stopword check and dropped capitalised content words at the `in g` check.
# A token is kept when it is longer than 2 characters, is not a stopword, and
# has a GloVe vector.
with open("data/reviews.txt") as f:
    reviews = [
        [
            token
            for token in (raw.lower() for raw in line.split())
            if len(token) > 2 and token not in stopwords and token in g
        ]
        for line in f
    ]

In [6]:
# Binary targets, one per line of the labels file: 1 when the stripped line is
# exactly 'positive', 0 for anything else.
with open("data/labels.txt") as f:
    labels = np.array([int(line.strip() == 'positive') for line in f])

In [10]:
# Fixed sequence length: passed as `maxlen` to pad_sequences and as
# `input_length` to the Embedding layers below.
max_words = 100
# Width of each word vector; has to match the 300-d GloVe file loaded above.
embedding_dim = 300

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# The Keras tokenizer works on plain strings, so glue each filtered token list
# back together with single spaces.
texts = list(map(" ".join, reviews))

# Learn the word -> integer id mapping from the corpus.
tokenizer = Tokenizer(num_words=50000)
tokenizer.fit_on_texts(texts)
vocab = tokenizer.word_index

# Encode every review as a list of ids, then force all of them to length
# `max_words` so they stack into a single 2-D array.
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=max_words)

In [12]:
# Embedding matrix: row i is the GloVe vector of the word whose tokenizer id
# is i. One extra row is allocated beyond len(vocab); rows that never get a
# vector (including words missing from GloVe) stay all-zero.
glove_emb = np.zeros(shape=(len(vocab) + 1, embedding_dim))
for word, i in vocab.items():
    if word not in g:
        continue  # no pre-trained vector available; keep the zero row
    glove_emb[i] = g[word]

In [21]:
from tensorflow import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM

In [22]:
# Positional split with no shuffling: the first 20000 rows are used for
# training, everything after that for validation.
X_train, X_val = data[:20000], data[20000:]
y_train, y_val = labels[:20000], labels[20000:]

In [29]:
# Baseline network: frozen GloVe embeddings -> one LSTM -> sigmoid output.
model = Sequential([
    # Embedding weights come from glove_emb and are not updated during training.
    Embedding(
        len(vocab) + 1,
        embedding_dim,
        weights=[glove_emb],
        input_length=max_words,
        trainable=False,
    ),
    LSTM(128, dropout=0.3, recurrent_dropout=0.3),
    # Single sigmoid unit: output in (0, 1), matching the 0/1 labels.
    Dense(1, activation='sigmoid'),
])

In [None]:
# Configure training: binary cross-entropy loss, RMSprop, accuracy reported.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)
# Train for 20 epochs in mini-batches of 128, evaluating on the held-out
# split after every epoch.
model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=20,
    validation_data=(X_val, y_val),
)

In [31]:
# Deeper variant: frozen GloVe embeddings -> two stacked LSTMs -> dense head.
# Rebinding `model` replaces whatever model the name referred to before.
model = Sequential([
    # Embedding weights come from glove_emb and are not updated during training.
    Embedding(
        len(vocab) + 1,
        embedding_dim,
        weights=[glove_emb],
        input_length=max_words,
        trainable=False,
    ),
    # return_sequences=True hands the full per-timestep output to the next LSTM.
    LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3),
    LSTM(64, dropout=0.3, recurrent_dropout=0.3),
    Dense(64, activation='relu'),
    # Single sigmoid unit: output in (0, 1), matching the 0/1 labels.
    Dense(1, activation='sigmoid'),
])

In [32]:
# Configure training: binary cross-entropy loss, RMSprop, accuracy reported.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)
# Train for 20 epochs in mini-batches of 128, evaluating on the held-out
# split after every epoch. The call stays last so its return value is shown
# as the cell output.
model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=20,
    validation_data=(X_val, y_val),
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x19888dc6be0>

In [None]:
# Lone sample sentence. No cell visible in this notebook reads this global
# afterwards (the `s` inside test_on_texts is that function's own parameter).
s = "cool movie !"

In [95]:
def test_on_texts(s):
    print(model.predict(pad_sequences(tokenizer.texts_to_sequences(s), maxlen=max_words)))

In [97]:
# Spot-check the model on a handful of hand-written reviews; one prediction is
# printed per sentence, in the order given here.
test_on_texts([
    'this was a great movie',
    'i really liked it',
    'terrible film, hated it',
    'the movie was so-so',
    'oh yeah, great movie, a new godfather',
    'i would watch it again and again and again'
    ])

[[0.7248762 ]
 [0.65714085]
 [0.27267292]
 [0.49111417]
 [0.74458504]
 [0.5209482 ]]
