In [1]:
import numpy as np
import pandas as pd
import re

from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from keras.utils import np_utils
from keras.callbacks import EarlyStopping
from nltk.corpus import stopwords
np.random.seed(1)
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import text_to_word_sequence

import matplotlib.pyplot as plt

import gc

%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
from keras.preprocessing.text import text_to_word_sequence

stop_words = pd.read_csv('../data/stopwords.csv')['words'].values

def remove_stop_words(text):
    word_tokens = text_to_word_sequence(text) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    return ' '.join(filtered_sentence)

print(stop_words)

['only' 'y' 'by' 'am' 'most' 'me' 'same' 'these' 'so' 'some' 'why' 'down'
 'had' 'd' 'at' 'having' 'those' 'has' 'few' 'theirs' "you've" 'more' 'i'
 'than' 'through' 'be' 'what' 'where' 'myself' 'which' 'doing' 'ours'
 'will' 'in' 'both' 'do' 'it' 'o' 'on' 'yours' 'once' 'ourselves' 'here'
 'about' "it's" 'my' 'for' 'her' 'then' 'after' "should've" 'from' 'each'
 'when' 'does' 'now' 'off' 'don' 'are' 'we' 'itself' 'should' 'his'
 'between' 'our' 'were' 'under' 'other' 'all' 'she' 'won' 'been' "you're"
 'how' 'did' 'yourself' 'they' 'into' 'there' 've' 'such' 't' 's' 'and'
 'over' 'to' 'just' 'was' 'being' 'because' 'if' 'who' 'further' 'the'
 'any' "that'll" 'themselves' 'as' 'again' "you'd" 'until' 'he' 'him'
 'this' 'or' 'of' 'below' 'an' "she's" 'weren' 'm' 'their' 'ma' 'up' 'll'
 'whom' 'hers' 'can' 'you' 'them' 'very' 'a' 'herself' 'before' 'too'
 'himself' 'during' 're' 'out' 'its' 'above' 'own' 'have' 'while'
 'yourselves' 'that' 'with' "you'll" 'is' 'your']


In [3]:
data = pd.read_csv('../data/twitter-airline-sentiment.csv')

In [4]:
data['text'] = list(map(remove_stop_words, data['text'].values))
data.head()

Unnamed: 0,text,pos,neg
0,said,0.0,0.0
1,plus youve added commercials experience tacky,1.0,0.0
2,didnt today must mean need take another trip,0.0,0.0
3,really aggressive blast obnoxious entertainmen...,0.0,1.0
4,really big bad thing,0.0,1.0


In [5]:
sentences = data['text'].values
corpus = [text_to_word_sequence(y) for y in sentences]

In [6]:
X_raw_train, X_raw_test, Y_train, Y_test = train_test_split(
    sentences,
    data[['pos', 'neg']].values,
    test_size=0.2, 
    random_state=3945
)

In [7]:
max_sentence_length = 35
vector_size = 300

In [8]:
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

# word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
word2vec = Word2Vec(sentences=corpus,
                    size=vector_size, 
                    window=10, 
                    negative=20,
                    iter=50,
                    seed=1000,
                    workers=4)
word2vec = word2vec.wv

gc.collect()



11

In [9]:
def sent2index(corpus):
    gc.collect()
    input_matrix = np.zeros((len(corpus),max_sentence_length))
    
    for i in range(len(corpus)):
        for t, token in enumerate(corpus[i]):
            if t >= max_sentence_length:
                break
            if token not in word2vec.vocab:
                continue
            input_matrix[i, t] = word2vec.vocab.get(token).index
    return input_matrix

In [10]:
X_train = sent2index(X_raw_train)
X_test = sent2index(X_raw_test)

In [11]:
gc.collect()
vocab_len = len(word2vec.vocab) + 1

emb_matrix = np.zeros((vocab_len, vector_size))

for word in word2vec.vocab:
    index = word2vec.vocab.get(word).index
    emb_matrix[index, :] = word2vec[word]
    
print(emb_matrix.shape)

(2590, 300)


In [12]:
gc.collect()
sentence_indices = Input(shape=(max_sentence_length,))
    
embedding_layer = Embedding(vocab_len, vector_size, trainable = False)
embedding_layer.build((None,))
embedding_layer.set_weights([emb_matrix])

embeddings = embedding_layer(sentence_indices)

In [13]:
X = LSTM(128, return_sequences=True)(embeddings)
X = Dropout(0.5)(X)
X = LSTM(128, return_sequences=False)(X)
X = Dropout(0.5)(X)
X = Dense(2, activation=None)(X)
X = Activation('softmax')(X)

model = Model(inputs=[sentence_indices], outputs=X)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 35)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 35, 300)           777000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 35, 128)           219648    
_________________________________________________________________
dropout_1 (Dropout)          (None, 35, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
__________

In [14]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

model.fit(
    X_train, 
    y=Y_train, 
    batch_size=32, 
    epochs=20, 
    verbose=1, 
    validation_data=(X_test, Y_test), 
#     callbacks=[earlystop],
)



Train on 11096 samples, validate on 2775 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2a407ac39e8>

In [15]:
mess = np.array([
    'this is slow',
    'this is exceptional service',
])
X_test_indices = sent2index(mess)
pred = model.predict(X_test_indices)

output = ''
for i ,m in enumerate(mess):
        output += ('{} {} {}\n'.format('POSITIVE:' if pred[i][0] > 0.5 else 'NEGATIVE:', m, pred[i]))
        
print(output)
        

NEGATIVE: this is slow [0.30715123 0.6928488 ]
POSITIVE: this is exceptional service [0.9860232  0.01397681]

