In [1]:
import numpy as np
import pandas as pd
import re

from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
from keras.utils import np_utils
from keras.callbacks import EarlyStopping
from nltk.corpus import stopwords
np.random.seed(1)
from sklearn.model_selection import train_test_split

from keras.preprocessing.text import text_to_word_sequence

import matplotlib.pyplot as plt

import gc

%matplotlib inline

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
 
stop_words = set(stopwords.words('english')) 

def remove_stop_words(text):
    word_tokens = word_tokenize(text) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    return ' '.join(filtered_sentence)

[nltk_data] Downloading package stopwords to C:\Users\Mike del
[nltk_data]     Castillo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Mike del
[nltk_data]     Castillo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
data = pd.read_csv('data/twitter-airline-sentiment.csv')

In [4]:
data['text'] = list(map(remove_stop_words, data['text'].values))
data.head()

Unnamed: 0,text,pos,neg
0,said,0.0,0.0
1,plus youve added commercials experience tacky,1.0,0.0
2,didnt today must mean need take another trip,0.0,0.0
3,really aggressive blast obnoxious entertainmen...,0.0,1.0
4,really big bad thing,0.0,1.0


In [5]:
sentences = data['text'].values
corpus = [text_to_word_sequence(y) for y in sentences]

In [6]:
X_raw_train, X_raw_test, Y_train, Y_test = train_test_split(
    sentences,
    data[['pos', 'neg']].values,
    random_state=69, 
    test_size=0.1, shuffle=True
)

In [7]:
max_sentence_length = 35
vector_size = 300

In [8]:
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

# word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
word2vec = Word2Vec(sentences=corpus,
                    size=vector_size, 
                    window=10, 
                    negative=20,
                    iter=50,
                    seed=1000,
                    workers=4)
word2vec = word2vec.wv

gc.collect()



11

In [9]:
def sent2index(corpus):
    gc.collect()
    input_matrix = np.zeros((len(corpus),max_sentence_length))
    
    for i in range(len(corpus)):
        for t, token in enumerate(corpus[i]):
            if t >= max_sentence_length:
                break
            if token not in word2vec.vocab:
                continue
            input_matrix[i, t] = word2vec.vocab.get(token).index
    return input_matrix

In [10]:
X_train = sent2index(X_raw_train)
X_test = sent2index(X_raw_test)

In [11]:
gc.collect()
vocab_len = len(word2vec.vocab) + 1

emb_matrix = np.zeros((vocab_len, vector_size))

for word in word2vec.vocab:
    index = word2vec.vocab.get(word).index
    emb_matrix[index, :] = word2vec[word]
    
print(emb_matrix.shape)

(2585, 300)


In [12]:
gc.collect()
sentence_indices = Input(shape=(max_sentence_length,))
    
embedding_layer = Embedding(vocab_len, vector_size, trainable = False)
embedding_layer.build((None,))
embedding_layer.set_weights([emb_matrix])

embeddings = embedding_layer(sentence_indices)

In [13]:
X = LSTM(128, return_sequences=True)(embeddings)
X = Dropout(0.5)(X)
X = LSTM(128, return_sequences=False)(X)
X = Dropout(0.5)(X)
X = Dense(2, activation=None)(X)
X = Activation('softmax')(X)

model = Model(inputs=[sentence_indices], outputs=X)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 35)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 35, 300)           775500    
_________________________________________________________________
lstm_1 (LSTM)                (None, 35, 128)           219648    
_________________________________________________________________
dropout_1 (Dropout)          (None, 35, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
__________

In [14]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

model.fit(
    X_train, 
    y=Y_train, 
    batch_size=16, 
    epochs=20, 
    verbose=1, 
    validation_data=(X_test, Y_test), 
#     callbacks=[earlystop],
)



Train on 12483 samples, validate on 1388 samples
Epoch 1/20


InternalError: Blas GEMM launch failed : a.shape=(16, 128), b.shape=(128, 128), m=16, n=128, k=128
	 [[{{node lstm_1/while/MatMul_7}} = MatMul[T=DT_FLOAT, _class=["loc:@training/Adam/gradients/AddN_13"], transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/device:GPU:0"](lstm_1/while/Switch_3:1, lstm_1/while/MatMul_7/Enter)]]
	 [[{{node loss/mul/_125}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_3365_loss/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

In [None]:
mess = np.array([
    'this is slow',
    'this is exceptional service',
])
X_test_indices = sent2index(mess)
pred = model.predict(X_test_indices)

output = ''
for i ,m in enumerate(mess):
        output += ('{} {} {}\n'.format('POSITIVE:' if pred[i][0] > 0.5 else 'NEGATIVE:', m, pred[i]))
        
print(output)
        