In [1]:
import numpy as np
import pandas as pd

from gensim import corpora
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer

from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, SpatialDropout1D, InputLayer
from keras.layers import LSTM
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
# Seed NumPy's global random number generator so NumPy-based randomness repeats between runs.
SEED = 0
np.random.seed(SEED)

In [3]:
# Load data: both splits are tab-separated files whose first row is the header.
tsv_options = {'sep': '\t', 'header': 0}
train_df = pd.read_csv('../data/train.tsv', **tsv_options)
test_df = pd.read_csv('../data/test.tsv', **tsv_options)

In [4]:
# Pull the raw phrase text out of both frames, and the labels out of the training frame.
raw_docs_train, raw_docs_test = (frame['Phrase'].values for frame in (train_df, test_df))
sentiment_train = train_df['Sentiment'].values

# Number of distinct sentiment values present in the training labels.
num_labels = np.unique(sentiment_train).size

In [5]:
# Text pre-processing resources: NLTK's English stop words extended with
# punctuation tokens, plus an English Snowball stemmer.
punctuation_tokens = ['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}']
stop_words = set(stopwords.words('english')).union(punctuation_tokens)
stemmer = SnowballStemmer('english')

In [6]:
# If word_tokenize complains about missing tokenizer data, run
# nltk.download('punkt') once (left disabled here).

print("pre-processing train docs...")

# Tokenise each training phrase, drop stop words / punctuation, then stem what is left.
# NOTE(review): the stop-word test is case-sensitive, so capitalised stop words
# (e.g. "The") presumably survive the filter before being stemmed - confirm
# whether that is intended; any change must be mirrored in the test-set cell.
processed_docs_train = [
    [stemmer.stem(token) for token in word_tokenize(raw_doc) if token not in stop_words]
    for raw_doc in raw_docs_train
]

pre-processing train docs...


In [7]:
print("pre-processing test docs...")

# Same pipeline as for the training phrases: tokenise, remove stop words and
# punctuation, stem the surviving tokens.
# NOTE(review): the stop-word test is case-sensitive, so capitalised stop words
# (e.g. "The") presumably survive the filter before being stemmed - confirm
# whether that is intended; any change must be mirrored in the train-set cell.
processed_docs_test = [
    [stemmer.stem(token) for token in word_tokenize(raw_doc) if token not in stop_words]
    for raw_doc in raw_docs_test
]

pre-processing test docs...


In [8]:
# Merge the train and test token lists into one corpus (train documents first).
# Plain list concatenation replaces np.concatenate: the documents have differing
# lengths, so NumPy would have to coerce them into a ragged object array, which
# is deprecated and raises an error on recent NumPy releases.
processed_docs_all = list(processed_docs_train) + list(processed_docs_test)

In [9]:
# Build the token -> integer id vocabulary over every processed document.
dictionary = corpora.Dictionary(processed_docs_all)

# Vocabulary size = number of distinct tokens that received an id.
dictionary_size = len(dictionary.token2id)
print("dictionary size:", dictionary_size)

dictionary size: 13759


In [10]:
print("converting to token ids...")


def _docs_to_ids(docs):
    """Return, for each tokenised document in `docs`, its list of dictionary token ids.

    Raises KeyError if a token is missing from `dictionary.token2id`.
    """
    return [[dictionary.token2id[word] for word in doc] for doc in docs]


word_id_train = _docs_to_ids(processed_docs_train)
word_id_test = _docs_to_ids(processed_docs_test)

# Token count of every document, train documents first and then test documents,
# so the sequence length below is derived from both splits.
word_id_len = [len(ids) for ids in word_id_train + word_id_test]

if not word_id_len:
    # Without any documents the mean/std below would be NaN and seq_len meaningless.
    raise ValueError("no documents available to derive seq_len from")

# Target sequence length: mean document length plus two standard deviations,
# rounded and stored as a plain Python int.
seq_len = int(np.round(np.mean(word_id_len) + 2 * np.std(word_id_len)))

converting to token ids...


In [11]:
# Bring every id sequence to exactly seq_len entries. The lists are passed to
# pad_sequences directly: wrapping variable-length lists in np.array() builds a
# ragged object array, which is deprecated and an error on recent NumPy releases.
# NOTE(review): pad_sequences is used with its default padding value, presumably 0,
# which may coincide with a real token id from the dictionary - confirm whether
# padding needs to be distinguishable from vocabulary ids.
word_id_train = sequence.pad_sequences(word_id_train, maxlen=seq_len)
word_id_test = sequence.pad_sequences(word_id_test, maxlen=seq_len)

# One-hot encode the training labels into num_labels columns.
y_train_enc = np_utils.to_categorical(sentiment_train, num_labels)

In [14]:
#LSTM
print("fitting LSTM ...")

# Layer stack: token embedding (128 dims) -> LSTM (128 units) -> dense layer with
# one output per label -> softmax.
model = Sequential([
    Embedding(dictionary_size, 128),
    LSTM(128),
    Dense(num_labels),
    Activation('softmax'),
])

model.compile(
    optimizer=Adam(0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

fitting LSTM ...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 128)         1761152   
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 645       
_________________________________________________________________
activation_3 (Activation)    (None, 5)                 0         
Total params: 1,893,381
Trainable params: 1,893,381
Non-trainable params: 0
_________________________________________________________________


In [1]:
# NOTE(review): the saved output for this cell is a NameError from running it as
# In [1] on a fresh kernel; it needs the model and data cells to be run first.

# Early stopping watches val_loss (lower is better) with min_delta=0.0001 and patience=10.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.0001,
    patience=10,
    verbose=0,
)

model.fit(
    word_id_train,
    y_train_enc,
    batch_size=32,
    epochs=300,
    validation_split=0.2,
    callbacks=[early_stopping],
)

NameError: name 'model' is not defined