In [1]:
from __future__ import print_function
import os
import string
import numpy as np
np.random.seed(1337)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn import preprocessing
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.optimizers import SGD, RMSprop, Adam

Using Theano backend.


In [2]:
# Paths and hyper-parameters shared by the cells below.
BASE_DIR = '.'
GLOVE_DIR = BASE_DIR + '/glove.6B/'  # directory that holds glove.6B.100d.txt
TEXT_DATA_DIR = BASE_DIR + '/sentiment/'  # trailing slash matters: file names are appended with '+'
MAX_SEQUENCE_LENGTH = 500  # length passed as maxlen to pad_sequences
MAX_NB_WORDS = 5000  # vocabulary cap passed to the Tokenizer and used to size the embedding matrix
EMBEDDING_DIM = 100  # width of each embedding row; matches the 100d GloVe file that is loaded
VALIDATION_SPLIT = 0.2  # fraction of the samples held out for validation

In [3]:
# Parse the GloVe text file into a {word: vector} lookup.
# Each line is split on whitespace: the first field is the word, the remaining
# fields are converted to a float32 vector.
embeddings_index = {}
# `with` releases the file handle even if a malformed line makes parsing raise.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


##### Run once: strip punctuation, stop words and words shorter than 4 characters from reviews.txt (writes reviews_new.txt)

In [29]:
import string
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

# Build the stop-word set once. Previously the NLTK word list was re-evaluated
# and scanned linearly for every single word of every review.
stop_words = set(stopwords.words('english'))

# NOTE(review): this cell reads and writes in the current directory, while the
# loading cell further down reads TEXT_DATA_DIR + 'reviews_new.txt' -- confirm
# the cell is run from inside the sentiment directory (or move the output there).
with open('reviews.txt','r') as inFile, open('reviews_new.txt','w') as outFile:
    for line in inFile:
        # Lower-case, drop punctuation (Python 2 str.translate signature), then keep
        # only words of at least 4 characters that are not stop words.
        words = line.lower().translate(None, string.punctuation).split()
        kept = [word for word in words if len(word) >= 4 and word not in stop_words]
        print(' '.join(kept), file=outFile)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lc5319843\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load the pre-processed reviews, one review per line.  What we know!
# rstrip('\n') removes only the line terminator; the former x[:-1] also chopped
# the last real character of a final line that has no trailing newline.
# `with` closes the file even if reading fails.
with open(TEXT_DATA_DIR + 'reviews_new.txt', 'r') as g:
    texts = [line.rstrip('\n') for line in g]

print('Found %s texts.' % len(texts))

Found 25000 texts.


In [22]:
#import nltk
#nltk.download("stopwords")
#nltk.download("punkt")
#from nltk.corpus import stopwords
#new_texts = []

#def cleanupDoc(s):
#     stopset = set(stopwords.words('english'))
#     tokens = nltk.word_tokenize(s)
#     cleanup = " ".join(filter(lambda word: word not in stopset, s.split()))
#     return cleanup

#for text in texts:
#    new_texts.append(cleanupDoc(text))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lc5319843\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lc5319843\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [5]:
# Peek at the first cleaned review (shown as a one-element list).
texts[:1]

['bromwell high cartoon comedy time programs school life teachers years teaching profession lead believe bromwell high satire much closer reality teachers scramble survive financially insightful students right pathetic teachers pomp pettiness whole situation remind schools knew students episode student repeatedly tried burn school immediately recalled high classic line inspector sack teachers student welcome bromwell high expect many adults think bromwell high fetched pity']

In [5]:
# Load the labels, one per line, and upper-case them.  What we WANT to know!
# They are treated as aligned with `texts` by position: both are later shuffled
# with the same index permutation.
# rstrip('\n') removes only the line terminator (x[:-1] would truncate a last
# line lacking a trailing newline); `with` guarantees the file is closed.
with open(TEXT_DATA_DIR + 'labels.txt', 'r') as g:
    labels = [line.rstrip('\n').upper() for line in g]

In [6]:
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)

In [7]:
# Word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index
print('Found {} unique tokens.'.format(len(word_index)))

Found 71456 unique tokens.


In [8]:
# Bring every id sequence to MAX_SEQUENCE_LENGTH entries so they stack into one 2D array.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', data.shape)
# Preview of the first ten rows.
data[:10]

Shape of data tensor: (25000L, 500L)


array([[   0,    0,    0, ...,  163, 3769, 1914],
       [   0,    0,    0, ..., 2983,   25, 2975],
       [   0,    0,    0, ...,    2,  186,  243],
       ..., 
       [   0,    0,    0, ...,  435,   10, 2801],
       [   0,    0,    0, ...,  117,  346,   10],
       [   0,    0,    0, ...,    2,   30,  384]])

In [9]:
# Encode the string labels as integer class ids.
le = preprocessing.LabelEncoder()
labels = le.fit_transform(labels)

# Message corrected: this prints the shape of the label vector, not of the data tensor.
print('Shape of label tensor:', labels.shape)
labels[0:10]

Shape of data tensor: (25000L,)


array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0], dtype=int64)

In [10]:
# Convert the integer class ids into a categorical (one column per class) matrix,
# the format expected by the categorical_crossentropy loss used at compile time.
# NOTE: this rebinds `labels`, so the cell is not idempotent -- re-run the label
# cells from the loading step instead of executing this cell twice.
labels = to_categorical(labels)

In [11]:
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [12]:
# Hold out the last nb_validation_samples rows for validation.
# An explicit split position replaces negative slicing: with
# nb_validation_samples == 0, data[:-0] yields an EMPTY training set and
# data[-0:] puts every sample into validation.
split_at = data.shape[0] - nb_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]

In [13]:
# prepare embedding matrix
nb_words = min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [14]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(nb_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [18]:
# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 3, activation='relu')(embedded_sequences)
x = MaxPooling1D(3)(x)
#x = Conv1D(128, 5, activation='relu')(x)
#x = MaxPooling1D(5)(x)
#x = Conv1D(128, 5, activation='relu')(x)
#x = MaxPooling1D(35)(x)
x = Flatten()(x)
x = Dense(250, activation='relu')(x)
preds = Dense(2, activation='softmax')(x)


model = Model(sequence_input, preds)

model.compile(Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# happy learning!
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=2, batch_size=128)

Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x284a7b38>

In [23]:
# Create the target directory first: on a fresh checkout './weights' does not
# exist and writing the HDF5 file would fail.
if not os.path.isdir('./weights'):
    os.makedirs('./weights')
model.save_weights('./weights/embed_conv.h5')

In [None]:
# Restore the weights written by the save cell into `model`; the cell that
# builds `model` must have been run first.
model.load_weights('./weights/embed_conv.h5')

In [21]:
# Sanity check on a hand-written review, pushed through the same tokenizer and
# padding as the training data.
# NOTE(review): the training text was lower-cased and stripped of punctuation,
# stop words and words shorter than 4 characters by the "run once" cell; the raw
# input here is not cleaned the same way -- confirm that is acceptable.
test = ['i loved it']
seq_test = tokenizer.texts_to_sequences(test)
data_test = pad_sequences(seq_test, maxlen=MAX_SEQUENCE_LENGTH)

pred = model.predict(data_test)

In [22]:
# Softmax output for the test sentence: one value per class.
# presumably the column order follows the LabelEncoder's class order -- verify with le.classes_
pred

array([[ 0.35135821,  0.64864177]], dtype=float32)

In [97]:
# Leftover inspection of the model's Input tensor (its execution count, 97, is
# out of sequence); can be dropped from a final notebook.
sequence_input

input_14