In [1]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers.recurrent import LSTM
from keras.layers import Dense, Input, Flatten
from keras.callbacks import LambdaCallback
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout,GlobalMaxPooling1D
from keras.models import Model
from keras import backend as K    
K.set_image_dim_ordering('th') 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
corpus = pd.read_pickle('../data/MR.pkl')
corpus= corpus.sample(frac=1)
sentences, labels = list(corpus.sentence), list(corpus.label)
print len(sentences)

10662


In [3]:
corpus.head(5)

Unnamed: 0,label,sentence,split
3460,0,"it 's a boring movie about a boring man , made...",train
2953,0,a movie that tries to fuse the two 'woods' but...,train
808,0,"'the war of the roses , ' trailer trash style ...",train
1244,0,"shallow , noisy and pretentious .",train
9190,1,cut through the layers of soap opera emotion a...,train


In [4]:
#Increasing the value will increase sequence length in many sentences. Captures more words
TOP_N_WORDS = 5000

In [5]:
tokenizer = Tokenizer(nb_words=TOP_N_WORDS)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))



Found 18758 unique tokens.


In [6]:
#Tokenizer.word_index shows the word and its index in the dictionary
#These indices are fed as a sequence
print (tokenizer.word_index)



In [7]:
#This will show how the first sentence has been converted to numeric sequence
print sequences[0]
#This will show the first sentence itself
print "Sentence: "+sentences[0]
#This will loop through every word of the first sentence and see which word is not added in the sequence.
print "Comment: Words not added from the first sentence along with their ID`s"
words = sentences[0].split(" ")
for i in words:
    if i in word_index:
        if word_index[i] not in sequences[0]:
            print i, word_index[i]

[8, 7, 2, 376, 17, 29, 2, 376, 166, 93, 882, 25, 2, 160, 34, 2, 386, 4, 132, 376]
Sentence: it 's a boring movie about a boring man , made watchable by a bravura performance from a consummate actor incapable of being boring .
Comment: Words not added from the first sentence along with their ID`s
bravura 5593
consummate 7192
incapable 5594


In [8]:
max_sequence_length = 0
min_sequence_length = -1
j = -1
for i in sequences:
    seq_len = len(i)
    
    if min_sequence_length == -1:
        min_sequence_length = seq_len
        
    if seq_len > max_sequence_length:
        max_sequence_length = seq_len
    
    if seq_len < min_sequence_length and min_sequence_length!=-1:
        min_sequence_length = seq_len
        j = i
        
print min_sequence_length
print max_sequence_length

0
52


In [9]:
data = pad_sequences(sequences, maxlen=max_sequence_length)
import numpy as np
data_labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', data_labels.shape)

('Shape of data tensor:', (10662, 52))
('Shape of label tensor:', (10662, 2))


In [10]:
data[0]
data_labels[0]

array([1., 0.])

In [11]:
import os
GLOVE_DIR = "/home/manoj/Downloads/glove.6B/"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print "Loaded "+str(len(embeddings_index))+" word embeddings from GLOVE"

Loaded 400000 word embeddings from GLOVE


In [12]:
EMBEDDING_DIM = len(embeddings_index["the"])

In [13]:
#+1 for bias.
#len(word_index) because we have so many unique tokens after all the filtering.

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be random numbers.
        embedding_matrix[i] = embedding_vector

In [14]:
embedding_matrix.shape

(18759, 100)

In [15]:
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=max_sequence_length,
                            trainable=True)

In [53]:
sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

x = (LSTM(5, return_sequences=True))(embedded_sequences)
x = (LSTM(3, return_sequences=True))(x)
x = GlobalMaxPooling1D()(x)
x = Dropout(0.5)(x)
x = Dense(2, activation="softmax")(x)

model = Model(sequence_input, x)

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [54]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        (None, 52)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 52, 100)           1875900   
_________________________________________________________________
lstm_21 (LSTM)               (None, 52, 5)             2120      
_________________________________________________________________
lstm_22 (LSTM)               (None, 52, 3)             108       
_________________________________________________________________
global_max_pooling1d_11 (Glo (None, 3)                 0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 3)                 0         
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 8         
Total para

In [58]:
model.fit(data, data_labels,epochs=1, batch_size=128, validation_split= 0.1)

Train on 9595 samples, validate on 1067 samples
Epoch 1/1


<keras.callbacks.History at 0x7fdc0ff71710>