In [184]:
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout
from keras.models import Model
from keras import backend as K    
K.set_image_dim_ordering('th') 

In [156]:
corpus = pd.read_pickle('../data/MR.pkl')
corpus= corpus.sample(frac=1)
sentences, labels = list(corpus.sentence), list(corpus.label)
print len(sentences)

10662


In [157]:
corpus.head(5)

Unnamed: 0,label,sentence,split
4504,0,johnnie to and wai ka fai are sure to find an ...,train
2294,0,a didactic and dull documentary glorifying sof...,train
4436,0,it 's a gag that 's worn a bit thin over the y...,train
1447,0,. a hokey piece of nonsense that tries too har...,train
9037,1,"depending upon your reaction to this movie , y...",train


In [158]:
#Increasing the value will increase sequence length in many sentences. Captures more words
TOP_N_WORDS = 5000

In [159]:
tokenizer = Tokenizer(nb_words=TOP_N_WORDS)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 18758 unique tokens.


In [160]:
#Tokenizer.word_index shows the word and its index in the dictionary
#These indices are fed as a sequence
print (tokenizer.word_index)



In [161]:
#This will show how the first sentence has been converted to numeric sequence
print sequences[0]
#This will show the first sentence itself
print "Sentence: "+sentences[0]
#This will loop through every word of the first sentence and see which word is not added in the sequence.
print "Comment: Words not added from the first sentence along with their ID`s"
words = sentences[0].split(" ")
for i in words:
    if i in word_index:
        if word_index[i] not in sequences[0]:
            print i, word_index[i]

[5, 3, 28, 402, 5, 215, 19, 3364, 139, 655, 186, 102, 444, 2628, 12, 1, 13, 7, 85, 21, 43, 5, 3908, 445]
Sentence: johnnie to and wai ka fai are sure to find an enthusiastic audience among american action adventure buffs , but the film 's interests may be too narrow to attract crossover viewers .
Comment: Words not added from the first sentence along with their ID`s
johnnie 10106
wai 10107
ka 10108
fai 10109
interests 5593
narrow 5594
crossover 5595


In [162]:
max_sequence_length = 0
min_sequence_length = -1
j = -1
for i in sequences:
    seq_len = len(i)
    
    if min_sequence_length == -1:
        min_sequence_length = seq_len
        
    if seq_len > max_sequence_length:
        max_sequence_length = seq_len
    
    if seq_len < min_sequence_length and min_sequence_length!=-1:
        min_sequence_length = seq_len
        j = i
        
print min_sequence_length
print max_sequence_length

0
52


In [163]:
data = pad_sequences(sequences, maxlen=max_sequence_length)
import numpy as np
data_labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', data_labels.shape)

('Shape of data tensor:', (10662, 52))
('Shape of label tensor:', (10662, 2))


In [164]:
data[0]
data_labels[0]

array([1., 0.])

In [165]:
import os
GLOVE_DIR = "/home/manoj/Downloads/glove.6B/"
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print "Loaded "+str(len(embeddings_index))+" word embeddings from GLOVE"

Loaded 400000 word embeddings from GLOVE


In [166]:
EMBEDDING_DIM = len(embeddings_index["the"])

In [167]:
#+1 for bias.
#len(word_index) because we have so many unique tokens after all the filtering.

embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be random numbers.
        embedding_matrix[i] = embedding_vector

In [168]:
embedding_matrix.shape

(18759, 100)

In [225]:
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=max_sequence_length,
                            trainable=True)

In [234]:
sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(2, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc'])

In [235]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_38 (InputLayer)        (None, 52)                0         
_________________________________________________________________
embedding_4 (Embedding)      (None, 52, 100)           1875900   
_________________________________________________________________
conv1d_75 (Conv1D)           (None, 48, 128)           64128     
_________________________________________________________________
max_pooling1d_69 (MaxPooling (None, 9, 128)            0         
_________________________________________________________________
conv1d_76 (Conv1D)           (None, 5, 128)            82048     
_________________________________________________________________
max_pooling1d_70 (MaxPooling (None, 1, 128)            0         
_________________________________________________________________
flatten_24 (Flatten)         (None, 128)               0         
__________

In [241]:
model.fit(data, data_labels,epochs=2, batch_size=128)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fed8c300050>

In [242]:
model.fit(data, data_labels,epochs=2, batch_size=128, validation_split= 0.1)

Train on 9595 samples, validate on 1067 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fed8c300310>