In [0]:
'''
I want to try keras sample on imdb for Conv + LSTM
I will train 1 LSTM with Conv Layer.
Then I will train another normal  LSTM
and find reviews where they predict different values
'''

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
from keras.datasets import imdb

# Embedding
max_features = 20000
maxlen = 100
embedding_size = 128

# Convolution
kernel_size = 5
filters = 64
pool_size = 4

# LSTM
lstm_output_size = 70

# Training
batch_size = 30
epochs = 2

In [3]:
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

Loading data...
Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 100)
x_test shape: (25000, 100)
y_train shape: (25000,)
y_test shape: (25000,)


In [5]:
modelConv = Sequential()
modelConv.add(Embedding(max_features, embedding_size, input_length=maxlen))
modelConv.add(Dropout(0.25))
modelConv.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
modelConv.add(MaxPooling1D(pool_size=pool_size))
modelConv.add(LSTM(lstm_output_size))
modelConv.add(Dense(1))
modelConv.add(Activation('sigmoid'))

modelConv.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

modelConv.summary()





Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.



Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 128)          2560000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 96, 64)            41024     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 24, 64)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 70)                37800     
____________________________________

In [6]:
print('Train...')
modelConv.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))
score, acc = modelConv.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train...



Train on 25000 samples, validate on 25000 samples
Epoch 1/2





Epoch 2/2
Test score: 0.344353053689003
Test accuracy: 0.8570399937152863


In [7]:
modelLSTM = Sequential()
modelLSTM.add(Embedding(max_features, embedding_size, input_length=maxlen))
modelLSTM.add(LSTM(lstm_output_size))
modelLSTM.add(Dense(1))
modelLSTM.add(Activation('sigmoid'))

modelLSTM.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

modelLSTM.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 128)          2560000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 70)                55720     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 71        
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0         
Total params: 2,615,791
Trainable params: 2,615,791
Non-trainable params: 0
_________________________________________________________________


In [8]:
print('Train...')
modelLSTM.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))
score, acc = modelLSTM.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2
Test score: 0.40341880458891394
Test accuracy: 0.8452799948692322


In [0]:
import numpy as np
def to_binary(x):
  if x > 0.5 :
    return 1
  return 0  
vfunc = np.vectorize(to_binary)
#predict for both models
y_1_predictions = modelConv.predict(x_test)
y_2_predictions = modelLSTM.predict(x_test)

In [0]:
#find the indexes that has different predictions
y_1_predictions_binarized = vfunc(y_1_predictions)
y_2_predictions_binarized = vfunc(y_2_predictions)
comparison = y_1_predictions_binarized == y_2_predictions_binarized
notequals = [ index for index,k in enumerate(comparison) if not k]

In [12]:
#create reverse index
word_index = imdb.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
word2index = imdb.get_word_index()

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json


In [0]:
def seq_to_word(xi):
  review = [reverse_word_index.get(i-3, "?") for i in xi]
  return " ".join(review)

def print_sep(s,width):
  current_width = 0
  currents = []
  splits = s.split(" ")
  for split in splits:
    current_width += len(split)
    if current_width > width:
      print( " ".join(currents) )
      currents = []
      current_width = 0
    currents.append(split)
  print( " ".join(currents) )  

def dump_info(index):
  print("Review")
  print_sep(seq_to_word(x_train[index]),100)
  print("prediction Conv",y_1_predictions[index])
  print("prediction LSTM",y_2_predictions[index])
  print("real label ",y_test[index])

In [15]:
#dump 1st 20 different predictions
for index in notequals[0:20]:
  print("******************************")
  dump_info(index)

******************************
Review
like angus i got slightly annoyed with the pretext of hanging stories on more stories but also like angus i ? this once i
saw the ? picture ' forget the box office pastiche of braveheart and its like you might even ? the justly famous ? of the wicker
man to see a film that is true to scotland this one is probably unique if you maybe ? on it deeply enough you might even re
evaluate the power of storytelling and the age old question of whether there are some truths that cannot be told but only
experienced
prediction Conv [0.22966748]
prediction LSTM [0.69295824]
real label  0
******************************
Review
with nana is ? ? cameo provides the lighter moments and surely he's been pulled in to get the required star value though
his role was not really required he's done it well overall shakti is a far superior film than most churned out these days and
the pr release hype is sure to get it a good opening shakti is sure to get the critics and audie

NameError: ignored