In [0]:
'''
In this notebook I will try to infer more data from classical imdb review sentiment analysis.
I will try to find most positive and most negative keywords.
I will check the sample that yields negative or positive with same keywords.
I will create 2 models 
1 normal LSTM
1 bidirectional LSTM 
and try to understand if they have different results.
'''
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
tf.get_logger().setLevel('INFO')

n_unique_words = 10000 
maxlen = 200
batch_size = 128

In [2]:
#load data
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=n_unique_words)
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
y_train = np.array(y_train)
y_test = np.array(y_test)



Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [207]:
#create a Bidirectional model
modelBi = Sequential()
modelBi.add(Embedding(n_unique_words, 128, input_length=maxlen))
modelBi.add(Bidirectional(LSTM(64)))
modelBi.add(Dropout(0.5))
modelBi.add(Dense(1, activation='sigmoid'))
modelBi.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
modelBi.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 200, 128)          1280000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               98816     
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total params: 1,378,945
Trainable params: 1,378,945
Non-trainable params: 0
_________________________________________________________________


In [4]:
#fit
modelBi.fit(x_train, y_train,batch_size=batch_size,epochs=4,validation_data=[x_test, y_test])




Train on 25000 samples, validate on 25000 samples
Epoch 1/4





Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f360d70d630>

In [208]:
#create LSTM modeL
modelLSTM = Sequential()
modelLSTM.add(Embedding(n_unique_words, 128, input_length=maxlen))
modelLSTM.add(LSTM(64))
modelLSTM.add(Dropout(0.5))
modelLSTM.add(Dense(1, activation='sigmoid'))

modelLSTM.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
modelLSTM.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 200, 128)          1280000   
_________________________________________________________________
lstm_5 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 1,329,473
Trainable params: 1,329,473
Non-trainable params: 0
_________________________________________________________________


In [8]:
modelLSTM.fit(x_train, y_train,batch_size=batch_size,epochs=4,validation_data=[x_test, y_test])

Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f35b4405780>

In [155]:
import nltk
#nltk.download()
nltk.download('punkt')
from nltk import word_tokenize
from keras.preprocessing import sequence

#creata reverse index
word_index = imdb.get_word_index()
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
word2index = imdb.get_word_index()

#convert a sequence of indexes into keyword list. used to convert review from int sequence to text
def seq_to_word(xi):
  review = [reverse_word_index.get(i-3, "?") for i in xi]
  return " ".join(review)

#single index to word
def index_to_word(xi):
  review = reverse_word_index.get(xi-3, "?")
  return review

def predict(model,index,max_review_length):
 
  test=[index for _ in range(max_review_length)]
  test=sequence.pad_sequences([test],maxlen=max_review_length) 
  return model.predict(test)

#fill a review with a single keywords and calculate point.
#of course this is not an exact solution but a good approximation
def get_best_scores(model,max_review_length):
  index_with_prediction = []
  for index in range(4,10000):   
    prediction = predict(model,index,max_review_length)
    index_with_prediction.append((index,prediction))

  sorted_by_second = sorted(index_with_prediction, key=lambda tup: tup[1])
  return sorted_by_second

#find the review that include a keyword
def get_review_indexes(word_index,xs,ys):
   
  pos_indexes = []
  neg_indexes = []
  sum = 0
  for id,x in enumerate(xs) :
    if word_index in x:
      if ys[id] == 0:
        pos_indexes.append(id)
      else:  
        neg_indexes.append(id)
      sum += ys[id]
  ratio = (sum * 100 ) / (len(pos_indexes)+len(neg_indexes) ) 
  print(sum ," positive samples in ",(len(pos_indexes)+len(neg_indexes) ) , " reference ,  ratio % : ",int(ratio))   
  return pos_indexes,neg_indexes ,sum ,ratio

#of all reviews a keyword exists,how many of them are positive
def get_review_ratio(word_index,xs,ys):
   
  all_references = 0
  positive_references = 0
  for id,x in enumerate(xs) :
    if word_index in x:
      all_references += 1
      positive_references += ys[id]
  ratio = (positive_references * 100 ) / all_references 
  return  all_references,positive_references,ratio
   

import math  
def print_seperated(s,width):
  [print(s[i*width:(i+1)*width]) for i in range( math.ceil(len(s)  / width ))]

def print_sep(s,width):
  current_width = 0
  currents = []
  splits = s.split(" ")
  for split in splits:
    current_width += len(split)
    if current_width > width:
      print( " ".join(currents) )
      currents = []
      current_width = 0
    currents.append(split)
  print( " ".join(currents) )  

#get positive and negative samples of a keyword
def get_index_samples(index,sample_size):
  pos_indexes,neg_indexes,sum,ratio = get_review_indexes(index ,x_train,y_train)
  for y in pos_indexes[0:sample_size]:
    print("positive sample------------------------")
    print( index_to_word2(index)) 
    print( print_sep(seq_to_word2(x_train[y]) ,100))
    print(  y_train[y] )  
  for y in neg_indexes[0:sample_size]:
    print("negative sample------------------------")
    print( "keyword :",index_to_word2(index)) 
    print( "review   :" )   
    print_sep(seq_to_word2(x_train[y]) ,100)
    print( "label   :" ,y_train[y] )    

   

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


29 84 34.523809523809526


In [0]:
#get best score for Bidirectional LSTM
best_scoresBi = get_best_scores(modelBi,maxlen)


In [0]:
#get best score for  LSTM
best_scoresLSTM = get_best_scores(modelLSTM,maxlen)

In [0]:
import pandas as pd

def review_to_df(scores):
  stats = []
  for xi in scores: 
    review_score =  get_review_ratio(xi[0],x_train,y_train)
    stats.append([index_to_word2(xi[0]),xi[1][0][0],review_score[0],review_score[1],int(review_score[2])])
  stats_arr = np.array(stats)
  df = pd.DataFrame()
  df["Word"] = stats_arr[:,0:1].flatten()
  df["Score on Model"] = stats_arr[:,1:2].flatten()
  df["Total Reference"] = stats_arr[:,2:3].flatten()  
  df["Positive Reference"] = stats_arr[:,3:4].flatten()
  df["Ratio"] = stats_arr[:,4:5].flatten()

  return df

In [142]:
#Print keywords yielding to negative
review_to_df(best_scoresLSTM[0:20])

Unnamed: 0,Word,Score on Model,Total Reference,Positive Reference,Ratio
0,destroy,2.102829e-05,129,51,39
1,terrible,2.2661348e-05,1064,150,14
2,howling,2.3461156e-05,24,5,20
3,poorly,2.4582972e-05,472,48,10
4,clooney,2.6178206e-05,32,16,50
5,werewolves,2.6190368e-05,23,2,8
6,acknowledge,2.688756e-05,26,11,42
7,pitiful,2.8412234e-05,56,8,14
8,unbearable,2.8801885e-05,84,15,17
9,precious,3.0029294e-05,84,29,34


In [202]:
#Print keywords yielding to negative positive
review_to_df(best_scoresLSTM[-1:-20:-1])

Unnamed: 0,Word,Score on Model,Total Reference,Positive Reference,Ratio
0,stunning,0.9999895,280,212,75
1,stack,0.9999876,31,26,83
2,timeless,0.99998677,92,81,88
3,perfect,0.9999865,1029,825,80
4,courage,0.99998415,105,80,76
5,cost,0.999984,165,72,43
6,worlds,0.99998355,73,58,79
7,refreshing,0.9999832,163,147,90
8,antwone,0.99998295,22,22,100
9,wonderful,0.9999825,1166,955,81


In [144]:
review_to_df(best_scoresBi[0:20])

Unnamed: 0,Word,Score on Model,Total Reference,Positive Reference,Ratio
0,worst,3.874825e-06,1807,153,8
1,poorly,3.971354e-06,472,48,10
2,puppets,4.8796437e-06,37,8,21
3,button,5.0452336e-06,76,18,23
4,costs,5.5173787e-06,206,28,13
5,wasting,5.5777646e-06,119,7,5
6,tiresome,5.5951327e-06,71,12,16
7,precious,5.8419682e-06,84,29,34
8,pathetic,6.196008e-06,339,44,12
9,embarrassment,6.4617125e-06,70,7,10


In [201]:

review_to_df(best_scoresBi[-1:-20:-1])

Unnamed: 0,Word,Score on Model,Total Reference,Positive Reference,Ratio
0,winchester,0.9999981,23,22,95
1,eerie,0.99999785,100,73,73
2,supports,0.9999976,23,16,69
3,suffer,0.9999975,117,56,47
4,adds,0.99999726,230,163,70
5,aiello,0.99999714,17,16,94
6,timeless,0.99999714,92,81,88
7,wonderful,0.9999969,1166,955,81
8,searched,0.9999968,22,18,81
9,spoilers,0.99999666,211,84,39


In [157]:
#print negative keywords with positive and negative samples
#1st is worst , but sometimes it is labeled negative and sometimes positive
#below printing 5 samples of each keyword
for index in best_scoresBi[0:2]:
  print("index :",index[0]," score : ",index[1][0][0])
  get_index_samples(index[0],5)
  print("*************************************************************************")

index : 249  score :  3.874825e-06
153  positive samples in  1807  reference ,  ratio % :  8
positive sample------------------------
worst
? ? ? ? ? ? ? ? ? ? ? ? big hair big boobs bad music and a giant safety pin these are the words to best describe this terrible movie
i love cheesy horror movies and i've seen hundreds but this had got to be on of the worst ever made the plot is paper thin and
ridiculous the acting is an abomination the script is completely laughable the best is the end showdown with the cop and how he
worked out who the killer is it's just so damn terribly written the clothes are sickening and funny in equal ? the hair is big lots
of boobs ? men wear those cut ? shirts that show off their ? sickening that men actually wore them and the music is just ? trash
that plays over and over again in almost every scene there is trashy music boobs and ? taking away bodies and the gym still
doesn't close for ? all joking aside this is a truly bad film whose only charm is to loo

In [204]:
#print negative keywords with positive and negative samples
#1st is worst , but sometimes it is labeled negative and sometimes positive
#below printing 5 samples of each keyword
for index in best_scoresBi[-1:-3:-1]:
  print("index :",index[0]," score : ",index[1][0][0])
  get_index_samples(index[0],5)
  print("*************************************************************************")

index : 5829  score :  0.9999981
22  positive samples in  23  reference ,  ratio % :  95
positive sample------------------------
winchester
? in far he is shot off a raft with such violence it looks so convincing that you ? and of course when he is dragged through
the fire in man well you find yourself looking for the burn marks what an actor not to mention the moment in winchester when he
is beaten up early in the hotel room also as well as anybody ever did it br br but that was ? territory look at gary cooper
fighting with jack lord in man of the west as painful as any fight scene ever recorded cooper while not being quite as convincing as
stewart nevertheless is somehow his equal in looking exhausted at the end of the fight in short nobody but nobody but nobody ever
showed the human being in ? as well as mann br br what a great great director br br see every western he ever made they are his real ?
even if all are ? but so what when he gets roaring with his great scenes they are as 