# Using AI for Sentiment Analysis

In [1]:
from keras.datasets import imdb

# Vocabulary size passed to the loader as `num_words`.
top_words = 10000

# Load the IMDB review dataset; the fixed seed keeps the shuffling of
# the examples reproducible between runs.
train_split, test_split = imdb.load_data(num_words=top_words, seed=21)
x_train, y_train = train_split
x_test, y_test = test_split

Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [2]:
# Report how many reviews ended up in each split.
for template, split in (("Training examples: %i", x_train),
                        ("Test examples: %i", x_test)):
    print(template % len(split))

Training examples: 25000
Test examples: 25000


In [3]:
import numpy as np
# Class balance check: distinct training labels and how often each occurs.
label_counts = np.unique(y_train, return_counts=True)
print(label_counts)

(array([0, 1], dtype=int64), array([12500, 12500], dtype=int64))


In [4]:
# Fetch the word -> rank index a single time; every get_word_index() call
# re-reads the index file, so the lookup is shared by both dictionaries.
word_index = imdb.get_word_index()

# Ids 0-2 are mapped to the special tokens below, so word ranks are shifted
# by 3 to line up with the ids used in the loaded sequences (this matches
# the `index_from=3` default documented for imdb.load_data).
index_offset = 3

word_to_id = {w: i + index_offset for w, i in word_index.items()}
id_to_word = {0:'<PAD>', 1:'<START>', 2:'<UNK>'}
id_to_word.update({i + index_offset: w for w, i in word_index.items()})

def convert_to_text(sequence):
    """Decode a sequence of word ids into a space-separated string.

    Ids below 3 are skipped; every remaining id is looked up in the
    module-level ``id_to_word`` mapping, and the resulting words are
    joined with single spaces.
    """
    words = []
    for token_id in sequence:
        if token_id >= 3:
            words.append(id_to_word[token_id])
    return ' '.join(words)

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json


In [5]:
 print(convert_to_text(x_train[8]))

this movie was like a bad train wreck as horrible as it was you still had to continue to watch my boyfriend and i rented it and wasted two hours of our day now don't get me wrong the acting is good just the movie as a whole just both of us there wasn't anything positive or good about this scenario after this movie i had to go rent something else that was a little lighter jennifer is as usual a very dramatic actress her character seems manic and not all there hannah though over played she does a wonderful job playing out the situation she is in more than once i found myself yelling at the tv telling her to fight back or to get violent all in all very violent movie not for the faint of heart


In [6]:
from keras.preprocessing.sequence import pad_sequences

# Fixed sequence length fed to the network.
max_pad = 200

# Bring every review in both splits to exactly `max_pad` ids; the padding
# and truncation sides are left at the pad_sequences defaults.
x_train, x_test = [
    pad_sequences(split, maxlen=max_pad) for split in (x_train, x_test)
]

In [7]:
# Inspect the first padded training review as raw word ids.
first_review = x_train[0]
print(first_review)

[  88    4 3310  406 6762    2    4  427 2140 1656 4042    2   11   41
    2  494   46 1954 4712  198   51   13  683 1193   10   10  198   66
   89    4  114  495 7303  197    4 1168 1656   61  492 1131    7 5388
   21   13  839   90  145    8  113   34 8253   27    2   19   15    7
    6 8870 3310   88 8222   92    2    8 5388    5 1037    2    2 2864
    2  449  168    6  404    2  112  207 1075    4  375 5986    7    4
  406 1522   13  124  903   97   90    2   21    2   48   32  148 3310
    2    2   93   61  492    2  305    7    2    4  893 8016   13  401
 5679   83   27  117 2687 5419   29  941 1889   90   21  808   14   46
  793    4 1526   84   37   28   34   96    7   49    2  114 1009 1054
   56   23   61 2301 1111    9    4  255    8  937   61  492   16 3953
  159   29 1131   13 2134 3872   81   41   32   14  832   56    8   35
  576 1301    5 5348 3134  255  335  170    8    2   72 1168 1656   57
   29    9    2    2 3310  415   11 5215   89 1047   10   10   81   24
  106 

In [8]:
from keras.models import Sequential 
from keras.layers import Bidirectional, Dense, Dropout 
from keras.layers import GlobalMaxPool1D, LSTM
from keras.layers.embeddings import Embedding 

# Size of the learned vector for each word id.
embedding_vector_length = 32

model = Sequential()

# Word ids -> dense vectors: (batch, max_pad) -> (batch, max_pad, 32).
model.add(Embedding(top_words,
                    embedding_vector_length,
                    input_length=max_pad))

# 64 LSTM units per direction; return_sequences=True keeps one output per
# time step so the pooling layer below can reduce over the whole sequence.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
# Max over the time axis -> one 128-dim vector per review.
model.add(GlobalMaxPool1D())
model.add(Dense(16, activation="relu"))
# Single sigmoid unit: the output is read against the binary 0/1 labels.
model.add(Dense(1, activation="sigmoid"))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 32)           320000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200, 128)          49664     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 371,745
Trainable params: 371,745
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
# Train for three passes over the training split.
# NOTE(review): validation_data is the test split, so the per-epoch
# validation metrics are measured on the same data that is later used for
# the final evaluation -- consider holding out part of the training data.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=256,
    epochs=3,
    validation_data=(x_test, y_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [10]:
# evaluate() returns the loss followed by the compiled metric (accuracy).
test_scores = model.evaluate(x_test, y_test, verbose=0)
loss, metric = test_scores
print("Test accuracy: %0.3f" % (metric,))

Test accuracy: 0.875
