# Sentiment Analysis in TensorFlow

In [1]:
import tensorflow as tf
from keras.datasets import imdb
from keras.preprocessing import sequence

Using Theano backend.
Using cuDNN version 6021 on context None
Mapped name None to device cuda: GeForce GTX 1050 (0000:01:00.0)


In [2]:
import numpy as np
from random import randint
import datetime

In [3]:
# Vocabulary size: passed to imdb.load_data as num_words and used as the
# number of rows of the embedding matrix.
max_features = 10000
# Fixed length (in word ids) of every review after sequence.pad_sequences.
maxlen = 250
# Number of reviews drawn by next_batch for each training step.
batch_size = 32

In [4]:
# Load the IMDB reviews as sequences of word ids, restricted to a
# vocabulary of `max_features` ids.
print('Loading data...')
train_split, test_split = imdb.load_data(num_words=max_features)
(input_train, y_train), (input_test, y_test) = train_split, test_split

Loading data...


## Preprocessing dataset

In [5]:
# Report how many reviews each split holds before padding.
for split_name, reviews in (('train', input_train), ('test', input_test)):
    print(len(reviews), split_name + ' sequences')

# Bring every review to exactly `maxlen` word ids so the splits become
# rectangular (samples x time) arrays.
print('Pad sequences (samples x time)')
input_train, input_test = (
    sequence.pad_sequences(reviews, maxlen=maxlen)
    for reviews in (input_train, input_test)
)

# Show the resulting array shapes for inputs and labels.
for caption, array in (('input_train shape:', input_train),
                       ('input_test shape:', input_test),
                       ('y_train', y_train),
                       ('y_test', y_test)):
    print(caption, array.shape)

25000 train sequences
25000 test sequences
Pad sequences (samples x time)
input_train shape: (25000, 250)
input_test shape: (25000, 250)
y_train (25000,)
y_test (25000,)


In [6]:
# Word ids in the loaded data are shifted by INDEX_FROM relative to the raw
# word index, leaving the low ids free for the special tokens below.
INDEX_FROM = 3
word_to_id = {
    word: rank + INDEX_FROM
    for word, rank in imdb.get_word_index().items()
}
word_to_id.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2})

# Reverse lookup used to decode id sequences back into text.
id_to_word = {token_id: word for word, token_id in word_to_id.items()}

In [7]:
# Display one padded training review both as raw ids and decoded words,
# followed by its label.
id_example = 24
rule = "-" * 10

print("TRAIN ")
print("Integers:")
print(rule)
print(input_train[id_example])
print("\n")
print("Sentences:")
print(rule)
decoded_words = [id_to_word[token_id] for token_id in input_train[id_example]]
print(' '.join(decoded_words))

print("\n")
print("TEST ")
print("Sentence train :", id_example, " value", y_train[id_example])


TRAIN 
Integers:
----------
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    1    4  204 7610   20   16   93   11 9075   19    2
 4390    6   55   52   22  849 4227  119    7 5259  961  178    6 1018
  221   20 1184    2    2   29    7  265   16  53

In [8]:
# Same inspection as the previous cell, for a different training review.
id_example = 1
encoded_review = input_train[id_example]

print("TRAIN ", "Integers:", "-"*10, sep="\n")
print(encoded_review)
print("\n")
print("Sentences:", "-"*10, sep="\n")
print(' '.join(map(id_to_word.__getitem__, encoded_review)))

print("\n")
print("TEST ")
print("Sentence train :", id_example, " value", y_train[id_example])


TRAIN 
Integers:
----------
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    1  194 1153  194 8255   78  228    5    6
 1463 4369 5012  134   26    4  715    8  118 1634   14  394   20   13
  119  954  189  102    5  207  110 3103   21   14   69  188    8   30
   23    7    4  249  126   93    4  114    9 2300 1523    5  647    4
  116    9   35 8163    4  229    9  340 1322    4  118    9    4  130
 4901   19    4 1002    5   89   29  952   46   37    4  455    9   45
   43   38 1543 1905  398    4 1649   26 6853    5  163   11 3215    2
    4 1153    9  194  775    7 8255    2  349 2637  148  605    2 8003
   15  123  125   68    2 6853   15  349  165 4362   98    5    4  228
    9   43    2 1157   15  299  120    5  120  17

In [9]:
# Number of training labels, then the label of training review 1.
print(len(y_train), y_train[1], sep="\n")

25000
0


In [10]:
# Convert the binary labels (y_train, y_test) to two-column one-hot rows.
def convert_to_multiclass(data):
    """One-hot encode a sequence of binary labels.

    Parameters
    ----------
    data : sequence or 1-D array
        Labels to encode. An entry equal to 1 maps to ``[1, 0]``; any other
        value maps to ``[0, 1]``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(data), 2)``. An empty input yields an
        array of shape ``(0, 2)`` so downstream code can rely on the second
        dimension always being present.
    """
    # Column vector of booleans so np.where broadcasts against the two-element rows.
    is_label_one = np.asarray(data).reshape(-1, 1) == 1
    return np.where(is_label_one, [1, 0], [0, 1])

# One-hot encode both label vectors, then confirm the resulting shapes.
y_train_mc, y_test_mc = (
    convert_to_multiclass(split_labels) for split_labels in (y_train, y_test)
)

for one_hot_labels in (y_train_mc, y_test_mc):
    print(one_hot_labels.shape)

(25000, 2)
(25000, 2)


In [11]:
def next_batch(batch_size, data, labels):
    """Draw a random mini-batch of aligned samples and labels.

    Rows are sampled without replacement by shuffling all indices with the
    global NumPy RNG and keeping the first ``batch_size`` of them.

    Parameters
    ----------
    batch_size : int
        Number of rows to draw. If it exceeds ``len(data)``, every row is
        returned (in shuffled order).
    data : array-like
        Samples, indexed along the first axis.
    labels : array-like
        Labels aligned with ``data`` along the first axis.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The selected samples and their labels, in the same order. Trailing
        dimensions and dtype are preserved even for an empty batch.

    Raises
    ------
    ValueError
        If ``data`` and ``labels`` do not have the same length.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    if len(data) != len(labels):
        raise ValueError(
            "data and labels must have the same length, got %d and %d"
            % (len(data), len(labels))
        )

    idx = np.arange(0, len(data))
    np.random.shuffle(idx)
    idx = idx[:batch_size]

    # Fancy indexing selects all rows in one vectorized step and keeps the
    # sample/label pairing intact.
    return data[idx], labels[idx]

# Smoke-test the sampler with the configured batch size (instead of a
# hard-coded literal) and show the shapes of one batch.
input_train_batch, y_train_batch = next_batch(batch_size, input_train, y_train_mc)
print(input_train_batch.shape)
print(y_train_batch.shape)

(32, 250)
(32, 2)


## Stacked LSTM Model

In [12]:
# The placeholders and the LSTM zero state are built with `batchSize`, while
# the training loop samples `batch_size` rows; derive one from the other so
# the two can never drift apart.
batchSize = batch_size
# Hidden units per LSTM layer.
lstmUnits = 64
# Width of the one-hot label rows / number of output logits.
numClasses = 2
# Number of training steps.
iterations = 1500
# Number of stacked LSTM layers.
num_layers = 3

In [13]:
# Graph inputs: one-hot labels and padded word-id sequences. Both carry a
# fixed batch dimension of `batchSize`.

tf.reset_default_graph()
labels = tf.placeholder(dtype=tf.float32, shape=[batchSize, numClasses])
input_data = tf.placeholder(dtype=tf.int32, shape=[batchSize, maxlen])

In [14]:
# Embedding layer: one trainable vector per vocabulary id, initialised
# uniformly between -1 and 1.
num_embeddings = 300  # size of each word vector

initial_embeddings = tf.random_uniform(shape=[max_features, num_embeddings],
                                       minval=-1.0,
                                       maxval=1.0)
embedding_matrix = tf.Variable(initial_embeddings)

# Replace every word id in the batch by its embedding vector.
embedding_layer = tf.nn.embedding_lookup(params=embedding_matrix, ids=input_data)
print(embedding_layer.get_shape())

(32, 250, 300)


In [15]:
def make_rnn_LSTMcell(lstm_size, output_keep_prob=0.75):
    """Build one LSTM cell whose outputs pass through dropout.

    Parameters
    ----------
    lstm_size : int
        Number of units in the LSTM cell.
    output_keep_prob : float or scalar tensor, optional
        Probability of keeping each output unit, forwarded to
        ``DropoutWrapper``. Defaults to 0.75. With a Python float the
        dropout is baked into the graph and stays active at evaluation time;
        pass a placeholder here to be able to feed 1.0 when evaluating.

    Returns
    -------
    The dropout-wrapped LSTM cell. Its ``state_size`` is printed as a
    side effect.
    """
    cell = tf.contrib.rnn.LSTMCell(lstm_size, state_is_tuple=True)
    cell = tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=output_keep_prob)
    print(cell.state_size)
    return cell

In [16]:
# Stack `num_layers` dropout-wrapped LSTM cells into one multi-layer cell.
lstmCell = tf.contrib.rnn.MultiRNNCell(cells=[make_rnn_LSTMcell(lstmUnits) for _ in range(num_layers)],
                                       state_is_tuple=True)
print(lstmCell.state_size)

# Disabled alternative, kept as comments so the cell does not echo a stray
# string as its output: feed the initial state through a placeholder instead
# of using the cell's zero state.
#
# state_placeholder = tf.placeholder(tf.float32, [num_layers, 2, batchSize, lstmUnits])
# l_unstack = tf.unstack(state_placeholder, axis=0)
# rnn_tuple_state = tuple([tf.nn.rnn_cell.LSTMStateTuple(
#     l_unstack[idx][0], l_unstack[idx][1])
#     for idx in range(num_layers)])
# print(rnn_tuple_state)

LSTMStateTuple(c=64, h=64)
LSTMStateTuple(c=64, h=64)
LSTMStateTuple(c=64, h=64)
(LSTMStateTuple(c=64, h=64), LSTMStateTuple(c=64, h=64), LSTMStateTuple(c=64, h=64))
(LSTMStateTuple(c=<tf.Tensor 'strided_slice:0' shape=(32, 64) dtype=float32>, h=<tf.Tensor 'strided_slice_1:0' shape=(32, 64) dtype=float32>), LSTMStateTuple(c=<tf.Tensor 'strided_slice_2:0' shape=(32, 64) dtype=float32>, h=<tf.Tensor 'strided_slice_3:0' shape=(32, 64) dtype=float32>), LSTMStateTuple(c=<tf.Tensor 'strided_slice_4:0' shape=(32, 64) dtype=float32>, h=<tf.Tensor 'strided_slice_5:0' shape=(32, 64) dtype=float32>))


In [17]:
# All-zero initial state for the stacked cell, sized for one batch.
zero_state = lstmCell.zero_state(batch_size=batchSize, dtype=tf.float32)

In [18]:
# Unroll the stacked LSTM over the embedded sequences, starting from the
# zero state. Only the per-step outputs are kept; the final state is discarded.
encode_lstm, _ = tf.nn.dynamic_rnn(
    cell=lstmCell,
    inputs=embedding_layer,
    initial_state=zero_state,
    dtype=tf.float32,
)


In [19]:
# Linear output layer mapping the last LSTM output to class logits.
weight = tf.Variable(tf.truncated_normal(shape=[lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))

# Move the time axis to the front so the final step can be gathered along axis 0.
value = tf.transpose(encode_lstm, perm=[1, 0, 2])
last_step = int(value.get_shape()[0]) - 1
last = tf.gather(value, last_step)

prediction = tf.matmul(last, weight) + bias

In [20]:
# Accuracy: fraction of rows whose highest logit matches the one-hot label.
predicted_class = tf.argmax(prediction, axis=1)
true_class = tf.argmax(labels, axis=1)
correctPred = tf.equal(predicted_class, true_class)
accuracy = tf.reduce_mean(tf.cast(correctPred, dtype=tf.float32))

In [21]:
# Softmax cross-entropy averaged over the batch, minimised with Adam
# using its default settings.

per_example_loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels,
                                                           logits=prediction)
loss = tf.reduce_mean(per_example_loss)
optimizer = tf.train.AdamOptimizer().minimize(loss)

## Train

In [22]:
# Scalar summaries for TensorBoard, merged into a single op.
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
# One log directory per run, named by a YYYYMMDD-HHMMSS timestamp. The day
# field must be %d; %s is a non-portable "seconds since the epoch" directive.
# NOTE(review): the root below is an absolute, machine-specific path; consider
# moving it to a configurable location.
logdir = "/home/marchelo/MarcheloBragagnini/100DaysOfMLCode/tensorboard" + "/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"

In [23]:
# Training session: initialise variables, then run `iterations` optimisation
# steps, logging to TensorBoard every 50 steps and checkpointing every 500.
sess = tf.InteractiveSession()
saver = tf.train.Saver()

sess.run(tf.global_variables_initializer())
writer = tf.summary.FileWriter(logdir, graph = sess.graph)

# Make sure the checkpoint directory exists before the first save.
checkpoint_dir = "./weights_models"
tf.gfile.MakeDirs(checkpoint_dir)

train_fetches = [optimizer, loss, accuracy]

for i in range(iterations):
    # Every batch holds unrelated, randomly drawn reviews, so no recurrent
    # state is carried between steps: the graph's own all-zero initial state
    # is used each time.
    input_batch, label_batch = next_batch(batch_size, input_train, y_train_mc)
    feed_dict = {input_data: input_batch, labels: label_batch}

    if (i%50 == 0):
        # Fetch the summaries in the same run as the training step so the
        # logged values are the ones printed (same forward pass, same
        # dropout mask) and no second pass is needed.
        _, l, acc, summary = sess.run(train_fetches + [merged],
                                      feed_dict = feed_dict)
        print("Iteration %d: loss %f , acc: %f"%(i, l, acc))
        writer.add_summary(summary, i)
    else:
        _, l, acc = sess.run(train_fetches, feed_dict = feed_dict)

    if(i % 500 == 0 and i != 0):
        save_path = saver.save(sess,
                               "./weights_models/pretrained_lstm_SA.ckpt",
                                global_step = i)
        print("saved to %s" % save_path)

# Push any buffered summaries to disk; the writer stays usable afterwards.
writer.flush()
        

Iteration 0: loss 0.764886 , acc: 0.406250
Iteration 50: loss 0.638540 , acc: 0.718750
Iteration 100: loss 0.567478 , acc: 0.750000
Iteration 150: loss 0.519135 , acc: 0.812500
Iteration 200: loss 0.547022 , acc: 0.687500
Iteration 250: loss 0.537737 , acc: 0.781250
Iteration 300: loss 0.608097 , acc: 0.625000
Iteration 350: loss 0.271229 , acc: 0.906250
Iteration 400: loss 0.371059 , acc: 0.781250
Iteration 450: loss 0.573897 , acc: 0.812500
Iteration 500: loss 0.287371 , acc: 0.875000
saved to ./weights_models/pretrained_lstm_SA.ckpt-500
Iteration 550: loss 0.494807 , acc: 0.781250
Iteration 600: loss 0.305760 , acc: 0.875000
Iteration 650: loss 0.167991 , acc: 0.937500
Iteration 700: loss 0.247390 , acc: 0.937500
Iteration 750: loss 0.347855 , acc: 0.875000
Iteration 800: loss 0.210029 , acc: 0.937500
Iteration 850: loss 0.520561 , acc: 0.812500
Iteration 900: loss 0.299811 , acc: 0.875000
Iteration 950: loss 0.257366 , acc: 0.875000
Iteration 1000: loss 0.271144 , acc: 0.875000
sav

#### References
* https://www.oreilly.com/learning/perform-sentiment-analysis-with-lstms-using-tensorflow
* https://web.stanford.edu/class/cs20si/2017/lectures/notes_04.pdf
* https://www.samyzaf.com/ML/imdb/imdb.html
* https://keras.io/datasets/
* https://stackoverflow.com/questions/48372994/multirnn-and-static-rnn-error-dimensions-must-be-equal-but-are-256-and-129?rq=1