In [33]:
import os

import numpy as np
import tensorflow as tf

###### Get tensorflow version and check if GPUs are working

In [34]:

# Report which TensorFlow build is loaded and whether it can use a CUDA GPU.
environment_report = (
    ("The tensorflow version being used is: {}", tf.__version__),
    ("Is there a GPU currently accessible?: {}", tf.test.is_gpu_available()),
    ("Is the environment build using Cuda?: {}", tf.test.is_built_with_cuda()),
)
for template, value in environment_report:
    print(template.format(value))

The tensorflow version being used is: 1.8.0
Is there a GPU currently accessible?: True
Is the environment build using Cuda?: True


In [35]:
# Read the whole review corpus and the matching label file into memory as text.
with open('reviews.txt', 'r') as reviews_file:
    reviews = reviews_file.read()
with open('labels.txt', 'r') as labels_file:
    labels = labels_file.read()

##### Sample some reviews


In [36]:


# Preview the first 1000 characters of the raw review text.
reviews[:1000]

'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   \nstory of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is tu

##### Preprocess the data


In [37]:


from string import punctuation

# Remove every ASCII punctuation character in a single pass. str.translate
# does the filtering in C rather than testing each character in a Python loop.
text = reviews.translate(str.maketrans('', '', punctuation))
# The file holds one review per line.
# NOTE: `reviews` changes from a str to a list of str here, so re-running this
# cell without re-loading the file will not give the same result.
reviews = text.split('\n')

# Join with a space so the last word of one review can never fuse with the
# first word of the next; a fused token would enter the vocabulary while the
# real words might be missing from it.
text = ' '.join(reviews)
word = text.split()

In [38]:
# Preview the first 1000 characters of the corpus after punctuation removal.
text[:1000]

'bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   story of a man who has unnatural feelings for a pig  starts out with a opening scene that is a terrific example of absurd comedy  a formal orchestra audience is turned into an insane  violent mo

In [39]:
# First ten tokens of the whitespace-split corpus.
word[:10]

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the']

##### Next, encode the words as integers so they can be passed through a word embedding into the neural network for training.
##### To do this, build a dictionary that maps each word in the vocabulary to an integer.

In [40]:
from collections import Counter

# Rank the vocabulary by frequency, most common word first.
counts = Counter(word)
vocab = sorted(counts, key=counts.get, reverse=True)
# Ids start at 1, so 0 never stands for a real word.
vocab_to_int = {token: idx for idx, token in enumerate(vocab, 1)}
# Encode each review as the list of ids of its whitespace-separated tokens.
reviews_intergers = [
    [vocab_to_int[token] for token in review.split()]
    for review in reviews
]

In [41]:
# Show the second review (index 1) as its list of integer word ids.
reviews_intergers[1:2]

[[63,
  4,
  3,
  125,
  36,
  47,
  7518,
  1398,
  16,
  3,
  4204,
  505,
  45,
  17,
  3,
  622,
  134,
  12,
  6,
  3,
  1279,
  457,
  4,
  1721,
  207,
  3,
  10852,
  7377,
  300,
  6,
  667,
  83,
  35,
  2118,
  1088,
  3004,
  34,
  1,
  900,
  54368,
  4,
  8,
  13,
  5106,
  464,
  8,
  2661,
  1721,
  1,
  221,
  57,
  17,
  58,
  794,
  1297,
  833,
  228,
  8,
  43,
  98,
  123,
  1470,
  59,
  147,
  38,
  1,
  963,
  142,
  29,
  667,
  123,
  1,
  13917,
  410,
  61,
  95,
  1775,
  306,
  756,
  5,
  3,
  819,
  10428,
  22,
  3,
  1726,
  635,
  8,
  13,
  128,
  73,
  21,
  233,
  102,
  17,
  49,
  50,
  617,
  34,
  683,
  85,
  30593,
  31078,
  683,
  374,
  3342,
  11672,
  2,
  16387,
  8023,
  51,
  29,
  108,
  3324]]

##### Now need to encode the labels for positive and negative reviews using the label dataset

In [42]:
# One label per line: 'positive' becomes 1, anything else (including a blank line) becomes 0.
labels = np.array([int(tag == 'positive') for tag in labels.split('\n')])

In [43]:
# Histogram of review lengths: key = number of tokens, value = number of reviews.
review_len = Counter(map(len, reviews_intergers))
zero_length_count = review_len[0]
longest_review = max(review_len)
print("Zero-length reviews: {}".format(zero_length_count))
print("Maximum review length: {}".format(longest_review))

Zero-length reviews: 1
Maximum review length: 2514


##### The zero-length review needs to be removed to clean the dataset before training

In [44]:
# Positions of every encoded review that contains at least one word id.
non_zero_idx = [
    position
    for position, encoded in enumerate(reviews_intergers)
    if len(encoded) > 0
]
len(non_zero_idx)

25000

In [45]:
# Filter reviews and labels with the same index list so they stay aligned.
kept_reviews, kept_labels = [], []
for keep in non_zero_idx:
    kept_reviews.append(reviews_intergers[keep])
    kept_labels.append(labels[keep])
reviews_intergers = kept_reviews
labels = np.array(kept_labels)

In [46]:
# Show only the first two encoded reviews; displaying the full list would
# write every review in the dataset into the notebook output.
reviews_intergers[:2]

[[21696,
  308,
  6,
  3,
  1051,
  207,
  8,
  2138,
  32,
  1,
  171,
  57,
  15,
  49,
  81,
  5833,
  44,
  382,
  110,
  140,
  15,
  5221,
  60,
  154,
  9,
  1,
  5019,
  5882,
  475,
  71,
  5,
  260,
  12,
  21696,
  308,
  13,
  1981,
  6,
  74,
  2397,
  5,
  613,
  73,
  6,
  5221,
  1,
  25313,
  5,
  1987,
  10168,
  1,
  5815,
  1499,
  36,
  51,
  66,
  204,
  145,
  67,
  1200,
  5221,
  19926,
  1,
  37712,
  4,
  1,
  221,
  883,
  31,
  2988,
  71,
  4,
  1,
  5788,
  10,
  686,
  2,
  67,
  1499,
  54,
  10,
  216,
  1,
  384,
  9,
  62,
  3,
  1406,
  3708,
  783,
  5,
  3489,
  180,
  1,
  382,
  10,
  1214,
  13685,
  32,
  308,
  3,
  349,
  341,
  2913,
  10,
  143,
  127,
  5,
  7738,
  30,
  4,
  129,
  5221,
  1406,
  2329,
  5,
  21696,
  308,
  10,
  528,
  12,
  109,
  1448,
  4,
  60,
  543,
  102,
  12,
  21696,
  308,
  6,
  227,
  4169,
  48,
  3,
  2212,
  12,
  8,
  215,
  23],
 [63,
  4,
  3,
  125,
  36,
  47,
  7518,
  1398,
  16,
  3,
  4204,
 

##### Now create the array of features that will be passed into the network

In [47]:
sequence_length = 200
# Every row starts as all zeros; since word ids start at 1, zero acts as padding.
features = np.zeros((len(reviews_intergers), sequence_length), dtype=int)
for i, row in enumerate(reviews_intergers):
    # Keep at most the first `sequence_length` word ids of the review.
    clipped = row[:sequence_length]
    # Skip empty reviews: with zero items the slice `-0:` would select the
    # whole row and the assignment would raise a shape error.
    if len(clipped) > 0:
        # Right-align the ids so shorter reviews are left-padded with zeros.
        features[i, -len(clipped):] = clipped

In [48]:
# First 100 columns of the first 10 feature rows; leading zeros are the left-padding.
features[:10,:100]

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 21696,   308,     6,
            3,  1051,   207,     8,  2138,    32,     1,   171,    57,
           15,    49,    81,  5833,    44,   382,   110,   140,    15,
         5221,    60,   154,     9,     1,  5019,  5882,   475,    71,
            5,   260,    12, 21696,   308,    13,  1981,     6,    74,
         2397],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     

##### Training and validation split

In [49]:
# 80% of the rows go to training; the remainder is halved into validation and test.
split_percentage = 0.8
split_idx = int(len(features) * split_percentage)
test_idx = int((len(features) - split_idx) * 0.5)

train_x, train_y = features[:split_idx], labels[:split_idx]
held_out_x, held_out_y = features[split_idx:], labels[split_idx:]
val_x, val_y = held_out_x[:test_idx], held_out_y[:test_idx]
test_x, test_y = held_out_x[test_idx:], held_out_y[test_idx:]

print("\t\t\tFeature Shapes:")
shape_report = [
    "Train set: \t\t{}".format(train_x.shape),
    "\nValidation set: \t{}".format(val_x.shape),
    "\nTest set: \t\t{}".format(test_x.shape),
]
print(" ".join(shape_report))

			Feature Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


##### Build the neural network

In [50]:
# Hyperparameters for the sentiment network.
# Number of units passed to each BasicLSTMCell.
lstm_size = 256
# Number of cells stacked inside the MultiRNNCell.
lstm_layers = 1
# Rows per batch; the RNN zero state is built for exactly this many rows.
batch_size = 500
# Step size handed to the Adam optimizer.
learning_rate = 0.001

In [51]:
# One more than the vocabulary size, because word ids start at 1 and 0 is used for padding.
n_words = len(vocab_to_int) + 1
graph = tf.Graph()
with graph.as_default():
    # Integer word ids; both dimensions are left unspecified.
    inputs = tf.placeholder(dtype=tf.int32, shape=[None, None])
    # NOTE: this rebinds `labels`, which until now held the NumPy label array.
    labels = tf.placeholder(dtype=tf.int32, shape=[None, None])
    # Scalar fed at run time to control dropout.
    keep_prob = tf.placeholder(dtype=tf.float32)


##### Add the embedding layer: it is much more efficient than one-hot encoding, and it acts as a lookup table for the model during training and testing

In [52]:
embedding_size = 300

with graph.as_default():
    # Trainable lookup table with one row of `embedding_size` values per word id,
    # initialised uniformly between -1 and 1.
    initial_embedding = tf.random_uniform(shape=(n_words, embedding_size), minval=-1, maxval=1)
    embedding = tf.Variable(initial_embedding)
    # Replace every word id in `inputs` with its embedding row.
    embed = tf.nn.embedding_lookup(params=embedding, ids=inputs)

##### Now create the neural network graph model with the LSTM cells

In [53]:
with graph.as_default():
    # Build a separate LSTM + dropout wrapper for each layer. Repeating one
    # wrapped cell with `[dropout] * lstm_layers` would place the *same* cell
    # object in every layer, which breaks as soon as lstm_layers > 1.
    layers = [
        tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.BasicLSTMCell(lstm_size),
            output_keep_prob=keep_prob)
        for _ in range(lstm_layers)
    ]
    cell = tf.contrib.rnn.MultiRNNCell(layers)
    # The zero state is built for exactly `batch_size` rows, so every batch
    # fed to the network must contain exactly that many rows.
    initial_state = cell.zero_state(batch_size, tf.float32)

    outputs, final_state = tf.nn.dynamic_rnn(cell, embed,
                                             initial_state=initial_state)

    # Only the output at the last time step feeds the single sigmoid unit.
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=tf.sigmoid)

    # Mean squared error between the 0/1 labels and the sigmoid output.
    cost = tf.losses.mean_squared_error(labels, predictions)

    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)


##### Validation rules

In [54]:
with graph.as_default():
    # Round the sigmoid output to 0 or 1 and compare it with the integer labels.
    rounded = tf.round(predictions)
    predicted_class = tf.cast(rounded, tf.int32)
    correct_pred = tf.equal(predicted_class, labels)
    # Fraction of rows in the batch that were classified correctly.
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

##### Batching the data for training

In [55]:
def batches(x,y,batch_size=100):
    """Yield consecutive ``(x, y)`` slices of ``batch_size`` items each.

    Only full batches are produced: items left over after the last full
    batch of ``x`` are dropped.
    """
    full_batches = len(x) // batch_size
    for batch_no in range(full_batches):
        start = batch_no * batch_size
        stop = start + batch_size
        yield x[start:stop], y[start:stop]

In [56]:
epochs = 10

with graph.as_default():
    saver = tf.train.Saver()

# Create the checkpoint directory up front so the save at the end of training
# cannot fail because its parent directory is missing.
os.makedirs("checkpoints", exist_ok=True)

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        # Start every epoch from the all-zero RNN state.
        state = sess.run(initial_state)

        for x, y in batches(train_x, train_y, batch_size):
            # `y[:, None]` adds a second axis so the labels are fed as a column.
            # The final state of the previous batch is fed in as the initial
            # state of this one.
            # NOTE(review): consecutive batches appear to hold unrelated reviews;
            # consider resetting the state per batch instead - confirm intent.
            feed = {inputs: x,
                    labels: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

            if iteration%10==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%50==0:
                val_acc = []
                # Evaluate the existing zero-state tensors rather than calling
                # cell.zero_state() again, which would add new ops to the graph
                # on every validation pass.
                val_state = sess.run(initial_state)
                for x, y in batches(val_x, val_y, batch_size):
                    # Dropout is disabled (keep_prob = 1) while validating.
                    feed = {inputs: x,
                            labels: y[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
    saver.save(sess, "checkpoints/sentiment.ckpt")

Epoch: 0/10 Iteration: 10 Train loss: 0.243
Epoch: 0/10 Iteration: 20 Train loss: 0.220
Epoch: 0/10 Iteration: 30 Train loss: 0.208
Epoch: 0/10 Iteration: 40 Train loss: 0.187
Epoch: 1/10 Iteration: 50 Train loss: 0.207
Val acc: 0.714
Epoch: 1/10 Iteration: 60 Train loss: 0.145
Epoch: 1/10 Iteration: 70 Train loss: 0.176
Epoch: 1/10 Iteration: 80 Train loss: 0.148
Epoch: 2/10 Iteration: 90 Train loss: 0.283
Epoch: 2/10 Iteration: 100 Train loss: 0.243
Val acc: 0.544
Epoch: 2/10 Iteration: 110 Train loss: 0.195
Epoch: 2/10 Iteration: 120 Train loss: 0.133
Epoch: 3/10 Iteration: 130 Train loss: 0.150
Epoch: 3/10 Iteration: 140 Train loss: 0.101
Epoch: 3/10 Iteration: 150 Train loss: 0.106
Val acc: 0.796
Epoch: 3/10 Iteration: 160 Train loss: 0.099
Epoch: 4/10 Iteration: 170 Train loss: 0.121
Epoch: 4/10 Iteration: 180 Train loss: 0.091
Epoch: 4/10 Iteration: 190 Train loss: 0.177
Epoch: 4/10 Iteration: 200 Train loss: 0.182
Val acc: 0.796
Epoch: 5/10 Iteration: 210 Train loss: 0.132
Epoc

In [57]:
test_acc = []
with tf.Session(graph=graph) as sess:
    # Load the most recent checkpoint written to the 'checkpoints' directory.
    saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    # Reuse the graph's existing zero-state tensors, as the training loop does,
    # instead of building a fresh set of zero-state ops.
    test_state = sess.run(initial_state)
    for x, y in batches(test_x, test_y, batch_size):
        # Dropout is disabled (keep_prob = 1) while testing.
        feed = {inputs: x,
                labels: y[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))

INFO:tensorflow:Restoring parameters from checkpoints\sentiment.ckpt
Test accuracy: 0.814


##### The training loss and validation accuracy are bouncing around and the model is having some trouble converging; it may need more work on both the amount of data and the model architecture