In [1]:
import numpy as np
import tensorflow as tf
# The vocabulary file holds a numpy array whose entries are UTF-8 bytes; decode
# each entry so that wordsList is a plain Python list of str.
wordsList = [word.decode('UTF-8') for word in np.load('wordsList.npy').tolist()]
# Embedding matrix that the embedding lookups further down index by word id.
wordVectors = np.load('wordVectors.npy')

In [2]:
# Pre-computed matrix of word ids. The batch helpers defined later copy single
# rows of it into (batchSize, maxSeqLength) arrays, so each row presumably holds
# one padded document of maxSeqLength ids -- TODO confirm against the file.
ids = np.load('idsMatrix.npy')

In [3]:
def turn_sentence_to_indices(sentence, max_length=250, vocabulary=None):
    """Map a tokenised sentence to a fixed-length vector of vocabulary ids.

    Parameters
    ----------
    sentence : sequence of str
        The words of the sentence, in order. Tokens beyond ``max_length`` are
        ignored (previously they raised an IndexError from inside the handler).
    max_length : int, optional
        Length of the returned vector. Defaults to 250 because the TensorFlow
        model needs a fixed input dimension.
    vocabulary : list of str, optional
        Word list to look the tokens up in. Defaults to the notebook-level
        ``wordsList``.

    Returns
    -------
    numpy.ndarray
        ``int32`` array of shape ``(max_length,)``. Position ``i`` holds the
        index of the first occurrence of ``sentence[i]`` in the vocabulary;
        unknown words and unused trailing positions are 0.
    """
    if vocabulary is None:
        vocabulary = wordsList
    indices = np.zeros(max_length, dtype='int32')
    for i, word in enumerate(sentence[:max_length]):
        try:
            indices[i] = vocabulary.index(word)
        except ValueError:
            # list.index raises ValueError for a word that is not in the
            # vocabulary; such words keep the default id 0.
            pass
    return indices

In [4]:
# Tokenised sample sentence used by the following cells.
example = np.array(['the', 'share', 'price', 'went', 'up'])
# Other sentences to try in place of the one above:
#   np.array(['the', 'stock', 'market', 'was', 'rising'])
#   np.array(['the', 'soccer', 'game', 'ended', 'in', 'a', 'draw'])
example_ids = turn_sentence_to_indices(example)
print(example_ids)

[201534    593    626    388     60      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      

In [5]:
import tensorflow as tf
maxSeqLength = 250  # Maximum length of sentence
numDimensions = 50  # Dimensions for each word vector
# Note: this rebinds `example` from the array of words to its vector of ids.
example = turn_sentence_to_indices(example)

# Look up the word vector for every id and print the resulting matrix.
with tf.Session() as sess:
    embedded_example = tf.nn.embedding_lookup(wordVectors, example)
    print(embedded_example.eval())

[[ 0.41800001  0.24968    -0.41242    ..., -0.18411    -0.11514    -0.78580999]
 [ 0.39412001  0.23183     0.68751001 ...,  0.57809001  0.25825    -0.1166    ]
 [-0.44953999  0.11784     0.65070999 ...,  0.45262     0.40169001
   0.67246997]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]


### Training the model requires some definitions and helper functions. These types of models are trained by creating a batch of tweets, feeding it to the model, and running the model for a fixed number of iterations.

In [6]:
from random import randint

def getTrainBatch():
    """Draw a random training batch from ``ids``.

    Even batch positions take a random row with index 0..11498 and the label
    ``[1, 0]``; odd positions take a random row with index 13498..24998 and the
    label ``[0, 1]``.

    Returns
    -------
    tuple
        ``(arr, labels)``: a float array of shape ``(batchSize, maxSeqLength)``
        and a list of ``batchSize`` two-element label lists.
    """
    batch = np.zeros([batchSize, maxSeqLength])
    batch_labels = []
    for row in range(batchSize):
        if row % 2:
            num = randint(13499, 24999)
            one_hot = [0, 1]
        else:
            num = randint(1, 11499)
            one_hot = [1, 0]
        batch_labels.append(one_hot)
        # `num` is 1-based: the one-row slice ids[num-1:num] is broadcast into the row.
        batch[row] = ids[num - 1:num]
    return batch, batch_labels

def getTestBatch():
    """Draw a random test batch from the rows of ``ids`` with index 11498..13498.

    A draw ``num <= 12499`` is labelled ``[1, 0]``, any larger draw ``[0, 1]``.

    Returns
    -------
    tuple
        ``(arr, labels)``: a float array of shape ``(batchSize, maxSeqLength)``
        and a list of ``batchSize`` two-element label lists.
    """
    batch = np.zeros([batchSize, maxSeqLength])
    batch_labels = []
    for row in range(batchSize):
        num = randint(11499, 13499)
        batch_labels.append([0, 1] if num > 12499 else [1, 0])
        # `num` is 1-based: the one-row slice ids[num-1:num] is broadcast into the row.
        batch[row] = ids[num - 1:num]
    return batch, batch_labels

### Parameters

In [7]:
# Number of examples per batch; also fixes the first dimension of the placeholders.
batchSize = 24
# Number of units passed to the LSTM cell.
lstmUnits = 64
# Width of the label vectors and of the model output.
numClasses = 2
# Number of training steps run by the training loop.
iterations = 100000
# Size of one word vector (repeats the value set in the embedding demo cell).
numDimensions = 50
# Fixed number of word ids per example (repeats the value set in the embedding demo cell).
maxSeqLength = 250

### Placeholders for labels and input_data

In [8]:
import tensorflow as tf
# Clear the default graph before the model is (re)built in the cells below.
tf.reset_default_graph()

# Two-column label vectors, one row per example in the batch.
labels = tf.placeholder(dtype=tf.float32, shape=[batchSize, numClasses])
# Word ids, one row of maxSeqLength ids per example in the batch.
input_data = tf.placeholder(dtype=tf.int32, shape=[batchSize, maxSeqLength])

#### First define what the data looks like (its shape), then apply the embedding lookup to turn the word ids into word vectors

In [9]:
# Declares the intended shape of the model input: (batchSize, maxSeqLength, numDimensions).
# NOTE(review): the name `data` is rebound two statements below, so nothing later in
# this cell refers to this Variable; it only documents the shape. It is presumably
# still registered in the graph -- confirm before removing it, since that would shift
# the auto-generated names of the variables created afterwards.
data = tf.Variable(tf.zeros([batchSize, maxSeqLength, numDimensions]),dtype=tf.float32)
print(data)
# Replace every word id in input_data by its row of wordVectors.
data = tf.nn.embedding_lookup(wordVectors,input_data)
print(data)

<tf.Variable 'Variable:0' shape=(24, 250, 50) dtype=float32_ref>
Tensor("embedding_lookup:0", shape=(24, 250, 50), dtype=float32)


#### Create the LSTM cell based on the number of units we defined. Wrap it in a DropoutWrapper to throw some information away, which helps to prevent overfitting. You can play with the keep probability. After that, run the RNN.

In [10]:
# Basic LSTM cell with lstmUnits units.
lstmCell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)
# Drop part of the cell output (keep probability 0.75).
# NOTE(review): the keep probability is a constant, so the same dropout appears to be
# applied whenever this graph is run, including the prediction cell -- confirm whether
# that is intended.
lstmCell = tf.contrib.rnn.DropoutWrapper(cell=lstmCell, output_keep_prob=0.75)
# Unroll the cell over the embedded input; only the outputs are kept, the final state is discarded.
value, _ = tf.nn.dynamic_rnn(lstmCell, data, dtype=tf.float32)

#### The output of the RNN is used to create the final output: the last output vector is multiplied by a weight matrix and a bias is added. The initial bias value can be changed; both the weights and the bias are optimized during training.

In [11]:
# Output layer parameters: a [lstmUnits, numClasses] weight matrix initialised from a
# truncated normal distribution and a bias initialised to 0.1.
weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
# Swap the first two axes of the RNN output so that axis 0 can be indexed below.
value = tf.transpose(value, [1, 0, 2])
# Take the entry at the last index of axis 0, i.e. the RNN output at the final position
# of every sequence (assumes dynamic_rnn returned batch-major output -- TODO confirm).
last = tf.gather(value, int(value.get_shape()[0]) - 1)
# Unnormalised class scores; they are passed as logits to the loss further down.
prediction = (tf.matmul(last, weight) + bias)

#### Some metrics

In [12]:
# Compare the highest-scoring column of the model output with that of the labels.
predicted_class = tf.argmax(prediction, axis=1)
true_class = tf.argmax(labels, axis=1)
correctPred = tf.equal(predicted_class, true_class)
# Fraction of the batch for which the two agree.
accuracy = tf.reduce_mean(tf.cast(correctPred, tf.float32))

#### The AdamOptimizer is another parameter that you can change to an optimizer of your own choice.

In [13]:
# Softmax cross-entropy per example, averaged over the batch.
per_example_loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=prediction)
loss = tf.reduce_mean(per_example_loss)
# Training op: one Adam step (default settings) on the mean loss.
optimizer = tf.train.AdamOptimizer().minimize(loss)

### Tensorboard

In [14]:
import datetime

# Scalar summaries for the two metrics, merged into a single op for the training loop.
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
# One timestamped sub-directory per run, so runs show up separately in TensorBoard.
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
# Log the current default graph. The training session is only created in the next
# cell, so `sess` would still be the already-closed session of the embedding demo
# here, and its graph predates tf.reset_default_graph().
writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [None]:
 # Train the model: open a session, initialise all variables, then run `iterations`
 # optimisation steps, logging summaries every 50 steps and saving a checkpoint
 # every 10,000 steps.
 sess = tf.InteractiveSession()
 saver = tf.train.Saver()
 sess.run(tf.global_variables_initializer())

 try:
     for i in range(iterations):
         #Next Batch of reviews
         nextBatch, nextBatchLabels = getTrainBatch()
         sess.run(optimizer, {input_data: nextBatch, labels: nextBatchLabels})

         #Write summary to Tensorboard
         if (i % 50 == 0):
             summary = sess.run(merged, {input_data: nextBatch, labels: nextBatchLabels})
             writer.add_summary(summary, i)

         #Save the network every 10,000 training iterations
         if (i % 10000 == 0 and i != 0):
             save_path = saver.save(sess, "models_demo/pretrained_lstm.ckpt", global_step=i)
             print("saved to %s" % save_path)
 finally:
     # Close the writer even when the long-running loop is interrupted or fails,
     # so the summaries written so far are flushed to disk.
     writer.close()

### Look at the progress on Tensorboard!

tensorboard --logdir=tensorboard

## Prediction Sample Twitter data

In [23]:
# Pre-computed word-id rows for the sample tweets; getSampleBatch() draws rows 0..99 from it.
tweets = np.load('tweet_indices.npy')

In [24]:
from random import randint

def getSampleBatch():
    """Draw ``batchSize`` random rows (index 0..99) from ``tweets``.

    Returns
    -------
    tuple
        ``(indices, labels)``: a float array of shape
        ``(batchSize, maxSeqLength)`` holding the drawn rows, and a list of
        ``batchSize`` placeholder labels ``[0, 0]``.
    """
    picks = [randint(0, 99) for _ in range(batchSize)]
    dummy_labels = [[0, 0] for _ in picks]
    batch = np.zeros([batchSize, maxSeqLength])
    for row, pick in enumerate(picks):
        batch[row] = tweets[pick]
    return batch, dummy_labels

#### Only 1 iteration as an example

In [25]:
# Run the model on one random batch of sample tweets and keep the raw scores in `preds`.
# Dedicated names are used on purpose: rebinding `iterations` would overwrite the
# training configuration and rebinding `labels` would replace the label placeholder
# with a Python list, so the training cell could no longer be re-run afterwards.
sampleIterations = 1
for _ in range(sampleIterations):
    # The labels returned for the sample tweets are not needed for prediction.
    sampleIndices, _sampleLabels = getSampleBatch()
    preds = sess.run(prediction, {input_data: sampleIndices})

#### This is what the outcome looks like

In [29]:
preds

array([[  1.26244873e-01,   1.87414423e-01],
       [  5.54572195e-02,   1.63521454e-01],
       [  1.17820948e-01,   1.53852731e-01],
       [  7.95937106e-02,   1.68389797e-01],
       [  9.86097679e-02,   1.49592161e-01],
       [  8.80320221e-02,   1.72856599e-01],
       [ -9.98602062e-03,   1.10089496e-01],
       [ -2.63513252e-02,   1.38194278e-01],
       [  3.00383195e-02,   8.50876644e-02],
       [  5.29506132e-02,   1.05726972e-01],
       [  1.40431747e-02,   1.85552105e-01],
       [  4.85095382e-02,   1.57034233e-01],
       [  1.19612157e-01,   1.80749416e-01],
       [  1.91806033e-02,   9.29736197e-02],
       [  2.34203860e-02,   1.41861677e-01],
       [  4.81073074e-02,   1.57592028e-01],
       [  2.27466226e-05,   1.41175926e-01],
       [  1.85338631e-02,   1.46102592e-01],
       [  2.45055705e-02,   1.10755980e-01],
       [ -2.60366499e-03,   1.10157348e-01],
       [  1.29050761e-01,   1.35097295e-01],
       [  3.87021862e-02,   1.46510750e-01],
       [  

#### Convert it to a sentiment: 0 is negative and 1 is positive

In [30]:
# Turn each row of scores into a sentiment flag: 1 when the first column holds the
# row maximum, otherwise 0. The per-row result is kept in `sentiment` rather than
# `prediction`, so the model's output tensor of that name is not overwritten and the
# prediction cell can be run again.
predictions = []
for row in preds:
    max_pred = max(row)
    print(max_pred)
    sentiment = 1 if max_pred == row[0] else 0
    predictions.append(sentiment)
print("The predictions of this sample are:", predictions)

0.187414
0.163521
0.153853
0.16839
0.149592
0.172857
0.110089
0.138194
0.0850877
0.105727
0.185552
0.157034
0.180749
0.0929736
0.141862
0.157592
0.141176
0.146103
0.110756
0.110157
0.135097
0.146511
0.153483
0.163533
The predictions of this sample are: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
