### Model training

In [1]:
# import libraries
import numpy as np
import tensorflow as tf
import datetime

In [2]:
# check working dir
import os
print(os.getcwd())

/Users/olafdeleeuw/Desktop/ODSC/Project/ODSC-London-2018/notebooks


In [3]:
# load words and vectors
words = np.load('words.npy')
words = words.tolist()
vectors = np.load('vectors.npy')

#### Train and testdata
split the set randomly in a train and test set

In [4]:
# load positive and negative word indices
data_pos = np.load("indices_pos_wl2.npy")
data_neg = np.load("indices_neg_wl2.npy")

In [5]:
# first shuffle the sets to get a real random split
# set seed
np.random.seed(0)

np.random.shuffle(data_pos)
np.random.shuffle(data_neg)

traindata_pos, testdata_pos = data_pos[:int(0.8 * len(data_pos)), :], data_pos[int(0.8 * len(data_pos)):, :]
traindata_neg, testdata_neg = data_neg[:int(0.8 * len(data_neg)), :], data_neg[int(0.8 * len(data_neg)):, :]

In [6]:
print('the shape of train data positive is: ' + str(traindata_pos.shape))
print('the shape of train data negative is: ' + str(traindata_neg.shape))
print('the shape of train data positive is: ' + str(testdata_pos.shape))
print('the shape of train data negative is: ' + str(testdata_neg.shape))

the shape of train data positive is: (640000, 75)
the shape of train data negative is: (640000, 75)
the shape of train data positive is: (160000, 75)
the shape of train data negative is: (160000, 75)


#### Example

In [7]:
def turn_sentence_to_indices(sentence):
    indices = np.zeros(75, dtype='int32')  # length 250 because the Tensorflow model needs a fixed dimension
    for i in range(0,len(sentence)):
        try:
            indices[i] = words.index(sentence[i])
        except:
            indices[i] = 0
    return(indices)

In [8]:
example = np.array(['the', 'share', 'price', 'went', 'up'])
# example2 = np.array(['the', 'stock', 'market', 'was', 'rising'])
# example3 = np.array(['the', 'soccer', 'game', 'ended', 'in', 'a', 'draw'])
print(turn_sentence_to_indices(example))
# print(turn_sentence_to_indices(example2))
# print(turn_sentence_to_indices(example3))

[  1 594 627 389  61   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0]


In [9]:
example = turn_sentence_to_indices(example)

with tf.Session() as sess:
    print(tf.nn.embedding_lookup(vectors,example).eval())

[[ 0.41800001  0.24968    -0.41242    ..., -0.18411    -0.11514    -0.78580999]
 [ 0.39412001  0.23183     0.68751001 ...,  0.57809001  0.25825    -0.1166    ]
 [-0.44953999  0.11784     0.65070999 ...,  0.45262     0.40169001
   0.67246997]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]


#### Helper functions

A function to create train and test batches of an equal number of positive and negative tweets

In [12]:
# get subset
train_sub_pos = traindata_pos[np.random.choice(traindata_pos.shape[0], 25000, replace=False)]
train_sub_neg = traindata_neg[np.random.choice(traindata_neg.shape[0], 25000, replace=False)]
test_sub_pos = testdata_pos[np.random.choice(testdata_pos.shape[0], 25000, replace=False)]
test_sub_neg = testdata_neg[np.random.choice(testdata_neg.shape[0], 25000, replace=False)]

In [23]:
train_sub_neg.shape

(25000, 75)

In [14]:
from random import randint

def CreateBatch(dataset, batch_size, max_length):
    labels = []
    vector = np.zeros([batch_size, max_length])
    
    if dataset == 'train':
        positive_tweets = train_sub_pos
        negative_tweets = train_sub_neg
    elif dataset == 'test':
        positive_tweets = test_sub_pos
        negative_tweets = test_sub_neg
    else:
        pass
    
    for i in range(batch_size):
        if (i % 2 == 0): 
            num = randint(0,len(positive_tweets)-1)
            labels.append([1,0])
            vector[i] = positive_tweets[num]
        else:
            num = randint(0,len(negative_tweets)-1)
            labels.append([0,1])
            vector[i] = negative_tweets[num]
    return vector, labels

#### Hyperparameters
The performance of the recurrent neural network with LSTM depends on many paramaters. You have to choose, for example, the number of LSTM untis in your network, the size of the batches you feed to you model every iteration and the optimizer function.

In [15]:
# hyperparameters
batch_size = 64                          # size of the batch you feed to your model every iteration
lstm_units = 64                           # number of LSTM units
output_class = 2                          # possible outcomes (positive or negative)
iterations = 100000
max_length = 75                           # each tweet has a different length. However, RNN requires a fixed length.
vector_dim = 50                           # dimension of the word vector
optimizer_algorithm = 'Adam'   # possibilities are for example Adam, StochasticGradientDescent
# learning_rate = 0.001

#### Tensorflow model setup

In [16]:
#import tensorflow as tf
tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batch_size, output_class])
input_data = tf.placeholder(tf.int32, [batch_size, max_length])

#### First define what the data looks like, its shape and then apply the vector function

In [17]:
data = tf.Variable(tf.zeros([batch_size, max_length, vector_dim]),dtype=tf.float32)
print(data)
data = tf.nn.embedding_lookup(vectors,input_data)
print(data)

<tf.Variable 'Variable:0' shape=(64, 75, 50) dtype=float32_ref>
Tensor("embedding_lookup:0", shape=(64, 75, 50), dtype=float32)


#### LSTM Cells
Create the LSTM cells based on the number of units we defined. Use a dropoutWrapper to throw some information away. This helps you to prevent from overfitting. You can play with the keep_prob. After that run a RNN.

In [18]:
lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_units)
lstm_cell = tf.contrib.rnn.DropoutWrapper(cell=lstm_cell, output_keep_prob=0.75)
value, _ = tf.nn.dynamic_rnn(lstm_cell, data, dtype=tf.float32)

#### Weigth and bias
The output of the RNN is use to create the final output. It is a vector that will be multiplied by a weight matrix and some bias is added. The bias can be changed or optimized.

In [19]:
# define intitial weight and bias
weight = tf.Variable(tf.truncated_normal([lstm_units, output_class]))
bias = tf.Variable(tf.constant(0.1, shape=[output_class]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)  # get the correct value from the output
prediction = (tf.matmul(last, weight) + bias)  # calculate the prediction, matmul is just a MATrix MULtiplication.

#### Accuracy

In [20]:
correct_predictions = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
accuracy = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))

#### Optimizer

In [21]:
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))

# Optimizer based on the choice of the hyperparameters
if optimizer_algorithm == 'Adam':
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(loss)
elif optimizer_algorithm == 'GradientDescent':
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

# optimizer = tf.train.AdamOptimizer(learning_rate = 0.001).minimize(loss)

NameError: name 'learning_rate' is not defined

In [31]:
optimizer

<tf.Operation 'Adam' type=NoOp>

#### Tensorboard
While training your model you can view the progress, including the accuracy and loss, on Tensorboard. Simply navigate to the folder where your notebook is running and run the command *tensorboard --logdir=tensorboard*

In [32]:
# define the variables you want to show on Tensorboard
tf.summary.scalar('Loss', loss)
tf.summary.scalar('Accuracy', accuracy)
merged = tf.summary.merge_all()
logdir = "tensorboard/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
writer = tf.summary.FileWriter(logdir, sess.graph)

### Training!

In [33]:
sess = tf.InteractiveSession()
saver = tf.train.Saver()
sess.run(tf.global_variables_initializer())

for i in range(iterations):
    #Next Batch of reviews
    batch_data, batch_labels = CreateBatch('train', batch_size, max_length);
    sess.run(optimizer, {input_data: batch_data, labels: batch_labels})
   
    #Write summary to Tensorboard
    if (i % 50 == 0):
        summary = sess.run(merged, {input_data: batch_data, labels: batch_labels})
        writer.add_summary(summary, i)

    #Save the network every 10,000 training iterations
    if (i % 10000 == 0 and i != 0):
        save_path = saver.save(sess, "models/adam_bs64_lstm64_subset_v2/trained_adam_bs64_lstm64_subset_v2.ckpt", global_step=i)
        print("saved to %s" % save_path)
writer.close()

saved to models/adam_bs64_lstm64_subset_v2/trained_adam_bs64_lstm64_subset_v2.ckpt-10000
saved to models/adam_bs64_lstm64_subset_v2/trained_adam_bs64_lstm64_subset_v2.ckpt-20000


KeyboardInterrupt: 

##### Now it's running! Follow the progress on Tensorboard. Run in te command line 'tensorboard --logdir=tensorboard' and go to localhost:6006

#### Restore the train session

In [22]:
sess = tf.InteractiveSession()
saver = tf.train.Saver()
saver.restore(sess, tf.train.latest_checkpoint('models/adam_bs64_lstm64_subset'))

INFO:tensorflow:Restoring parameters from models/adam_bs64_lstm64_subset/trained_adam_bs64_lstm64_subset.ckpt-90000


#### Apply to test set

In [None]:
iterations = 10000
accuracy = []
for i in range(iterations):
    batch_data, batch_labels = CreateBatch('test', batch_size, max_length);
    accuracy.append((sess.run(accuracy, {input_data: batch_data, labels: batch_labels})) * 100)