### Analyzing the Twitter Data

In [1]:
# import libraries
import numpy as np
import tensorflow as tf
import matplotlib
import pandas as pd

In [2]:
# load words and vectors
words = np.load('words.npy')
words = words.tolist()
vectors = np.load('vectors.npy')

#### Load the Tweet indices stored earlier

In [3]:
tweets = np.load('tweet_indices.npy')

In [4]:
len(tweets)

1000000

#### Predict all our 1 million tweets
We will make the prediction via batches with a batch size of 64. 1e6 Tweets, means about 15.625 iterations

#### First set parameters and restore the latest results from our trained Tensorflow model

In [19]:
# hyperparameters
batch_size = 64                          # size of the batch you feed to your model every iteration
pred_iterations = 15625
max_length = 75                           # each tweet has a different length. However, RNN requires a fixed length.
output_class = 2
vector_dim = 50
lstm_units = 64

In [20]:
tf.reset_default_graph()

labels = tf.placeholder(tf.float32, [batch_size, output_class])
input_data = tf.placeholder(tf.int32, [batch_size, max_length])

data = tf.Variable(tf.zeros([batch_size, max_length, vector_dim]),dtype=tf.float32)
data = tf.nn.embedding_lookup(vectors,input_data)

lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_units)
lstm_cell = tf.contrib.rnn.DropoutWrapper(cell=lstm_cell, output_keep_prob=0.25)
value, _ = tf.nn.dynamic_rnn(lstm_cell, data, dtype=tf.float32)

weight = tf.Variable(tf.truncated_normal([lstm_units, output_class]))
bias = tf.Variable(tf.constant(0.1, shape=[output_class]))
value = tf.transpose(value, [1, 0, 2])
last = tf.gather(value, int(value.get_shape()[0]) - 1)
prediction = (tf.matmul(last, weight) + bias)

sess = tf.InteractiveSession()
saver = tf.train.Saver()
saver.restore(sess, tf.train.latest_checkpoint('models/adam_bs64_lstm64_subset'))

INFO:tensorflow:Restoring parameters from models/adam_bs64_lstm64_subset/trained_adam_bs64_lstm64_subset.ckpt-90000


#### Define batch function

In [21]:
def TweetBatch(pred_iteration):
    labels = []
    indices = np.zeros([batch_size, max_length])
    for i in range(batch_size):
        j = pred_iteration * batch_size
        k = i + j
        labels.append([k,0])
        indices[i] = tweets[k]
    return indices, labels

#### Run the trained model:

In [22]:
# Run and store the predicitions
preds_total_set = []
for p in range(pred_iterations):
    tweet, labels = TweetBatch(p)
    preds_batch = (sess.run(prediction, {input_data: tweet}))
    preds_total_set.append(preds_batch)

It is stored in a numpy array per iteration. We convert it to one array

In [23]:
predictions_twitter = preds_total_set[0]
for i in range(1,len(preds_total_set)):
    prediction_batch = preds_total_set[i]
    predictions_twitter = np.concatenate((predictions_twitter, prediction_batch), axis=0)

In [25]:
# Save the predictions
np.save("predictions_twitter", predictions_twitter)

#### Calculate the sentiment of each tweet

In [27]:
sentiment = []
for p in range(0, len(predictions_twitter)):
    max_pred = max(predictions_twitter[p])
    if max_pred == predictions_twitter[p][0]:
        prediction = 1
    else:
        prediction = 0
    sentiment.append(prediction)
# print("The predictions of this sample are:", predictions)
print("The sentiment of the first 10 tweets is:", sentiment[0:10])

The sentiment of the first 10 tweets is: [0, 1, 1, 1, 0, 1, 1, 0, 1, 1]


#### Combine the list of sentiments with the dates

In [30]:
# load df with Twitter data
twitter_df = pd.read_pickle("twitter_data")

In [31]:
twitter_df.head()

Unnamed: 0,dates,timestamp,tweet
0,2018-01-22,2018-01-22T09:31:41.357Z,"""Cryptocurrency Exchange OKCoin To Launch In S..."
1,2018-01-22,2018-01-22T09:31:41.363Z,"""RT @timothychou: Can an Algorithm Tell When K..."
2,2018-01-22,2018-01-22T09:31:41.855Z,"""RT @simdaq: Let's chat. Join us on Telegram a..."
3,2018-01-22,2018-01-22T09:31:42.362Z,"""RT @mas_oyama_coin: Masutatsu Oyama crypto cu..."
4,2018-01-22,2018-01-22T09:30:59.854Z,"""U.S. Rating Agency to Issue Bitcoin and Crypt..."


In [33]:
dates = twitter_df['dates'][0:len(sentiment)].values

In [34]:
sentiment_per_day = pd.DataFrame({'dates': dates,
                                'sentiment': sentiment})

In [36]:
sentiment_per_day.head()

Unnamed: 0,dates,sentiment
0,2018-01-22,0
1,2018-01-22,1
2,2018-01-22,1
3,2018-01-22,1
4,2018-01-22,0


In [37]:
# save the dataframe
sentiment_per_day.to_pickle("sentiment_per_day")