In [98]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
import json

In [16]:
def get_data(path, years):
    """Load one JSON file per year and stack the results into one DataFrame.

    Args:
        path: Format string with a single ``{}`` placeholder that is filled
            with each entry of ``years``.
        years: Iterable of values substituted into ``path``.
    Returns:
        The per-year frames concatenated in the order of ``years``; every
        frame keeps its own index labels.
    """
    yearly_frames = []
    for year in years:
        yearly_frames.append(pd.read_json(path.format(year)))
    return pd.concat(yearly_frames)

In [17]:
# Per-year tweet archive files; get_data fills the `{}` placeholder with the year.
TRUMP_DATA_PATH = './trump-tweet-archive/data/realdonaldtrump/{}.json'
# range() excludes its end point, so this covers 2009 through 2017.
AVAILABLE_YEARS = list(range(2009, 2018))
# One DataFrame holding the tweets of every available year.
trump_dataframe = get_data(TRUMP_DATA_PATH, AVAILABLE_YEARS)

In [22]:
# Preview the first rows to check which columns were loaded.
trump_dataframe.head()

Unnamed: 0,created_at,favorite_count,id_str,in_reply_to_user_id_str,is_retweet,retweet_count,source,text
0,2009-12-23 17:38:18,12,6971079756,,False,28,Twitter Web Client,From Donald Trump: Wishing everyone a wonderfu...
1,2009-12-03 19:39:09,6,6312794445,,False,33,Twitter Web Client,Trump International Tower in Chicago ranked 6t...
2,2009-11-26 19:55:38,11,6090839867,,False,13,Twitter Web Client,Wishing you and yours a very Happy and Bountif...
3,2009-11-16 21:06:10,3,5775731054,,False,5,Twitter Web Client,Donald Trump Partners with TV1 on New Reality ...
4,2009-11-02 14:57:56,6,5364614040,,False,7,Twitter Web Client,"--Work has begun, ahead of schedule, to build ..."


In [69]:
# Glue all tweets into one long string, separated by '. '.
# Note: despite its name, `trump_text_file` is a plain str, not a file.
trump_text_file = '. '.join(trump_dataframe['text'])

In [72]:
# Inspect the first 1000 characters of the joined text.
trump_text_file[:1000]

'From Donald Trump: Wishing everyone a wonderful holiday & a happy, healthy, prosperous New Year. Let’s think like champions in 2010!. Trump International Tower in Chicago ranked 6th tallest building in world by Council on Tall Buildings & Urban Habitat http://bit.ly/sqvQq. Wishing you and yours a very Happy and Bountiful Thanksgiving!. Donald Trump Partners with TV1 on New Reality Series Entitled, Omarosa\'s Ultimate Merger: http://tinyurl.com/yk5m3lc. --Work has begun, ahead of schedule, to build the greatest golf course in history: Trump International – Scotland.. --From Donald Trump: "Ivanka and Jared’s wedding was spectacular, and they make a beautiful couple. I’m a very proud father.". Hear Donald Trump discuss big gov spending, banks, & taxes on Your World w/Neil Cavuto: http://tinyurl.com/yhnzd7p. Watch video of Ivanka Trump sharing business advice with 4 entrepreneurial women on GMA: http://tinyurl.com/yk6hlfo. - Read what Donald Trump has to say about daughter Ivanka\'s upcom

### Tensorflowing
* The hard part will be handling variable length input
* As for the output, we just cap it at 140 characters and cut back to the last period (.)

In [287]:
# Number of consecutive ids in each input window (also the second dimension of the `x` placeholder).
n_steps = 20
# num_units passed to the GRU cell.
n_hidden = 512
# NOTE(review): not referenced by any other visible cell; presumably reserved for build_model, confirm.
n_units = 3
# Width of each embedding vector.
embedding_size = 100
# Number of examples input_fn samples per call.
batch_size = 64
# Learning rate handed to the Adam optimizer.
learning_rate = 0.01

In [288]:
from collections import Counter
def word_vocab(df):
    """Build a frequency-ranked word vocabulary from one text string.

    Args:
        df: The full text as a single string (despite the name, it is split
            with ``str.split(' ')``, so it is not a DataFrame).
    Returns:
        dictionary: token -> id, with id 0 assigned to the most common token
        reversed_dictionary: id -> token
        vocab_size: number of distinct tokens
        idx: the text re-expressed as a list of token ids
    """
    # Splitting on a single space keeps empty tokens produced by repeated spaces.
    tokens = df.split(' ')
    ranked = Counter(tokens).most_common()
    dictionary = {token: rank for rank, (token, _) in enumerate(ranked)}
    reversed_dictionary = {rank: token for token, rank in dictionary.items()}
    idx = [dictionary[token] for token in tokens]
    return dictionary, reversed_dictionary, len(dictionary), idx

# Word-level vocabulary and id sequence for the whole joined tweet text.
dictionary, reversed_dictionary, vocab_size, idx = word_vocab(trump_text_file)

In [185]:
def vocab(df):
    """Build a character-level vocabulary.

    Args:
        df: Either a single text string or a re-iterable collection of
            strings (for example a pandas Series of tweets).
    Returns:
        w_to_id: character -> id
        id_to_w: id -> character
        idx: every character of the input, in order, expressed as ids
        vocab_size: number of distinct characters
    """
    # Sort the characters so the ids are reproducible between interpreter runs;
    # raw set iteration order depends on string hashing.
    symbols = sorted(set(c for t in df for c in t))
    vocab_size = len(symbols)
    w_to_id = {w: i for i, w in enumerate(symbols)}
    id_to_w = {i: w for i, w in enumerate(symbols)}
    # Walk the input exactly as the vocabulary was built, so that a collection
    # of strings is encoded character by character instead of being looked up
    # as whole strings (which are not vocabulary entries).
    idx = [w_to_id[c] for t in df for c in t]
    return w_to_id, id_to_w, idx, vocab_size

# Character-level vocabulary, bound to its own names so it does not overwrite the
# word-level `idx` / `vocab_size` that the windowing and model cells read
# (sampling looks words up in the word-level `dictionary`).
w_to_id, id_to_w, char_idx, char_vocab_size = vocab(trump_text_file)

In [289]:
def make_input_and_output(dataframe, n_steps=n_steps):
    """Cut an id sequence into fixed-length windows with next-id targets.

    The sequence is split into consecutive, non-overlapping windows of
    ``n_steps`` ids; the target of each window is the id that directly
    follows it.

    Args:
        dataframe: Sequence of integer ids (a list or 1-D array).
        n_steps: Window length; must be at least 1.
    Returns:
        c_input_transformed: int32 array of shape (n_windows, n_steps)
        c_target_transformed: int32 array of shape (n_windows,)
        Both have zero windows when the sequence holds ``n_steps`` or fewer ids.
    Raises:
        ValueError: if ``n_steps`` is smaller than 1.
    """
    if n_steps < 1:
        raise ValueError("n_steps must be at least 1")
    ids = np.asarray(dataframe)
    # A window is only kept when the id after it exists, hence the `- n_steps` bound.
    starts = np.arange(0, len(ids) - n_steps, n_steps)
    # Broadcasting (n_windows, 1) + (n_steps,) gives the source index of every cell.
    window_index = starts[:, None] + np.arange(n_steps)
    c_input_transformed = np.array(ids[window_index], dtype=np.int32)
    c_target_transformed = np.array(ids[starts + n_steps], dtype=np.int32)
    return c_input_transformed, c_target_transformed

# Window the id sequence `idx` into model inputs and next-id targets.
c_inputs, c_target = make_input_and_output(idx)

In [290]:
def build_model(inputs):
    """
    Placeholder for the model-building step; not implemented yet.

    Args:
        inputs: Currently unused.
    Returns:
        None for now.
        TODO: planned to return logits and probabilities, and to be
        configurable by number of layers (n_layers) and LSTM units (n_units).
    """
    pass


def input_fn():
    """Draw one random training batch from the module-level arrays.

    Takes no arguments; reads the globals ``c_inputs``, ``c_target`` and
    ``batch_size``.

    Returns:
        x_batch: ``batch_size`` rows of ``c_inputs`` picked at random
        y_batch: the entries of ``c_target`` at the same positions
    Raises:
        ValueError: raised by ``np.random.choice`` when fewer than
            ``batch_size`` examples exist, because rows are drawn without
            replacement.
    """
    n_examples = len(c_inputs)
    # replace=False: no example appears twice within one batch.
    batch_idx = np.random.choice(n_examples, size=batch_size, replace=False)
    return c_inputs[batch_idx, :], c_target[batch_idx]

def train(n_epochs, outlook_size):
    """
    Placeholder for the training loop; not implemented yet.

    Args:
        n_epochs: Number of epochs to run the model for
        outlook_size: Number of steps to predict out
    Returns:
        None for now.
        TODO: planned to return softmax probabilities for all words.
    """
    pass

In [291]:
#Test TF
# Builds a small next-word model (embedding -> GRU -> dense softmax), trains it
# for a few steps on random batches, then generates words greedily from a seed.
with tf.Graph().as_default():
    with tf.name_scope("placeholders"):
        x = tf.placeholder(dtype=tf.int32, shape=[None, n_steps])
        y = tf.placeholder(dtype=tf.int32, shape=[None])
        # Declared for variable-length input, but not fed or passed to dynamic_rnn yet.
        seq_length = tf.placeholder(tf.int32, [None])

    # Let's set up the embedding converting words to vectors
    with tf.name_scope("embeddings"):
        embeddings = tf.Variable(tf.random_uniform(shape=[vocab_size, embedding_size], minval=-1, maxval=1))
        train_input = tf.nn.embedding_lookup(embeddings, x)

    with tf.name_scope("model"):
        basic_cell = tf.nn.rnn_cell.GRUCell(num_units=n_hidden)
        outputs, states = tf.nn.dynamic_rnn(basic_cell, train_input, dtype=tf.float32)

        # One score per vocabulary entry, computed from the state returned by dynamic_rnn.
        logits = tf.layers.dense(states, units=vocab_size, activation=None)
        predictions = tf.nn.softmax(logits)
        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=y,
            logits=logits)
        loss = tf.reduce_mean(xentropy)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        training_op = optimizer.minimize(loss)

        tf.summary.scalar('cross_entropy', loss)
        merged = tf.summary.merge_all()
        saver = tf.train.Saver()

    with tf.Session() as sess:
        # The writer needs the session's graph, so it is created only once `sess`
        # exists; creating it earlier fails with NameError on a fresh kernel.
        train_writer = tf.summary.FileWriter('./trump/loss', sess.graph)
        try:
            sess.run(tf.global_variables_initializer())
            for r in range(30):
                x_batch, y_batch = input_fn()
                feed_dict = {x: x_batch, y: y_batch}
                _, summary, loss_out = sess.run([training_op, merged, loss], feed_dict=feed_dict)
                # Checkpoint, log and report every 10th step.
                if r % 10 == 0:
                    save_path = saver.save(sess, "/tmp/trump/trump_model{}.ckpt".format(r))
                    train_writer.add_summary(summary, r)
                    print("loss_out", loss_out)
        finally:
            # Flush pending summaries and release the event file.
            train_writer.close()

        sample_text = "From Donald Trump: Wishing everyone a wonderful holiday & a happy, healthy, prosperous New Year. Let’s think like champions in 2010!. Trump International Tower in Chicago ranked 6th tallest building in world by Council on Tall Buildings & Urban Habitat http://bit.ly/sqvQq. Wishing you and yours a very Happy Happy Happy Happy"

        all_predictions = []
        # The seed window must hold exactly n_steps words to match the `x` placeholder.
        current_text = sample_text.split(' ')[:n_steps]
        for i in range(50):
            sample_text_ids = np.expand_dims(np.array([dictionary[c] for c in current_text], dtype=np.int32), 0)
            prediction_out = sess.run(predictions, feed_dict={x: sample_text_ids})
            # Greedy choice: take the most probable word for the single row fed in.
            prediction_c = reversed_dictionary[int(np.argmax(prediction_out[0]))]
            all_predictions.append(prediction_c)
            # Slide the window: drop the oldest word, append the predicted one.
            current_text = current_text[1:] + [prediction_c]

        print(all_predictions)
        

loss_out 11.2964
loss_out 11.6237
loss_out 9.42237
['would', 'would', 'would', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'having', 'would', 'would', 'would', 'of', 'would', 'the', 'having', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'having', 'the', 'the', 'the', 'the', 'the', 'the', 'would', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the', 'the']


In [272]:
# Scratch check of the sliding-window update used while sampling; no model is run here.
sample_text = "From Donald Trump: Wishing everyone a wonderful holiday & a happy, healthy, prosperous New Year. Let’s think like champions in 2010!. Trump International Tower in Chicago ranked 6th tallest building in world by Council on Tall Buildings & Urban Habitat http://bit.ly/sqvQq. Wishing you and yours a very Happy"
all_predictions = []
# Seed window: the first 20 space-separated words of the sample.
current_text = sample_text.split(' ')[:20]
# Ids of the seed words with a leading batch axis; computed but not used further in this cell.
sample_text_ids = np.expand_dims(np.array([dictionary[c] for c in current_text], dtype=np.int32), 0)
# Stand-in for a model prediction: the word stored under id 5.
prediction_c = reversed_dictionary[5]
all_predictions.append(prediction_c)
# Drop the oldest word and append the new one, keeping the window at 20 words.
c = current_text[1:] + [prediction_c]
c

['Donald',
 'Trump:',
 'Wishing',
 'everyone',
 'a',
 'wonderful',
 'holiday',
 '&',
 'a',
 'happy,',
 'healthy,',
 'prosperous',
 'New',
 'Year.',
 'Let’s',
 'think',
 'like',
 'champions',
 'in',
 'is']