In [1]:
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [2]:
from __future__ import print_function, division
from builtins import range, input

In [3]:
import os
import sys
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
#from keras.models import Model, model_from_json
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense, Embedding, Input, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam, SGD

Using TensorFlow backend.


In [5]:
# import keras.backend as K
# if len(K.tensorflow_backend._get_available_gpus()) > 0:
#   from keras.layers import CuDNNLSTM as LSTM
#   from keras.layers import CuDNNGRU as GRU

In [6]:
# some configuration

# --- text / vocabulary ---
MAX_SEQUENCE_LENGTH = 63   # upper bound applied when padding the sequences
MAX_VOCAB_SIZE = 4000      # passed to the Tokenizer as num_words; also caps the embedding rows

# --- network sizes ---
EMBEDDING_DIM = 50         # also selects which glove.6B.<dim>d.txt file is read
LATENT_DIM = 25            # number of LSTM units and size of the h / c state inputs

# --- training settings (not referenced by the cells shown in this notebook) ---
VALIDATION_SPLIT = 0.2
BATCH_SIZE = 128
EPOCHS = 30

In [7]:
# load in the data
# Every non-empty line of the file becomes one training sentence:
#   input  = '<sos> ' + sentence   (what the model is fed)
#   target = sentence + ' <eos>'   (the same tokens shifted one step ahead)
# An earlier version read .../Ingested_Tweets/trump_tweets.txt instead.
TWEETS_PATH = '/home/ubuntu/wen/Twitter/Ingested_Tweets/trump_tweets.csv/part-00000-9519e783-b055-44bf-b20c-54925873e935-c000.csv'

input_texts = []
target_texts = []
# Use a context manager so the file handle is closed once reading is done
# (the bare open() in a for-loop left it open until garbage collection).
with open(TWEETS_PATH) as tweets_file:
  for line in tweets_file:
    line = line.rstrip()
    if not line:
      continue  # skip blank lines

    input_texts.append('<sos> ' + line)
    target_texts.append(line + ' <eos>')

# Inputs and targets together, so the tokenizer sees both <sos> and <eos>.
all_lines = input_texts + target_texts

In [8]:
# convert the sentences (strings) into integers
# filters='' switches off the tokenizer's character filtering, so the
# '<sos>' / '<eos>' markers are kept as written.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='')
tokenizer.fit_on_texts(all_lines)
input_sequences, target_sequences = (
  tokenizer.texts_to_sequences(texts)
  for texts in (input_texts, target_texts)
)

In [9]:
# find max seq length (longest tokenized input sentence)
max_sequence_length_from_data = max(map(len, input_sequences))
print('Max sequence length:', max_sequence_length_from_data)

Max sequence length: 63


In [10]:
# get word -> integer mapping
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))
# both sentence markers must have made it into the vocabulary
for required_token in ('<sos>', '<eos>'):
  assert required_token in word2idx

Found 9927 unique tokens.


In [11]:
# pad sequences so that we get a N x T matrix
# T is the longest sentence in the data, capped at MAX_SEQUENCE_LENGTH.
max_sequence_length = min(max_sequence_length_from_data, MAX_SEQUENCE_LENGTH)
padding_options = dict(maxlen=max_sequence_length, padding='post')
input_sequences = pad_sequences(input_sequences, **padding_options)
target_sequences = pad_sequences(target_sequences, **padding_options)
print('Shape of data tensor:', input_sequences.shape)

Shape of data tensor: (5221, 63)


In [12]:
# load in pre-trained word vectors
# The file is plain text with one entry per line:
#   word vec[0] vec[1] vec[2] ...
print('Loading word vectors...')
glove_path = '/home/ubuntu/wen/NLP/glove.6B.%sd.txt' % EMBEDDING_DIM
word2vec = {}
with open(glove_path) as glove_file:
  for entry in glove_file:
    fields = entry.split()
    # first field is the word, the rest are its vector components
    word2vec[fields[0]] = np.asarray(fields[1:], dtype='float32')
print('Found %s word vectors.' % len(word2vec))

Loading word vectors...
Found 400000 word vectors.


In [13]:
# prepare embedding matrix
# Row i holds the pre-trained vector of the word with tokenizer index i.
# Rows for words missing from word2vec stay all zeros.
print('Filling pre-trained embeddings...')
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in word2idx.items():
  if row >= MAX_VOCAB_SIZE:
    continue  # outside the vocabulary kept by the tokenizer
  pretrained_vector = word2vec.get(token)
  if pretrained_vector is None:
    continue  # no pre-trained vector for this word
  embedding_matrix[row] = pretrained_vector

Filling pre-trained embeddings...


In [14]:
# # one-hot the targets (can't use sparse cross-entropy)
# one_hot_targets = np.zeros((len(input_sequences), max_sequence_length, num_words))
# for i, target_sequence in enumerate(target_sequences):
#   for t, word in enumerate(target_sequence):
#     if word > 0:
#       one_hot_targets[i, t, word] = 1

In [15]:
# load pre-trained word embeddings into an Embedding layer
# (the trainable=False option is deliberately left out here)
embedding_layer = Embedding(
  input_dim=num_words,
  output_dim=EMBEDDING_DIM,
  weights=[embedding_matrix],
)

In [16]:
# Layers and state inputs later wired into the one-step sampling model.
# The LSTM returns its full output sequence and its final h / c states.
lstm = LSTM(units=LATENT_DIM, return_sequences=True, return_state=True)
# explicit initial hidden / cell states, one vector of size LATENT_DIM each
initial_h = Input(shape=(LATENT_DIM,))
initial_c = Input(shape=(LATENT_DIM,))
# softmax over the vocabulary
dense = Dense(units=num_words, activation='softmax')

In [17]:
# Load the saved Keras model from its .h5 file.
trained_model_path = '/home/ubuntu/wen/Twitter/GetOldTweets/trump_tweets_model_full.h5'
model = load_model(trained_model_path)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [18]:
# Print the layer-by-layer summary of the loaded model. The layer names it
# lists (e.g. embedding_1) are the ones looked up by name further down.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 63)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 63, 50)       200000      input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 25)           0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 25)           0                                            
____________________________________________________________________________________________

In [19]:
# Display every weight array of the loaded model (the cell output is the full list).
model.get_weights()

[array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.09244964, -0.5509255 , -0.70078194, ..., -1.4140141 ,
          0.20953439, -1.4629815 ],
        [ 0.17635977,  0.0689066 , -0.35633314, ..., -0.9516743 ,
         -0.145526  , -0.3243402 ],
        ...,
        [ 0.83121043,  0.06326257,  1.0084718 , ...,  1.4751939 ,
          0.5868011 ,  0.80888575],
        [-0.40411636,  0.95666665,  0.17923869, ...,  0.8277342 ,
          0.3717687 , -0.5496339 ],
        [ 0.9355164 ,  0.63911474,  0.06472743, ..., -0.6955365 ,
         -0.02749704, -2.1178122 ]], dtype=float32),
 array([[ 0.3073728 ,  0.13198818,  0.15974176, ..., -0.6183193 ,
          1.666149  ,  1.0696961 ],
        [-0.05603839,  0.30225712,  0.1414418 , ...,  0.14897741,
          0.1663367 , -0.68071973],
        [ 0.3029023 ,  0.08806106, -0.08740757, ..., -0.05381489,
          1.324471  , -0.49656278],
        ...,
        [-0.21386793, -0.3220795 ,  0.1

In [20]:
# Pull the weights out of the loaded model, one list of arrays per layer,
# looking the layers up by their names in the loaded model.
embedding_1, lstm_1, dense_1 = (
  model.get_layer(layer_name).get_weights()
  for layer_name in ('embedding_1', 'lstm_1', 'dense_1')
)

In [21]:
# make a sampling model
# It takes a single word plus explicit LSTM states, and returns the next-word
# distribution together with the updated states so they can be fed back in.
input2 = Input(shape=(1,))  # we'll only input one word at a time
step_embedded = embedding_layer(input2)
step_hidden, next_h, next_c = lstm(step_embedded, initial_state=[initial_h, initial_c])
output2 = dense(step_hidden)
sampling_model = Model(
  inputs=[input2, initial_h, initial_c],
  outputs=[output2, next_h, next_c],
)

In [22]:
# Copy the trained weights into the layers used by the sampling model.
# The layer objects are used directly rather than
# sampling_model.get_layer('embedding_1') etc.: the sampling model's layers get
# auto-generated names, which depend on the Keras flavour and on how many
# layers were already created in the session, so a lookup by a hard-coded name
# can raise ValueError on a fresh kernel. These are the same layer objects the
# sampling model was built from, and they have been called already, so their
# weights exist and can be set.
embedding_layer.set_weights(embedding_1)
lstm.set_weights(lstm_1)
dense.set_weights(dense_1)

In [23]:
# Display the sampling model's weights (presumably to compare by eye with the
# loaded model's weights shown earlier).
sampling_model.get_weights()

[array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.09244964, -0.5509255 , -0.70078194, ..., -1.4140141 ,
          0.20953439, -1.4629815 ],
        [ 0.17635977,  0.0689066 , -0.35633314, ..., -0.9516743 ,
         -0.145526  , -0.3243402 ],
        ...,
        [ 0.83121043,  0.06326257,  1.0084718 , ...,  1.4751939 ,
          0.5868011 ,  0.80888575],
        [-0.40411636,  0.95666665,  0.17923869, ...,  0.8277342 ,
          0.3717687 , -0.5496339 ],
        [ 0.9355164 ,  0.63911474,  0.06472743, ..., -0.6955365 ,
         -0.02749704, -2.1178122 ]], dtype=float32),
 array([[ 0.3073728 ,  0.13198818,  0.15974176, ..., -0.6183193 ,
          1.666149  ,  1.0696961 ],
        [-0.05603839,  0.30225712,  0.1414418 , ...,  0.14897741,
          0.1663367 , -0.68071973],
        [ 0.3029023 ,  0.08806106, -0.08740757, ..., -0.05381489,
          1.324471  , -0.49656278],
        ...,
        [-0.21386793, -0.3220795 ,  0.1

In [24]:
# invert word2idx (word -> index) into index -> word,
# used to turn sampled indices back into words during prediction
idx2word = dict((index, word) for word, index in word2idx.items())

In [25]:
def sample_line():
  """Sample one line of text from `sampling_model`, one word at a time.

  Starts from the '<sos>' token with all-zero LSTM states. At each step the
  predicted distribution over word indices is read, index 0 is given
  probability zero, the rest is renormalized, and the next word is drawn with
  np.random.choice. The drawn word and the returned states are fed back in.
  Stops when '<eos>' is drawn or after `max_sequence_length` steps.

  Reads the module-level names word2idx, idx2word, sampling_model,
  LATENT_DIM and max_sequence_length.

  Returns:
    The sampled words joined by single spaces ('<eos>' is not included).
  """
  eos_id = word2idx['<eos>']

  # model inputs for the first step: the <sos> token and zeroed states
  current_word = np.array([[word2idx['<sos>']]])
  state_h = np.zeros((1, LATENT_DIM))
  state_c = np.zeros((1, LATENT_DIM))

  sampled_words = []
  for _ in range(max_sequence_length):
    step_output, state_h, state_c = sampling_model.predict(
      [current_word, state_h, state_c]
    )

    # distribution over word indices for the single sample / single time step
    word_probs = step_output[0, 0]
    if np.argmax(word_probs) == 0:
      # diagnostic: index 0 was the most likely prediction
      print("wtf")

    # never sample index 0: zero it out and renormalize the rest
    word_probs[0] = 0
    word_probs = word_probs / word_probs.sum()
    next_id = np.random.choice(word_probs.shape[0], p=word_probs)
    if next_id == eos_id:
      break

    # accumulate output
    sampled_words.append(idx2word.get(next_id, '<WTF %s>' % next_id))

    # the sampled word becomes the next input to the model
    current_word[0, 0] = next_id

  return ' '.join(sampled_words)

In [None]:
# print one sampled line at a time until the user answers with something
# starting with 'n' / 'N' (an empty answer means "yes")
while True:
  print(sample_line())

  ans = input("---generate another? [Y/n]---")
  if ans[:1].lower().startswith('n'):
    break

so everybody gain on your job and apply loyalty we are making america great country foreign reasons they are negative to negotiate great meetings with president xi of governor and they answered he is no higher deadly new two drug drug dealers back for american post russia has been agreed at the southern border flight to stop being in november the inspector general
---generate another? [Y/n]---y
to their biggest tax cuts workers and friends
---generate another? [Y/n]---y
by him take 40 representatives at a joint plants we much and from walter gillespie need working importantly the campaign or business person gave on jobs than anyone others the woodward at despite the russian now beautiful evening at 7 trillion dollars more than possible will have a very big and there asked just a many russians at all being account in we
---generate another? [Y/n]---y
and the truth see what it is probably never authorized john says tremendous supporter of something and bad and beat houston civilian taxpa