In [None]:
import numpy as np
import pandas as pd
import re
import random
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.models import Model 
from keras.layers import Input, Activation, Embedding, LSTM
from keras.optimizers import Adam
from keras.optimizers import RMSprop
from keras.callbacks import LambdaCallback
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical


In [None]:
# Load the tweet dataset; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer="donaldtweets.csv")

In [None]:
## Convert to lower case.
## Rows with no tweet text are dropped first: NaN passes through .str.lower()
## unchanged and would make the per-tweet split() below raise AttributeError.
text = df['Tweet_Text'].dropna().str.lower()

## Remove the URLs (any whitespace-separated token containing 'http')
text = text.map(lambda s: ' '.join(tok for tok in s.split() if 'http' not in tok))

## Remove short tweets (40 characters or fewer once URLs are gone)
text = text[text.str.len() > 40]

## Remove emojis (this drops every non-ASCII character, not only emojis)
text = text.map(lambda s: s.encode('ascii', 'ignore').decode('ascii'))

In [None]:
 # Fit a tokenizer on the cleaned tweets so each word gets an integer id.
 tokenizer = Tokenizer()
 tokenizer.fit_on_texts(text)
 # One more than the number of distinct words counted during fitting
 # (presumably to leave room for id 0, used as the padding value -- confirm).
 vocab_size = 1 + len(tokenizer.word_counts)
 print("Total Vocabulary: ", vocab_size )

In [None]:
 # Work out how many tweets (rows of `text`) go to each split.
 N = text.shape[0]
 print("Total Row Count: ", N)
 # Fraction of rows reserved for training; int() truncates toward zero,
 # so any remainder row ends up in the test count.
 prop_train = 0.8
 train = int(N*prop_train)
 print("Training Data Count: ", train)
 test = N - train
 print("Test Data Count: ", test)

In [None]:
# Expand every tweet into all of its prefixes of length >= 2
# (w1 w2, w1 w2 w3, ...), and record which positions of `sequences`
# belong to the training rows and which to the test rows.
sequences, index_train, index_test = [], [], []
for row_number, tweet in enumerate(text):
    token_ids = tokenizer.texts_to_sequences([tweet])[0]
    # A tweet goes to one split as a whole: the first `train` rows are
    # training data, everything after that is test data.
    bucket = index_train if row_number < train else index_test
    for end in range(2, len(token_ids) + 1):
        # The position the new prefix is about to occupy in `sequences`.
        bucket.append(len(sequences))
        sequences.append(token_ids[:end])
# Running total of generated prefixes, kept under its original name.
count = len(sequences)
print('Total Sequences: %d' % (len(sequences)))

In [None]:
from keras_preprocessing.sequence import pad_sequences

# Left-pad every prefix to the length of the longest one so that all rows
# share a common width.
max_length = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % (max_length))

In [None]:
sequences = np.array(sequences)
# Every column except the last is the input context; the last column is the
# word id to predict.
X = sequences[:, :-1]
y = sequences[:, -1]
print(y.shape)
# One-hot encode the targets over the whole vocabulary.
y = to_categorical(y, num_classes=vocab_size)
# Select rows by the index lists gathered while the sequences were built.
X_train, X_test = X[index_train], X[index_test]
y_train, y_test = y[index_train], y[index_test]
print(X_train.shape)
print(y_train.shape)

In [None]:
def build_model(vocab_size,
                  input_length=1,
                  dim_dense_embedding=10,
                  hidden_unit_LSTM=5):
    """Assemble the Embedding -> LSTM -> softmax Dense network.

    Args:
        vocab_size: Size of the embedding table and of the output layer.
        input_length: Number of integer ids in each input row.
        dim_dense_embedding: Width of the embedding vectors.
        hidden_unit_LSTM: Number of units in the LSTM layer.

    Returns:
        The Keras ``Model``; it is not compiled here.
    """
    # Layers are created in the same order in which they are wired together.
    token_input = Input(shape=(input_length,), dtype='int32', name='main_input')
    embed_layer = Embedding(vocab_size, dim_dense_embedding,
                            input_length=input_length)
    recurrent_layer = LSTM(hidden_unit_LSTM)
    output_layer = Dense(vocab_size, activation='softmax')

    embedded = embed_layer(token_input)
    hidden = recurrent_layer(embedded)
    word_probs = output_layer(hidden)

    model = Model(inputs=[token_input], outputs=[word_probs])
    return model

In [None]:
# `model` has to exist before weights can be loaded into it; no cell in this
# notebook creates it, so a fresh Restart & Run All would stop here with a
# NameError. The inputs are max_length - 1 ids wide (the last column of each
# padded sequence is the target).
# NOTE(review): the embedding / LSTM sizes must match the ones used when the
# checkpoint was saved; build_model's defaults are assumed here -- confirm.
model = build_model(vocab_size, input_length=max_length - 1)
model.load_weights('trump_tweets_generator_model.h5')

In [None]:
# Generate 50 words after the seed text by repeatedly sampling the next word
# from the model's predicted distribution.
in_text = "America"
# Reverse lookup (word id -> word). It does not change between steps, so it
# is built once rather than on every iteration.
index_word = {v: k for k, v in tokenizer.word_index.items()}
for _ in range(50):
    # encode the text generated so far as integers
    enc = tokenizer.texts_to_sequences([in_text])[0]
    # pre-pad to the fixed input width used for the training inputs
    enc_padding = pad_sequences([enc], maxlen=max_length-1, padding='pre')
    # Cast to float64: np.random.choice validates that p sums to 1 and can
    # reject a float32 softmax vector because of rounding error.
    probs = model.predict(enc_padding, verbose=0).flatten().astype('float64')
    # Id 0 is the padding value and has no entry in word_index; sampling it
    # would raise KeyError below, so remove its mass and renormalize.
    probs[0] = 0.0
    probs /= probs.sum()
    index = np.random.choice(len(probs), p=probs)
    word = index_word[index]
    in_text += ' ' + word
print(in_text)