In [1]:
import random
import re

import numpy as np
import pandas as pd
from keras.callbacks import LambdaCallback
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Input, Activation, Embedding, LSTM
from keras.layers import LSTM
from keras.models import Model
from keras.models import Sequential
from keras.optimizers import Adam
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences


In [2]:
# Load the raw tweet export; the cleaning cell reads its 'Tweet_Text' column.
df = pd.read_csv("donaldtweets.csv")


In [3]:
## Convert to lower case.
## Rows with a missing tweet are dropped first: a NaN would crash the
## string handling below, and such a row could never pass the length filter.
text = df['Tweet_Text'].dropna().str.lower()

## Remove the URLs (any whitespace-separated token containing 'http')
text = text.map(lambda s: ' '.join([x for x in s.split() if 'http' not in x]))

## Remove short tweets (40 characters or fewer)
text = text[text.map(len)>40]

## Remove emojis (this drops every non-ASCII character, not only emojis)
text = text.apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))

In [4]:
 # Learn the word -> integer id mapping from the cleaned tweets.
 tokenizer = Tokenizer()
 tokenizer.fit_on_texts(text)
 # One slot more than the number of distinct words, so the model's output
 # layer also has room for index 0 (the value used when padding sequences).
 vocab_size = 1 + len(tokenizer.word_counts)
 print("Total Vocabulary: ", vocab_size)

Total Vocabulary:  10760


In [5]:
 # Decide how many tweets (rows) go to training; the remainder is the test set.
 # `train` is used later as the row cut-off when sequences are assigned to a split.
 N = text.shape[0]
 print("Total Row Count: ", N)
 prop_train = 0.8
 train = int(N*prop_train)
 print("Training Data Count: ", train)
 test = N - train
 print("Test Data Count: ", test)

Total Row Count:  6886
Training Data Count:  5508
Test Data Coount:  1378


In [6]:
# Expand every tweet into all of its prefixes of length >= 2, so that each
# prefix can later be split into "context words" + "next word".
# index_train / index_test record which positions in `sequences` belong to
# tweets from the training rows (irow < train) or the remaining rows.
sequences, index_train, index_test = [], [], []
count = 0
for irow, line in enumerate(text):
    encoded = tokenizer.texts_to_sequences([line])[0]
    prefixes = [encoded[:end] for end in range(2, len(encoded) + 1)]
    # All prefixes of one tweet land in the same split.
    bucket = index_train if irow < train else index_test
    bucket.extend(range(count, count + len(prefixes)))
    sequences.extend(prefixes)
    count += len(prefixes)
print('Total Sequences: %d' % (len(sequences)))

Total Sequences: 114825


In [7]:
# pad_sequences is imported in the notebook's top import cell.
# Left-pad every prefix to the length of the longest one, so all rows have the
# same width and the word to predict is always in the last column.
max_length = max(len(seq) for seq in sequences)
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print('Max Sequence Length: %d' % (max_length))

KeyboardInterrupt: 

In [None]:
sequences = np.asarray(sequences)
# Every padded row is one example: all columns but the last are the context,
# the last column is the id of the word to predict.
X, y = sequences[:,:-1],sequences[:,-1]
print(y.shape)

# One-hot encoding every label at once needs len(y) * vocab_size floats
# (several GB for this corpus) although almost all rows are discarded below.
# So: pick the rows to keep first and one-hot encode only those.
# NOTE: `y` therefore stays a vector of integer word ids.
#make it smaller for testing
max_train_rows, max_test_rows = 1000, 100
train_rows = index_train[:max_train_rows]
test_rows = index_test[:max_test_rows]

# Report the shapes of the full training split (before it is cut down).
print((len(index_train), X.shape[1]))
print((len(index_train), vocab_size))

X_train, X_test = X[train_rows], X[test_rows]
y_train = to_categorical(y[train_rows], num_classes=vocab_size)
y_test = to_categorical(y[test_rows], num_classes=vocab_size)

(114825,)
(92288, 31)
(92288, 10760)


In [None]:
def build_model(vocab_size,
                input_length=1,
                dim_dense_embedding=10,
                hidden_unit_LSTM=5):
    """Build the (uncompiled) next-word prediction network.

    Architecture: integer token ids -> Embedding -> LSTM -> softmax over the
    vocabulary.

    Args:
        vocab_size: Number of embedding rows and of output classes.
        input_length: Number of token ids in each input sequence.
        dim_dense_embedding: Size of each embedding vector.
        hidden_unit_LSTM: Number of units in the LSTM layer.

    Returns:
        A Keras ``Model`` with one input named 'main_input' and one softmax
        output of width ``vocab_size``.
    """
    token_ids = Input(shape=(input_length,), dtype='int32', name='main_input')
    embedded = Embedding(vocab_size,
                         dim_dense_embedding,
                         input_length=input_length)(token_ids)
    summary_state = LSTM(hidden_unit_LSTM)(embedded)
    next_word_probs = Dense(vocab_size, activation='softmax')(summary_state)
    return Model(inputs=[token_ids], outputs=[next_word_probs])

In [None]:
# Instantiate the network for the context width of X.
model = build_model(
    vocab_size,
    input_length=X.shape[1],
    dim_dense_embedding=30,
    hidden_unit_LSTM=64,
)

# Compile the network: targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Fit the network; the object returned by fit() is kept in tf_model.
tf_model = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=20,
    validation_data=(X_test, y_test),
    verbose=2,
)

Epoch 1/20
8/8 - 3s - loss: 9.2798 - accuracy: 0.0150 - val_loss: 9.2755 - val_accuracy: 0.0200 - 3s/epoch - 402ms/step
Epoch 2/20
8/8 - 0s - loss: 9.2439 - accuracy: 0.0330 - val_loss: 9.2194 - val_accuracy: 0.0300 - 422ms/epoch - 53ms/step
Epoch 3/20
8/8 - 0s - loss: 8.8858 - accuracy: 0.0270 - val_loss: 8.8405 - val_accuracy: 0.0300 - 430ms/epoch - 54ms/step
Epoch 4/20
8/8 - 0s - loss: 8.0674 - accuracy: 0.0240 - val_loss: 8.4345 - val_accuracy: 0.0300 - 434ms/epoch - 54ms/step
Epoch 5/20
8/8 - 0s - loss: 7.1736 - accuracy: 0.0240 - val_loss: 8.1070 - val_accuracy: 0.0300 - 465ms/epoch - 58ms/step
Epoch 6/20
8/8 - 0s - loss: 6.3971 - accuracy: 0.0220 - val_loss: 8.0420 - val_accuracy: 0.0200 - 424ms/epoch - 53ms/step
Epoch 7/20
8/8 - 0s - loss: 5.9578 - accuracy: 0.0290 - val_loss: 8.1754 - val_accuracy: 0.0200 - 426ms/epoch - 53ms/step
Epoch 8/20
8/8 - 0s - loss: 5.7723 - accuracy: 0.0290 - val_loss: 8.3522 - val_accuracy: 0.0200 - 441ms/epoch - 55ms/step
Epoch 9/20
8/8 - 0s - loss

In [None]:
# Write the model's current weights to model.h5 in the working directory.
model.save_weights('model.h5')

In [None]:
# Read the weights stored in model.h5 back into `model`.
model.load_weights('model.h5')

In [None]:
def generate_text(seed_text="America", n_words=50):
    """Sample a word sequence from the trained model and print it.

    Starting from ``seed_text``, the current text is encoded, padded to the
    model's input width, and the next word is drawn at random from the
    model's predicted distribution. This repeats ``n_words`` times.

    Relies on the notebook globals ``tokenizer``, ``model``, ``max_length``
    and ``pad_sequences``.

    Args:
        seed_text: Text the generation starts from.
        n_words: Number of words to append.

    Returns:
        None. The generated text is printed.
    """
    # The id -> word lookup never changes, so build it once, not per step.
    index_word = {v: k for k, v in tokenizer.word_index.items()}
    in_text = seed_text
    for _ in range(n_words):
        # encode the text as integer
        enc = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        enc_padding = pad_sequences([enc], maxlen=max_length-1, padding='pre')
        # float64 copy: float32 softmax output can miss numpy's sum-to-1 check.
        probs = model.predict(enc_padding, verbose=0).flatten().astype('float64')
        # Index 0 is the padding slot and has no entry in word_index;
        # sampling it would raise KeyError, so exclude it and renormalise.
        probs[0] = 0.0
        probs /= probs.sum()
        index = np.random.choice(len(probs), p=probs)
        in_text += ' ' + index_word[index]
    print(in_text)

In [None]:
# Print a batch of independently generated samples.
n_samples = 1000
for _ in range(n_samples):
    generate_text()


America emails america come our safe in in watch mike is vote great again rallyforriley doing waiting watch that vote by new to american a all to join you to immigration elect grand vote great make going 11 watch my new passion via electionnight in selma system presidential we forever half
America to to good maga make change in here you november save now make passion voting from all win will great for the need maga for me watch support nh ht our to jones 16 hillary forgotten surreal ivankatrump for a movement america push ivoted draintheswamp we at own movement tube
America on safe monday immediately very found electionnight this maga lindseygraham very on see open of proud to the america is in back 4pm day a vote maga to american great are you of emails cont with north great nh there donaldjtrumpjr great in colorado confirms clinton let him with wattersworld
America the campaign can america together 6pm makeamericagreatagain this will me election life tonight emails across new togethe