In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [None]:
# List the GPUs TensorFlow can see (an empty list means training will run on CPU).
# Uses the stable `tf.config` entry point rather than the `experimental` alias.
tf.config.list_physical_devices('GPU')

In [None]:
# Read the tweet archive: rows are keyed by the `id` column and `date` is parsed to datetimes.
data = pd.read_csv(
    "tweets_01-08-2021.csv",
    index_col="id",
    parse_dates=["date"],
)
# Peek at a few random rows as a sanity check.
data.sample(5)

In [None]:
from nltk.tokenize import RegexpTokenizer
import re

# Raw strings keep the regex escapes (\w, \S, \/, ...) away from Python's own
# string-escape processing, which otherwise emits "invalid escape sequence" warnings.
# Patterns are compiled once instead of being looked up on every tweet.
# Removes: URLs, the ellipsis character, a leading "RT" marker, and straight/curly double quotes.
NOISE_RE = re.compile(r'(https?:[\w\/\.\d]+)|…|(^RT)|“|”|"')
# Replaces the HTML-escaped ampersand ("&amp" with an optional ";") by the word "and".
AMPERSAND_RE = re.compile(r"&amp;?")

text = data.text.to_list()
# Tokens are either runs of word characters or runs of non-whitespace, so punctuation is kept.
tokenizer = RegexpTokenizer(r'\w+|\S+')

processed_tweets = []
for tweet in text:
    tweet = NOISE_RE.sub("", tweet)
    tweet = AMPERSAND_RE.sub("and", tweet)
    # Lower-case before tokenizing so the vocabulary is case-insensitive.
    processed_tweets.append(tokenizer.tokenize(tweet.lower()))

In [None]:
# Tweets differ in token count, with most around 20 tokens
import seaborn as sns
import matplotlib.pyplot as plt

# Number of tokens in each processed tweet.
tweet_lengths = list(map(len, processed_tweets))
ax = sns.histplot(tweet_lengths, bins=15)
ax.set_xlabel("Tweet length")
plt.show()

In [None]:
# Build the next-word training prefixes
from tensorflow.keras.preprocessing.text import Tokenizer

# An empty filter string keeps punctuation tokens in the vocabulary.
tok = Tokenizer(filters="")
tok.fit_on_texts(processed_tweets)

# For every encoded tweet, emit each prefix of length >= 2
# (the final token of a prefix is split off later as the label).
sequences = [
    encoded[:end]
    for encoded in tok.texts_to_sequences(processed_tweets)
    for end in range(2, len(encoded) + 1)
]

# One extra slot on top of the fitted vocabulary size.
total_words = len(tok.word_index) + 1

In [None]:
# Number of prefix sequences built from the tweets.
print(len(sequences))

In [None]:
# creating labels and padding
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The last token of every prefix is the prediction target.
# Read it with indexing/slicing instead of `pop` so `sequences` is left untouched:
# the cell can then be re-run without silently shifting the labels by one token.
labels = np.array([sequence[-1] for sequence in sequences])

# Inputs are the prefixes without their last token. They have different lengths,
# so they are zero-padded at the end ("post") up to the longest one.
# Other strategies exist, like dropping long tweets and padding to a shorter
# fixed length (e.g. 60) to save space and computation time.
X = pad_sequences([sequence[:-1] for sequence in sequences], padding="post")
X

In [None]:
# either this, or use sparse_categorical_crossentropy
# maybe try a different data type?
#label = tf.keras.utils.to_categorical(labels, num_classes=total_words, dtype='int8')

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

batch_size = 4096

# Model 1: embedding -> single LSTM -> softmax over the vocabulary.
model1 = keras.Sequential()
# mask_zero=True makes index 0 a masked (ignored) input value.
model1.add(layers.Embedding(input_dim=total_words, output_dim=256, mask_zero=True))
model1.add(layers.LSTM(units=256, dropout=0.3, stateful=False))
# One output probability per vocabulary index.
model1.add(layers.Dense(total_words, activation='softmax'))

# Sparse loss: labels are integer word ids, not one-hot vectors.
model1.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [None]:
# Show the architecture summary of model1.
model1.summary()

In [None]:
# Checkpoint the weights only (not the full model) to a fixed path during training.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath="checkpoints/model1.ckpt",
                                                 save_weights_only=True,
                                                 verbose=1)

# X and labels are in-memory NumPy arrays. `workers` / `use_multiprocessing` only
# apply to generator or keras.utils.Sequence inputs, so they had no effect here
# (and are rejected by newer Keras releases); they are therefore omitted.
model1.fit(X, labels,
          epochs=30,
          batch_size=batch_size,
          verbose=1,
          callbacks=[cp_callback])

In [None]:
# Invert the tokenizer vocabulary (word -> id) into an id -> word lookup for decoding predictions
id_to_word = dict(zip(tok.word_index.values(), tok.word_index.keys()))

In [None]:
# prediction function
def predict_sequence(seed, length, model):
    """Greedily extend ``seed`` with up to ``length`` predicted words.

    Relies on the notebook globals ``tok`` (fitted tokenizer), ``X`` (padded
    training matrix, used only for its width), ``id_to_word`` and
    ``pad_sequences``.

    Args:
        seed: Starting text; tokens are expected to be space-separated.
        length: Maximum number of words to append.
        model: Object whose ``predict(x, verbose=...)`` returns one row of
            per-word scores for the padded input ``x``.

    Returns:
        The seed followed by the generated words, each followed by a single
        space (so the result ends with a trailing space).
    """
    # Pad every input to the same width the model was trained on.
    max_len = X.shape[1]
    # Final prediction sequence, grown one word per iteration.
    output = seed + " "
    for _ in range(length):
        # Re-encode the text generated so far.
        tokenized = tok.texts_to_sequences([output])[0]
        x = pad_sequences([tokenized], padding="post", maxlen=max_len)
        # Greedy decoding: take the highest-scoring word id.
        next_id = int(np.argmax(model.predict(x, verbose=0), axis=-1)[0])
        next_word = id_to_word.get(next_id)
        if next_word is None:
            # The model picked an id with no word attached (e.g. the padding
            # index 0); stop instead of failing with a KeyError.
            break
        output = output + next_word + " "
    return output

#predict_sequence('ivanka', 30, model1)

model1:
Let's see what Mr. Former President thinks about Democrats:
1. democrat senators are doing a great job . i am not (LOL couldn't be more true than this)
2. democratic states , the democrats are not going to be a total disaster .

republican senators have a great job for the great state of texas . he will be a great governor ! #maga #kag and , @senatorheitkamp. and , others , the people 
gop senators must stop the flights from the united states .
obama ’s campaign is a total disaster .
biden has been a total disaster . i will be back soon ! #maga #kag #tcot @foxbusiness oh well , i ’m not going to be a total mess .

In [None]:
# model 2: same embedding and softmax head as model 1, but with two stacked LSTM layers
from tensorflow import keras
from tensorflow.keras import layers

batch_size = 1024

model2 = keras.Sequential([
    layers.Embedding(input_dim=total_words, output_dim=256, mask_zero=True),
    # return_sequences=True so the second LSTM receives the full sequence of hidden states
    layers.LSTM(units=256, return_sequences=True),
    layers.LSTM(units=256),
    layers.Dense(total_words, activation='softmax')
])

# Sparse loss: labels are integer word ids, not one-hot vectors.
model2.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Checkpoint the weights only (not the full model) to a fixed path during training.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath="checkpoints2/model2.ckpt",
                                                 save_weights_only=True,
                                                 verbose=1)

# X and labels are in-memory NumPy arrays. `workers` / `use_multiprocessing` only
# apply to generator or keras.utils.Sequence inputs, so they had no effect here
# (and are rejected by newer Keras releases); they are therefore omitted.
history = model2.fit(X, labels,
          epochs=20,
          batch_size=batch_size,
          verbose=1,
          callbacks=[cp_callback],
          shuffle=True)

model 2
'republican senators are working hard to get the job done in the senate . we have a great state and , great healthcare ! we need strong borders and crime ! '
'obama is a disaster for the people . he is a disaster . he is a great guy . he is a winner . he is a winner . he is a winner . he is a winner . he is a great guy and a great guy . he will be missed !
'bernie sanders is lying to the people of the united states . he is a total mess . he is a total mess . he is a total mess . he is a total mess ! he is a total mess ! he is a corrupt politician ! a total witch hunt ! no collusion , no obstruction . the dems don ’t want to do it . he is a corrupt politician ! he is a corrupt politician ! he is a corrupt politician ! he is strong on crime , borders , and , the enemy of the people ! '
'democrats stole election results . they are a disgrace to our country , and , we will win !
'gop senators are working hard on the border crisis . the dems are trying to take over the border . they are now trying to take away our laws .
'biden will bring back our country , and we are going to win the great state of texas . we need you in a second election .