In [0]:
from __future__ import print_function
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Activation
from keras.layers import Dropout
from keras.layers import BatchNormalization
from keras.optimizers import RMSprop
from keras.optimizers import Adam
from keras.utils.data_utils import get_file
import numpy as np
import pandas as pd
import random
import sys
import io

In [0]:
# Load gates.csv into a DataFrame and preview the first rows.
df = pd.read_csv('gates.csv')
df.head()


Unnamed: 0,date,username,to,replies,retweets,favorites,text,geo,mentions,hashtags,id,permalink
0,2020-04-15 05:17:26,BillGates,,67779,95069,367749,Halting funding for the World Health Organizat...,,@WHO,,1250292126643941376,https://twitter.com/BillGates/status/125029212...
1,2020-04-13 00:41:08,BillGates,NickKristof,2707,1311,8320,. @NickKristof does an amazing job capturing t...,,@NickKristof,,1249497817900433408,https://twitter.com/BillGates/status/124949781...
2,2020-04-07 20:59:04,BillGates,,5940,4845,29338,On #WorldHealthDay — and every day — I want to...,,,#WorldHealthDay,1247629990675374081,https://twitter.com/BillGates/status/124762999...
3,2020-04-05 22:11:06,BillGates,TheDailyShow,2227,1746,17991,"I always enjoy chatting with @Trevornoah, even...",,@Trevornoah,,1246923341769986051,https://twitter.com/BillGates/status/124692334...
4,2020-04-03 20:36:07,BillGates,gatesfoundation,3665,1977,11993,It’s encouraging to see the first projects beg...,,,#COVID19,1246174663778025473,https://twitter.com/BillGates/status/124617466...


In [0]:
# Take the 'text' column (one tweet per row) and lowercase it, so the
# character vocabulary built from it later does not distinguish case.
text = df['text'].str.lower()


on #worldhealthday — and every day — i want to thank all of the health heroes for their sacrifices and for keeping us safe.pic.twitter.com/0x9qfsrlqb


In [0]:
# Keep only tweets longer than 60 characters (tweets of length <= 60 are dropped).
text = text[text.str.len()>60]

In [0]:
# Build the character vocabulary: every distinct character occurring in the tweets.
chars = sorted(set(''.join(text)))
print('total chars:', len(chars))
# Lookup tables: character -> index and index -> character.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# Cut each tweet into semi-redundant windows of maxlen characters; the
# character that follows each window is its prediction target.
maxlen = 40
step = 1
sentences = []
next_chars = []
# The loop variable is deliberately not called `x`: that name is bound to the
# training tensor created below.
for tweet_text in text:
    for i in range(0, len(tweet_text) - maxlen, step):
        sentences.append(tweet_text[i: i + maxlen])
        next_chars.append(tweet_text[i + maxlen])

# Vectorization: one-hot encode the windows into x and the targets into y.
# The builtin `bool` is used as dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
    



total chars: 75
nb sequences: 327597
Vectorization...


In [0]:
# Number of tweets currently held in `text`.
len(text)

In [0]:
# Character-level network: one LSTM layer, two hidden Dense layers and a
# softmax output over the vocabulary. Layer widths are multiples of the
# vocabulary size.
# Inspired by https://towardsdatascience.com/predicting-trump-tweets-with-a-rnn-95e7c398b18e
model = Sequential([
    # Recurrent layer over a window of maxlen one-hot encoded characters.
    LSTM(len(chars) * 7, input_shape=(maxlen, len(chars))),
    BatchNormalization(),
    Activation('selu'),
    # First hidden dense layer.
    Dense(len(chars) * 4),
    Activation('selu'),
    # Second hidden dense layer, batch-normalized before its activation.
    Dense(len(chars) * 4),
    BatchNormalization(),
    Activation('selu'),
    # Output layer: one probability per vocabulary character.
    Dense(len(chars)),
    Activation('softmax'),
])

optimizer = RMSprop(learning_rate=0.01)

model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['categorical_crossentropy', 'accuracy'],
)



In [0]:
def sample(preds, temperature=1.0):
    """Sample an index from a probability array, reweighted by temperature.

    Args:
        preds: 1-D array-like of non-negative probabilities (e.g. one row of
            a softmax output).
        temperature: Divisor applied to the log-probabilities before they are
            re-normalised. Values below 1 sharpen the distribution, values
            above 1 flatten it. A temperature of exactly 0 selects the most
            probable index deterministically.

    Returns:
        The sampled index, as returned by ``np.argmax``.
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of the tempered distribution: all mass on the arg-max. The
        # formula below would divide by zero here.
        return np.argmax(preds)
    # Zero probabilities legitimately map to -inf (and back to probability 0),
    # so silence the divide-by-zero warning np.log would emit for them.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift by the maximum before exponentiating. The shift cancels in the
    # normalisation, but it pins the largest term at exp(0) == 1, so the sum
    # cannot underflow to 0 (which would produce NaN probabilities) when the
    # temperature is small.
    exp_preds = np.exp(logits - np.max(logits))
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [0]:
def on_epoch_end(epoch, _):
    """Print sample generations at the end of a training epoch.

    One randomly chosen tweet supplies the seed (its first ``maxlen``
    characters). For each diversity value the model extends that seed by 120
    characters, which are streamed to stdout as they are sampled.

    Args:
        epoch: Index of the epoch that just finished (printed in the header).
        _: Second callback argument; unused.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    # The same random tweet seeds every diversity setting.
    seed_tweet = np.random.choice(text)
    seed = seed_tweet[0:maxlen]
    vocab_size = len(chars)

    # Author's note: a diversity of 0.7 provides the best results.
    for diversity in (0.2, 0.5, 0.7, 1.0, 1.2):
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed + '"')
        sys.stdout.write(seed)

        window = seed
        # Extend the seed by 120 characters.
        for _step in range(120):
            # One-hot encode the current window as a batch of one.
            x_pred = np.zeros((1, maxlen, vocab_size))
            for pos, ch in enumerate(window):
                x_pred[0, pos, char_indices[ch]] = 1.

            probabilities = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(probabilities, diversity)]

            # Slide the window: drop the oldest character, append the new one.
            window = window[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

In [0]:
# Train for 10 epochs (author's note: the model stops learning after 10 epochs).
epochs = 10

# Invoke on_epoch_end after every epoch to print sample generations.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Fit on the one-hot windows x and their next-character targets y.
model.fit(x, y,
          batch_size=124,
          epochs=epochs,
          callbacks=[print_callback])

Epoch 1/10

----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "i’m guessing not much tweeting has been "
i’m guessing not much tweeting has been the world the world bite and it’s and future start the poon an pover the world the unite evergate in and the world an in
----- diversity: 0.5
----- Generating with seed: "i’m guessing not much tweeting has been "
i’m guessing not much tweeting has been an ind spen an su more learter in reart the world in see work an one the way an reaks tealth pee in we see premented one
----- diversity: 1.0
----- Generating with seed: "i’m guessing not much tweeting has been "
i’m guessing not much tweeting has been tri#edlize few glaed8/fudji cverpic.twitter.com/mdivydmw77xsk8cff r1red1gd./g/g5ol718v60b4b …1 …at.es/2x3k6ee http://b-g
----- diversity: 1.2
----- Generating with seed: "i’m guessing not much tweeting has been "
i’m guessing not much tweeting has been farver hestark /vengize.com/tm33g3nzwrvwlkec2:9066l-outgr2tn

<keras.callbacks.callbacks.History at 0x7f5396a26240>

In [0]:
#sanders(260000)(len(chars)*7*4-10) = 63%
#bieber(256000)(len(chars)*7*4-10)  = 61%
#gates = 63%
#hart = 61%


In [0]:
#inspired by https://www.kaggle.com/talevy23/trump-tweet-generator-lstm-for-text-generation
def generate_tweets(sentence, diversity, length=120):
    """Generate text by extending a seed with the trained model.

    Args:
        sentence: Seed text; only its first ``maxlen`` characters are used.
        diversity: Sampling temperature forwarded to ``sample``.
        length: Number of characters to generate after the seed. Defaults to
            120, the value that was previously hard-coded.

    Returns:
        The seed followed by ``length`` generated characters.

    Raises:
        KeyError: If the seed contains a character missing from
            ``char_indices``.
    """
    sentence = sentence[0:maxlen]
    # Collect the pieces and join once instead of growing a string.
    pieces = [sentence]

    for _ in range(length):
        # One-hot encode the current window as a batch of one.
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sentence):
            x_pred[0, t, char_indices[char]] = 1.
        # Predict the next-character distribution and sample from it.
        preds = model.predict(x_pred, verbose=0)[0]
        next_index = sample(preds, diversity)
        next_char = indices_char[next_index]

        pieces.append(next_char)
        # Slide the window forward by one character.
        sentence = sentence[1:] + next_char

    # Nothing is printed here: this function is called once per generated
    # tweet, and an unconditional blank line per call floods the notebook
    # output. The text is returned so the caller can store or write it.
    return ''.join(pieces)

In [0]:
# Generate up to 2000 tweets at diversity 0.7, each seeded by a distinct,
# randomly chosen original tweet.
diversity = 0.7
seed_tweets = list(text)
tweets = []
# Cap the sample size: random.sample raises ValueError when asked for more
# items than the population contains.
for s in random.sample(seed_tweets, min(2000, len(seed_tweets))):
    # generate_tweets is the generator function of this notebook; the name
    # previously called here, generate_w_seed, is not defined in it.
    tweet = generate_tweets(s, diversity)
    tweets.append(tweet)












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































In [0]:

# Print the whole list of generated tweets as plain text.
print(tweets)

['♛ http://shots.me/p/mwyglskz #selfiepic.twitter.com/iwez3rxt7g1p4hhdh?t= … http://instagram.com/p/feihargvlm/revix-vrelebbeht-performineorer … http://instagram.']


In [0]:
# Show the list of generated tweets as the cell output.
tweets

["brace yourself people....the road to make my movie people comedy shit tickets and get ur to do in unders & scited!!!!! let's gooooooooooooooooooooooim on my com",
 "i swear sept can’t get here fast enough....my late & i'm they are peoplefilm....don't get it on supposting mines & people called to do for my ass people. suppor",
 'june 20th i will be there rt @sugarbell2 comedy show in comedy is in coming my city is officially special to be catch semicrion in....lets gooooooooooooooo & go',
 "inspired by the goat/legend/mrgym himself finass....i'm cun is about to go se coming this morning....i’m about to go on saturday @icecube comment 5k level!!!! i",
 'cracking up at this video from @tommyjohnsets so our fitnes in my fitts everyone video & i can\'t wait to make this morning!!!! why is unreal "millible" & coming']

In [0]:
# Save the generated tweets to a text file, each followed by a blank line.
# The encoding is given explicitly because the tweets contain non-ASCII
# characters (for example ’ and —) that the platform default encoding may be
# unable to write.
with open('gates_generated_tweets.txt', 'w', encoding='utf-8') as f:
    for generated_tweet in tweets:
        f.write(generated_tweet + '\n\n')
