In [8]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducability
import tensorflow as tf
# from tensorflow import set_random_seed
from numpy.random import seed

tf.random.set_seed(2)
seed(1)

In [51]:
import pandas as pd
import numpy as np
import string, os 
import json

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

# curr_dir = 'tweets/'
# all_headlines = []
# for filename in os.listdir(curr_dir):
#     if 'Articles' in filename:
#         article_df = pd.read_csv(curr_dir + filename)
#         all_headlines.extend(list(article_df.headline.values))
#         break


all_tweets = []
with open("tweets/t2-50.txt", "r") as content:
  data = json.load(content)


for tweet in data:
    all_tweets.append(tweet['tweet'][0]['text'])

# TO DO, store tweets in alltweets

all_tweets = [h for h in all_tweets if h != "Unknown"]
print(all_tweets[:10])

['This new Marca MP album is the one 🔥', '@ariannaochoa__ accurate 😂', '@De_Majorman @broooookieeeeee @pulte @paige_corley Did you receive anything yet? Pls lmk 🙏🏾', '😂😂😂😂😂😂', 'We’re back tomorrow at Bel Air!! 😆Our first stop of this month. Miss our food come to see us tomorrow at 📍100 Bel A… https://t.co/c7xxAg3HaN', '*my mom', '@Birdyword @codinghorror Agreed. It’s nearly inedible.', '@mamianuh I think just a bakery but they might have other stuff, @tjmotuga1 what’s your fams spot in Riverside?', 'Calling all @SFSU #Students #alumni -We need you to let future #Gators know why they should come to #sfsu-\nEncourag… https://t.co/XC5fbulvnn', '@LA_ALEX4 @Ar_8choa Bro come on rings 🙄😂😂']


In [93]:
# note: clean_text removes emojis

def clean_text(txt):
    txt = "".join(v for v in txt if v not in string.punctuation).lower()
#     txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt 

corpus = [clean_text(x) for x in all_tweets]
corpus[:20]

['this new marca mp album is the one 🔥',
 'ariannaochoa accurate 😂',
 'demajorman broooookieeeeee pulte paigecorley did you receive anything yet pls lmk 🙏🏾',
 '😂😂😂😂😂😂',
 'we’re back tomorrow at bel air 😆our first stop of this month miss our food come to see us tomorrow at 📍100 bel a… httpstcoc7xxag3han',
 'my mom',
 'birdyword codinghorror agreed it’s nearly inedible',
 'mamianuh i think just a bakery but they might have other stuff tjmotuga1 what’s your fams spot in riverside',
 'calling all sfsu students alumni we need you to let future gators know why they should come to sfsu\nencourag… httpstcoxc5fbulvnn',
 'laalex4 ar8choa bro come on rings 🙄😂😂',
 'psychhype thekjohnston httpstco2msdsuwqfw',
 'happy birthday ladies and many more',
 'romankinggg this you 😂',
 'smdailypress last year the selfappointed city council in sm passed the “anti mansion” laws significantly reducin… httpstcoqd4pfinjtk',
 'ivanoozee show me these star projectors i have one too but it’s not fancy fancy',
 '👍👍👍👍

In [94]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[11, 102],
 [11, 102, 3685],
 [11, 102, 3685, 3686],
 [11, 102, 3685, 3686, 941],
 [11, 102, 3685, 3686, 941, 8],
 [11, 102, 3685, 3686, 941, 8, 1],
 [11, 102, 3685, 3686, 941, 8, 1, 43],
 [11, 102, 3685, 3686, 941, 8, 1, 43, 388],
 [3687, 2094],
 [3687, 2094, 86]]

In [95]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [96]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 30, 10)            141770    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 14177)             1431877   
Total params: 1,618,047
Trainable params: 1,618,047
Non-trainable params: 0
_________________________________________________________________


In [126]:
model.fit(predictors, label, epochs=20, verbose=5)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x642598450>

In [142]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title().lower()

In [161]:
generate_text("dont", 8, model, max_sequence_len)

'dont do you httpstcog6iv2wx1f1 anyone for this of sanditonpbs'