In [7]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducibility
import tensorflow as tf
# from tensorflow import set_random_seed
from numpy.random import seed

# Seed TensorFlow's global RNG and, via numpy.random.seed, NumPy's global RNG.
# Note that the two libraries are given different seed values (2 and 1).
tf.random.set_seed(2)
seed(1)

Using TensorFlow backend.


In [4]:
import pandas as pd
import numpy as np
import string, os 
import json

import warnings
# Silence every warning category for the rest of the kernel session.
warnings.filterwarnings("ignore")
# NOTE(review): redundant after the blanket filter above, which already
# covers FutureWarning; kept as-is.
warnings.simplefilter(action='ignore', category=FutureWarning)

# curr_dir = 'tweets/'
# all_headlines = []
# for filename in os.listdir(curr_dir):
#     if 'Articles' in filename:
#         article_df = pd.read_csv(curr_dir + filename)
#         all_headlines.extend(list(article_df.headline.values))
#         break


# Load the Los Angeles tweet dump and collect the text of every record.
# The encoding is pinned to UTF-8 so emoji and curly quotes decode the same
# way on every platform (open() otherwise falls back to the locale default).
with open("cities/losangeles", "r", encoding="utf-8") as content:
    data = json.load(content)

# Each record exposes its text at record['tweet'][0]['text']; entries whose
# text equals "Unknown" are dropped.
all_tweets = [
    text
    for text in (tweet['tweet'][0]['text'] for tweet in data)
    if text != "Unknown"
]
print(all_tweets[:10])

['@cliterella My gay ass repositioning these lil shits under the soda machine for an hour every time the shipment comes in 😭😭😭😭', 'Got a new business partner today!  She’s none other than my client whom I helped rollover a year ago.   She’s glad… https://t.co/ZQgBSFMZlt', '@OSindemark @stringcheesey I would literally dedicate a cult to you', '😂', 'my great grandfather, so I can celebrate our birthdays together lmao', 'Free!! @MeltingPoems ~ May 1-3 https://t.co/K19ZA232ZT ~~ #amazonbestseller #meltingpoems #poems #poetry #firstbook… https://t.co/5bSQs2sX0W', 'Oh,,,,,, ANIMAL crossing,,,,, https://t.co/JWMFg6wLHp', 'If you don’t fuck with Chicano Batman n their new album don’t talk to me', '@Glock__Lesnar He really is the greatest to ever do it', '@100Glowz @mollyneiswende1 @Michaelmart11 @disneyplus Same']


In [5]:
# note: clean_text strips ASCII punctuation and lower-cases the text; emojis
# and other non-ASCII characters are kept (the ASCII-only filter is disabled).

def clean_text(txt):
    """Return ``txt`` lower-cased with every ``string.punctuation`` character removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is stripped;
    any other character (emoji, curly quotes, ellipsis, ...) passes through.
    """
    ascii_punct = set(string.punctuation)
    kept_chars = [ch for ch in txt if ch not in ascii_punct]
    # Disabled ASCII-only filter (it would also drop emojis):
    #     txt = txt.encode("utf8").decode("ascii",'ignore')
    return "".join(kept_chars).lower()

# Normalise every tweet with clean_text; the trailing expression previews
# the first 20 cleaned entries as the cell output.
corpus = list(map(clean_text, all_tweets))
corpus[:20]

['cliterella my gay ass repositioning these lil shits under the soda machine for an hour every time the shipment comes in 😭😭😭😭',
 'got a new business partner today  she’s none other than my client whom i helped rollover a year ago   she’s glad… httpstcozqgbsfmzlt',
 'osindemark stringcheesey i would literally dedicate a cult to you',
 '😂',
 'my great grandfather so i can celebrate our birthdays together lmao',
 'free meltingpoems  may 13 httpstcok19za232zt  amazonbestseller meltingpoems poems poetry firstbook… httpstco5bsqs2sx0w',
 'oh animal crossing httpstcojwmfg6wlhp',
 'if you don’t fuck with chicano batman n their new album don’t talk to me',
 'glocklesnar he really is the greatest to ever do it',
 '100glowz mollyneiswende1 michaelmart11 disneyplus same',
 'i want me something dark skin fr',
 'meandyouandeveryoneweknow and eternalsunshineofthespotlessmind made a pretty heady doublefeature for me tonight… httpstcoe1zsepasan',
 'so deeply honored to be uplifted and seen for the deep

In [8]:
# Notebook-wide tokenizer: fitted inside get_sequence_of_tokens and read
# again by generate_text, so this cell must run before either of them.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the shared ``tokenizer`` on ``corpus`` and expand it into n-gram prefixes.

    Every text is converted to its token ids, and each prefix of length
    2..len(ids) becomes one training sequence (texts with fewer than two
    tokens contribute nothing).

    Returns:
        (input_sequences, total_words) where ``total_words`` is
        ``len(tokenizer.word_index) + 1``.
    """
    # Side effect: updates the module-level tokenizer's vocabulary.
    tokenizer.fit_on_texts(corpus)
    # +1 presumably accounts for index 0, which word_index never assigns.
    vocab_size = len(tokenizer.word_index) + 1

    prefixes = []
    for text in corpus:
        encoded = tokenizer.texts_to_sequences([text])[0]
        prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return prefixes, vocab_size

# Fit the tokenizer on the cleaned corpus and build the n-gram training
# sequences; total_words is reused below for the label width and model size.
inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[23700, 7],
 [23700, 7, 1007],
 [23700, 7, 1007, 142],
 [23700, 7, 1007, 142, 23701],
 [23700, 7, 1007, 142, 23701, 91],
 [23700, 7, 1007, 142, 23701, 91, 414],
 [23700, 7, 1007, 142, 23701, 91, 414, 2424],
 [23700, 7, 1007, 142, 23701, 91, 414, 2424, 721],
 [23700, 7, 1007, 142, 23701, 91, 414, 2424, 721, 2],
 [23700, 7, 1007, 142, 23701, 91, 414, 2424, 721, 2, 5039]]

In [9]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """Left-pad the n-gram sequences and split them into inputs and one-hot labels.

    Args:
        input_sequences: non-empty list of token-id lists.
        num_classes: width of the one-hot label vectors. When omitted, the
            notebook-level ``total_words`` is used (the original behaviour),
            so existing single-argument calls keep working.

    Returns:
        (predictors, label, max_sequence_len): ``predictors`` is every padded
        sequence without its last token, ``label`` is the one-hot encoding of
        that last token, and ``max_sequence_len`` is the longest input length.

    Raises:
        ValueError: if ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences is empty; nothing to pad")
    if num_classes is None:
        # Fall back to the global set by the tokenization cell.
        num_classes = total_words

    max_sequence_len = max(len(seq) for seq in input_sequences)
    padded = np.asarray(
        pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    )

    # The last column is the word to predict; everything before it is input.
    predictors, label = padded[:, :-1], padded[:, -1]
    # NOTE(review): this materialises a dense (n_sequences x num_classes)
    # matrix, which can be very large for a big vocabulary. Integer labels
    # with a sparse categorical loss would avoid it, but that also requires
    # changing the loss in create_model.
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

# Pad the n-gram sequences and split them into model inputs (predictors) and
# one-hot targets (label); max_sequence_len is reused by create_model and
# generate_text.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [10]:
def create_model(max_sequence_len, total_words):
    """Build and compile the Embedding -> LSTM -> Dropout -> softmax network.

    Args:
        max_sequence_len: length of the padded sequences, label token included.
        total_words: vocabulary size; used for both the embedding input
            dimension and the width of the softmax output.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy, Adam).
    """
    # The network sees each padded sequence minus its final (label) token.
    input_len = max_sequence_len - 1

    model = Sequential()
    stack = (
        Embedding(total_words, 10, input_length=input_len),  # 10-dim word vectors
        LSTM(100),                                           # single hidden layer
        Dropout(0.1),
        Dense(total_words, activation='softmax'),            # one score per word
    )
    for layer in stack:
        model.add(layer)

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Build the network for the padded sequence length and vocabulary size
# computed in the cells above, then print its layer/parameter table.
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 32, 10)            895470    
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 89547)             9044247   
Total params: 9,984,117
Trainable params: 9,984,117
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the language model. verbose=2 prints one summary line (including the
# loss) per epoch; the previous verbose=5 is outside Keras' documented 0/1/2
# values and the recorded run showed only the "Epoch 1/20" header.
# The returned History object is kept so the loss curve can be inspected later.
history = model.fit(predictors, label, epochs=20, verbose=2)

Epoch 1/20


In [2]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` with ``next_words`` greedily predicted words.

    On each step the current text is tokenized with the shared ``tokenizer``,
    left-padded to ``max_sequence_len - 1`` ids, and the highest-scoring class
    from ``model`` is appended as the next word.

    Args:
        seed_text: starting text.
        next_words: number of words to append (0 returns the seed unchanged
            apart from the final case normalisation).
        model: trained model whose ``predict`` returns one score per word id.
        max_sequence_len: padded sequence length used during training.

    Returns:
        The seed plus the generated words, lower-cased.
    """
    # Reverse vocabulary built once, instead of scanning word_index linearly
    # for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # Sequential.predict_classes() is gone from current Keras releases;
        # argmax over predict() is its equivalent for a softmax output and
        # yields a plain int rather than a one-element array.
        scores = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])

        # Ids with no vocabulary entry (e.g. 0) add an empty word, as before.
        seed_text += " " + index_to_word.get(predicted, "")
    # NOTE(review): .title().lower() is effectively .lower(); kept verbatim so
    # the output is unchanged for unusual Unicode casing.
    return seed_text.title().lower()

In [3]:
# Needs `model` and `max_sequence_len` from the cells above in the current
# kernel; the NameError recorded below came from running this cell before
# the model cell had been executed.
generate_text("dont", 8, model, max_sequence_len)

NameError: name 'model' is not defined