In [2]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducability
import tensorflow as tf
# from tensorflow import set_random_seed
from numpy.random import seed

tf.random.set_seed(2)
seed(1)

Using TensorFlow backend.


In [3]:
import pandas as pd
import numpy as np
import string, os 
import json

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

# curr_dir = 'tweets/'
# all_headlines = []
# for filename in os.listdir(curr_dir):
#     if 'Articles' in filename:
#         article_df = pd.read_csv(curr_dir + filename)
#         all_headlines.extend(list(article_df.headline.values))
#         break


all_tweets = []
with open("cities/fremont", "r") as content:
  data = json.load(content)


for tweet in data:
    all_tweets.append(tweet['tweet'][0]['text'])

# TO DO, store tweets in alltweets

all_tweets = [h for h in all_tweets if h != "Unknown"]
print(all_tweets[:10])

['Thank god for giving me the grace &amp; strength to finish all my final essays in 5 days 🙏🏽🙏🏽', '[16:53:30] 184.105.139.82:58639 &gt;&gt; :123 (UDP)', '@iqra_rafeeq اقرا بیبی لگتا ہے تم اپنے پچھلے جنم میں ایک مانا ہوا جاہل مُلا تھیں۔ alcohol پینا حرام ہے۔ اسکو disinf… https://t.co/UVkPr9NczR', '[17:13:36] 184.105.139.121:56259 &gt;&gt; :19 (UDP)', '@PaprikaPink @tweetbrk Me too. *sniff*', 'Your “friends” really aren’t your friends if they don’t tell you that your foundation is too light', 'when a mf twitches in his sleep, it’s the lies tryna comeout - alejandra', '@Glukoska Информацию надо дозировать!', '#COVID @MichelleObama @ObamaFoundation @ObamaMalik https://t.co/iUe77Q2eez', 'you’ve got desires, I know ✨ https://t.co/Ugax9hZPwp']


In [6]:
# note: clean_text removes emojis

def clean_text(txt):
#     txt = "".join(v for v in txt if v not in string.punctuation).lower()
#     txt = txt.encode("utf8").decode("ascii",'ignore')
    return txt 

corpus = [clean_text(x) for x in all_tweets]
corpus[:30]

['Thank god for giving me the grace &amp; strength to finish all my final essays in 5 days 🙏🏽🙏🏽',
 '[16:53:30] 184.105.139.82:58639 &gt;&gt; :123 (UDP)',
 '@iqra_rafeeq اقرا بیبی لگتا ہے تم اپنے پچھلے جنم میں ایک مانا ہوا جاہل مُلا تھیں۔ alcohol پینا حرام ہے۔ اسکو disinf… https://t.co/UVkPr9NczR',
 '[17:13:36] 184.105.139.121:56259 &gt;&gt; :19 (UDP)',
 '@PaprikaPink @tweetbrk Me too. *sniff*',
 'Your “friends” really aren’t your friends if they don’t tell you that your foundation is too light',
 'when a mf twitches in his sleep, it’s the lies tryna comeout - alejandra',
 '@Glukoska Информацию надо дозировать!',
 '#COVID @MichelleObama @ObamaFoundation @ObamaMalik https://t.co/iUe77Q2eez',
 'you’ve got desires, I know ✨ https://t.co/Ugax9hZPwp',
 'I only wear makeup every three weeks so y’all gone have to keep seeing my pretty face in that same fit 😂',
 'We should be in LA rn going up for Rucc Dawg’s birthday 🥺',
 'Cactus https://t.co/PK5BWiGDb5',
 'Ducks https://t.co/2rE5gLk5V1',
 'ch

In [7]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[80, 168],
 [80, 168, 16],
 [80, 168, 16, 237],
 [80, 168, 16, 237, 25],
 [80, 168, 16, 237, 25, 5],
 [80, 168, 16, 237, 25, 5, 1237],
 [80, 168, 16, 237, 25, 5, 1237, 93],
 [80, 168, 16, 237, 25, 5, 1237, 93, 1238],
 [80, 168, 16, 237, 25, 5, 1237, 93, 1238, 6],
 [80, 168, 16, 237, 25, 5, 1237, 93, 1238, 6, 1239]]

In [8]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [38]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 29, 10)            45040     
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_6 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 4504)              454904    
Total params: 544,344
Trainable params: 544,344
Non-trainable params: 0
_________________________________________________________________


In [10]:

model.fit(predictors, label, epochs=50, verbose=5)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.callbacks.History at 0x63541b9d0>

In [11]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title().lower()

In [33]:
generate_text("we", 10, model, max_sequence_len)

'we just realized i thought i don’t god i don’t a'