In [7]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducibility
import tensorflow as tf
# from tensorflow import set_random_seed
from numpy.random import seed

# Seed TensorFlow's global RNG and NumPy's legacy global RNG with fixed values.
tf.random.set_seed(2)
seed(1)

Using TensorFlow backend.


In [12]:
import pandas as pd
import numpy as np
import string, os 
import json

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

# curr_dir = 'tweets/'
# all_headlines = []
# for filename in os.listdir(curr_dir):
#     if 'Articles' in filename:
#         article_df = pd.read_csv(curr_dir + filename)
#         all_headlines.extend(list(article_df.headline.values))
#         break


# Load the tweet records for one city. The encoding is pinned to UTF-8 because
# open() otherwise falls back to the platform's locale encoding, which can fail
# on (or garble) the non-ASCII characters that appear in the tweet text.
with open("cities/sanfrancisco", "r", encoding="utf-8") as content:
    data = json.load(content)

# Take the text of the first entry under each record's 'tweet' key and drop
# entries whose text is exactly "Unknown".
all_tweets = [
    text
    for text in (tweet['tweet'][0]['text'] for tweet in data)
    if text != "Unknown"
]
print(all_tweets[:10])

['‘It’s gonna be May’ zoom party. @allykwilliams @emgeorge5 @jtimberlake https://t.co/0EwZTFBCMr', 'https://t.co/sPktsNCp63', 'TEST_LAT/LON: 56fd5a28-db1d-48a6-bd53-17630cc508ec', '@Comeoutsidefoo_ @OHGEESY @OHGEESY EZ', '@caspervdveen @hdevreij @koryoinleiden @NCNKorea @RepMalinowski @SpeakerPelosi @AnnLinde @ministerBlok Ist… https://t.co/ZA817O4X9Q', '@oliverbruce @lennartnout @RafaelBurde @_chloeswarbrick Cool.\nCan they go tell Georgia now?\nhttps://t.co/ZxlgJ0jd1u', "Just received a care package from my #Metamour who lives 4hrs away. It's home harvested honey, Elderberry Concentra… https://t.co/A3hcK6hNzA", 'I promise, Kirk.', '@sadisticsystems It would be really neat if you were able to signal to the compiler / runtime to switch between cal… https://t.co/rSoMjyFPUC', 'food for thought. https://t.co/CAu0gs6us0']


In [13]:
# note: clean_text only strips ASCII punctuation and lowercases; emojis and other non-ASCII characters are kept

def clean_text(txt):
    """Return ``txt`` lowercased with every ASCII punctuation character removed.

    Only the characters listed in ``string.punctuation`` are stripped; all
    other characters (whitespace, digits, non-ASCII symbols) are kept and
    merely lowercased where that applies.
    """
    punctuation_stripper = str.maketrans("", "", string.punctuation)
    return txt.translate(punctuation_stripper).lower()

# Normalize every tweet with clean_text, then preview the first 20 results.
corpus = list(map(clean_text, all_tweets))
corpus[:20]

['‘it’s gonna be may’ zoom party allykwilliams emgeorge5 jtimberlake httpstco0ewztfbcmr',
 'httpstcospktsncp63',
 'testlatlon 56fd5a28db1d48a6bd5317630cc508ec',
 'comeoutsidefoo ohgeesy ohgeesy ez',
 'caspervdveen hdevreij koryoinleiden ncnkorea repmalinowski speakerpelosi annlinde ministerblok ist… httpstcoza817o4x9q',
 'oliverbruce lennartnout rafaelburde chloeswarbrick cool\ncan they go tell georgia now\nhttpstcozxlgj0jd1u',
 'just received a care package from my metamour who lives 4hrs away its home harvested honey elderberry concentra… httpstcoa3hck6hnza',
 'i promise kirk',
 'sadisticsystems it would be really neat if you were able to signal to the compiler  runtime to switch between cal… httpstcorsomjyfpuc',
 'food for thought httpstcocau0gs6us0',
 'ferveneppon seguro bro te escribía porque eres el único venezolanojaponés que sigo hahaha',
 'one day people will stop liking things just because they’re cute httpstcoxc1scj5a6g',
 'i’m old',
 'the winemaker of the stars opens new es

In [14]:
# Module-level tokenizer shared across cells: get_sequence_of_tokens fits it,
# and generate_text later reads its vocabulary to encode and decode words.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the module-level ``tokenizer`` on ``corpus`` and build prefix sequences.

    Each text is converted to its token ids and expanded into every leading
    prefix of length two or more, so a text of ``k`` tokens contributes
    ``k - 1`` sequences (texts with fewer than two tokens contribute none).

    Returns:
        (input_sequences, total_words): the list of prefix sequences and
        ``len(tokenizer.word_index) + 1``.
    """
    # Fit first so the vocabulary exists before any text is encoded.
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index) + 1

    prefixes = []
    for text in corpus:
        token_ids = tokenizer.texts_to_sequences([text])[0]
        # Prefix end positions 2..len(token_ids) give the growing n-grams.
        prefixes.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))
    return prefixes, vocab_size

# Fit the shared tokenizer on the cleaned corpus and build the n-gram training sequences.
inp_sequences, total_words = get_sequence_of_tokens(corpus)
# Preview the first ten prefix sequences.
inp_sequences[:10]

[[8153, 251],
 [8153, 251, 18],
 [8153, 251, 18, 8154],
 [8153, 251, 18, 8154, 437],
 [8153, 251, 18, 8154, 437, 394],
 [8153, 251, 18, 8154, 437, 394, 8155],
 [8153, 251, 18, 8154, 437, 394, 8155, 4873],
 [8153, 251, 18, 8154, 437, 394, 8155, 4873, 2272],
 [8153, 251, 18, 8154, 437, 394, 8155, 4873, 2272, 8156],
 [36, 8158]]

In [15]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """Pad the sequences to a common length and split them into inputs and labels.

    Args:
        input_sequences: non-empty collection of token-id sequences.
        num_classes: width of the one-hot label vectors. When omitted, the
            module-level ``total_words`` is used, which is what this function
            always did before the parameter existed.

    Returns:
        (predictors, label, max_sequence_len): every padded sequence without
        its last token, the one-hot encoding of that last token, and the
        padded length (the length of the longest input sequence).

    Raises:
        ValueError: if ``input_sequences`` is empty.
    """
    input_sequences = list(input_sequences)
    if not input_sequences:
        raise ValueError("input_sequences must contain at least one sequence")
    if num_classes is None:
        # Backward-compatible fallback to the notebook-level vocabulary size.
        num_classes = total_words

    max_sequence_len = max(len(x) for x in input_sequences)
    # 'pre' padding pads on the left, so the final column always holds each
    # sequence's last real token (the prediction target).
    padded = np.asarray(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    predictors, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

# Turn the n-gram sequences into model inputs (all but the last token) and one-hot labels (the last token).
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [16]:
def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile the next-word prediction network.

    Architecture: Embedding -> LSTM -> Dropout -> softmax Dense over the vocabulary.

    Args:
        max_sequence_len: length of the padded training sequences; the model
            input is one token shorter (``max_sequence_len - 1``).
        total_words: vocabulary size, used both as the embedding input
            dimension and as the number of output classes.
        embedding_dim: size of each embedding vector (default 10).
        lstm_units: number of units in the LSTM layer (default 100).
        dropout_rate: dropout fraction applied after the LSTM (default 0.1).

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer).
    """
    input_len = max_sequence_len - 1
    model = Sequential()

    # Input embedding layer
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))

    # Hidden layer: a single LSTM followed by dropout
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))

    # Output layer: one probability per vocabulary entry
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Build the network for the padded sequence length and vocabulary size computed earlier, then print its layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 29, 10)            284620    
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 28462)             2874662   
Total params: 3,203,682
Trainable params: 3,203,682
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train on the padded predictors and one-hot labels for 20 epochs.
# NOTE(review): verbose=5 is outside the 0/1/2 values Keras documents for fit() — confirm the intended logging level.
model.fit(predictors, label, epochs=20, verbose=5)

Epoch 1/20


In [2]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` with ``next_words`` greedily predicted words.

    On each step the current text is encoded with the module-level
    ``tokenizer``, left-padded to ``max_sequence_len - 1`` tokens, and the
    highest-probability class from ``model`` is appended as the next word.

    Args:
        seed_text: starting text.
        next_words: number of words to append; zero or less appends nothing.
        model: trained model exposing ``predict``.
        max_sequence_len: padded sequence length used during training.

    Returns:
        The extended text, lowercased.
    """
    if next_words <= 0:
        # Nothing to generate, so skip building the lookup table.
        return seed_text.lower()

    # Invert the vocabulary once instead of scanning word_index on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        # predict_classes() is not available in newer Keras/TensorFlow releases;
        # the argmax over the predicted distribution yields the same class id.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # Ids with no vocabulary entry contribute an empty word, as before.
        output_word = index_to_word.get(predicted, "")
        seed_text += " " + output_word
    # The former .title() call was immediately undone by .lower(), so only
    # the lowercasing is kept.
    return seed_text.lower()

In [3]:
# Needs `model`, `tokenizer` and `max_sequence_len` from the earlier cells; the NameError recorded
# below shows this cell was run in a kernel where `model` had not been created.
generate_text("dont", 8, model, max_sequence_len)

NameError: name 'model' is not defined