In [1]:
from keras.callbacks import EarlyStopping
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import model_from_json
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import keras.utils as ku 
import numpy as np
from numpy.random import seed
import os
import pandas as pd
import string
import tensorflow as tf
import warnings

ModuleNotFoundError: No module named 'keras'

In [None]:
# Seed the NumPy and TensorFlow global random number generators.
seed(1)                # numpy.random.seed
tf.random.set_seed(2)  # TensorFlow global seed

# Silence all warnings; a dedicated FutureWarning filter is installed as well.
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import json

In [None]:
# Load the thread data from the JSON file.
with open('combined_2000.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten: every thread has a 'comments' list, and each entry keeps its text
# under the 'comment' key.
comms = [comment['comment'] for thread in data for comment in thread['comments']]

In [None]:
from cleaner_funcs import clean_list

In [None]:
# Pass the collected comment strings through clean_list, imported from the
# project-local cleaner_funcs module (its cleaning rules are not visible in
# this notebook — presumably it returns a list of cleaned strings; confirm).
corpus = clean_list(comms)

In [None]:
# Shared tokenizer: fitted by get_sequence_of_tokens and reused by generate_text.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the module-level tokenizer on ``corpus`` and build n-gram prefixes.

    Each line is encoded to token ids, and every prefix of length 2 up to the
    full line length is emitted as one sequence (lines with fewer than two
    tokens contribute nothing).

    Args:
        corpus: iterable of text lines.

    Returns:
        A tuple ``(input_sequences, total_words)`` where ``input_sequences``
        is the list of prefix sequences and ``total_words`` is
        ``len(tokenizer.word_index) + 1``.

    Side effects:
        Calls ``fit_on_texts`` on the module-level ``tokenizer``.
    """
    tokenizer.fit_on_texts(corpus)
    # One more than the number of known words (presumably because Keras
    # reserves index 0 for padding — confirm against the Tokenizer docs).
    vocab_size = len(tokenizer.word_index) + 1

    prefixes = []
    for text in corpus:
        encoded = tokenizer.texts_to_sequences([text])[0]
        # Prefixes ending after the 2nd, 3rd, ..., last token.
        prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return prefixes, vocab_size

# Build the n-gram prefix sequences and the vocabulary size from the cleaned
# corpus, then preview the first ten sequences as the cell output.
inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

In [None]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """Pad the sequences to equal length and split them into inputs and labels.

    Every sequence is left-padded ('pre') to the length of the longest one.
    The last token of each padded row becomes the label (one-hot encoded);
    the remaining tokens are the predictors.

    Args:
        input_sequences: non-empty list of token-id sequences.
        num_classes: width of the one-hot labels. When omitted, falls back to
            the notebook-level ``total_words`` so existing single-argument
            calls keep working.

    Returns:
        ``(predictors, label, max_sequence_len)``.

    Raises:
        ValueError: if ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences is empty; nothing to pad")
    if num_classes is None:
        # Backward-compatible fallback to the global computed alongside the
        # sequences by get_sequence_of_tokens.
        num_classes = total_words

    max_sequence_len = max(len(seq) for seq in input_sequences)
    # pad_sequences already returns a NumPy array, so no extra np.array copy.
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    predictors, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

# Pad the prefix sequences and split each into predictors (all but the last
# token) and a one-hot label (the last token); also keep the padded length.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [None]:
from tensorflow.keras.layers import SimpleRNN

In [None]:
def create_model(max_sequence_len, total_words,
                 embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile the Embedding -> LSTM -> Dropout -> softmax model.

    Args:
        max_sequence_len: padded sequence length including the label token;
            the model input length is ``max_sequence_len - 1``.
        total_words: vocabulary size, used for both the embedding input
            dimension and the width of the softmax output layer.
        embedding_dim: size of each embedding vector (default 10).
        lstm_units: number of units in the LSTM layer (default 100).
        dropout_rate: dropout rate applied after the LSTM (default 0.1).

    Returns:
        A ``Sequential`` model compiled with categorical cross-entropy loss
        and the Adam optimizer.
    """
    # The last token of every padded sequence is the label, not an input.
    input_len = max_sequence_len - 1
    model = Sequential()

    # Input embedding layer
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))

    # Hidden layer: LSTM followed by dropout
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))

    # Output layer: one probability per vocabulary entry
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Instantiate the (untrained) model for the computed sequence length and
# vocabulary size, and print its layer summary.
model = create_model(max_sequence_len, total_words)
model.summary()

In [None]:
from datetime import datetime

In [None]:
# Timing wrapper around training. NOTE: the fit call below is commented out,
# so nothing is trained here and the printed duration only measures the gap
# between the two datetime.now() calls. A later cell loads a saved model from
# model.json / model.h5 instead.
start = datetime.now()
#model.fit(predictors, label, epochs=5, verbose=5, use_multiprocessing=True)
print(datetime.now() - start)

In [None]:
# Read the saved architecture description; the context manager closes the
# file even if reading raises.
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by ``next_words`` words predicted by ``model``.

    On every step the current text is tokenized with the module-level
    ``tokenizer``, left-padded to ``max_sequence_len - 1``, and fed to the
    model; the highest-probability class is mapped back to a word and
    appended to the text.

    Args:
        seed_text: starting text.
        next_words: number of words to append.
        model: model exposing ``predict`` that returns per-class scores.
        max_sequence_len: padded sequence length used when the training data
            was built (including the label position).

    Returns:
        The extended text, title-cased via ``str.title()``.
    """
    # Reverse lookup built once instead of scanning word_index per word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # Sequential.predict_classes is gone from recent TensorFlow/Keras
        # releases; arg-max over the predicted scores is the equivalent.
        scores = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])

        # An index with no matching word contributes an empty string, so only
        # a space is appended in that case.
        seed_text += " " + index_to_word.get(predicted, "")
    return seed_text.title()

In [None]:
# Generate ten additional words for each sample prompt with the loaded model.
for prompt in ("Chance me for Penn!", "Is my SAT score high enough?", "Lorem Ipsum"):
    print(generate_text(prompt, 10, loaded_model, max_sequence_len))